import matplotlib.pyplot as plt

# Revenue of the lemonade stand for each of its first 12 months.
months = list(range(1, 13))
revenue = [52, 74, 79, 95, 115, 110, 129, 126, 147, 146, 156, 184]

# Plot the raw observations as individual markers.
plt.plot(months, revenue, "o")
plt.xlabel("months")
plt.ylabel("revenue")
plt.title("Sandra's Lemonade")
plt.show()


# Hand-picked slope and intercept of a candidate line y = m*x + b.
m, b = 10, 53

# Value of the candidate line at every month.
y = [m * month + b for month in months]

# Draw the candidate line over the observed revenue points.
plt.plot(months, revenue, "o")
plt.plot(months, y)
plt.show()


# The three given points.
x = [1, 2, 3]
y = [5, 1, 3]

# Candidate line 1: y = x
m1, b1 = 1, 0

# Candidate line 2: y = 0.5x + 1
m2, b2 = 0.5, 1

# Predictions of each candidate line at the given x values.
y_pred_1 = [m1 * point + b1 for point in x]
y_pred_2 = [m2 * point + b2 for point in x]

N = len(x)

# Mean squared error of each line, accumulated one point at a time.
total_loss1 = 0
total_loss2 = 0
for actual, pred_1, pred_2 in zip(y, y_pred_1, y_pred_2):
  total_loss1 += (actual - pred_1)**2/N
  total_loss2 += (actual - pred_2)**2/N

print("y = x loss", total_loss1)
print("y = 0.5x + 1 loss", total_loss2)

y = x loss 5.666666666666666
y = 0.5x + 1 loss 4.499999999999999


# Gradient of the mean squared error with respect to the intercept b (used by gradient descent).
def get_gradient_at_b(x, y, b, m):
  """Return the derivative of the mean squared error w.r.t. the intercept.

  For the line ``y = m*x + b`` this evaluates
  ``-(2/N) * sum(y_i - (m*x_i + b))`` with ``N = len(x)``.

  x, y: observed inputs and targets, indexed in parallel
  b: current intercept
  m: current slope
  """
  n_points = len(x)
  residual_sum = 0
  for idx, x_val in enumerate(x):
    # Signed error between the observation and the line's prediction.
    residual_sum += y[idx] - ((m * x_val) + b)
  return -(2/n_points) * residual_sum


# Gradient of the mean squared error with respect to the slope m (used by gradient descent).
def get_gradient_at_m(x, y, b, m):
  """Return the derivative of the mean squared error w.r.t. the slope.

  For the line ``y = m*x + b`` this evaluates
  ``-(2/N) * sum(x_i * (y_i - (m*x_i + b)))`` with ``N = len(x)``.

  x, y: observed inputs and targets, indexed in parallel
  b: current intercept
  m: current slope
  """
  n_points = len(x)
  weighted_residual_sum = 0
  for idx, x_val in enumerate(x):
    # Each residual is weighted by its own input value.
    weighted_residual_sum += (x_val) * (y[idx] - ((m * x_val) + b))
  return -(2/n_points) * weighted_residual_sum


#step_gradient 함수
def step_gradient(b_current, m_current, x, y, learning_rate):
  """Take one gradient-descent step for the line ``y = m*x + b``.

  Both gradients are evaluated at the current (b, m) before either parameter
  is changed, then each parameter moves against its gradient, scaled by
  ``learning_rate``.

  Returns the updated parameters as the list ``[b, m]``.
  """
  gradient_b = get_gradient_at_b(x, y, b_current, m_current)
  gradient_m = get_gradient_at_m(x, y, b_current, m_current)

  updated_b = b_current - (learning_rate * gradient_b)
  updated_m = m_current - (learning_rate * gradient_m)
  return [updated_b, updated_m]


months = list(range(1, 13))
revenue = [52, 74, 79, 95, 115, 110, 129, 126, 147, 146, 156, 184]

learning_rate = 0.01

# Start from b = 0, m = 0 and take a single gradient-descent step.
b, m = step_gradient(0, 0, months, revenue, learning_rate)
print("b : ", b)
print("m : ", m)

b :  2.355
m :  17.78333333333333


def gradient_descent(x, y, learning_rate, num_iter):
  """Run ``num_iter`` gradient-descent steps starting from b = 0, m = 0.

  x, y: observed inputs and targets
  learning_rate: step size passed to step_gradient on every iteration
  num_iter: number of update steps to perform

  Returns the final parameters as the list ``[b, m]``.
  """
  intercept, slope = 0., 0
  for _ in range(num_iter):
    intercept, slope = step_gradient(intercept, slope, x, y, learning_rate)
  return [intercept, slope]


[optimal_b, optimal_m] = gradient_descent(months, revenue, 0.01, 1000)
print(optimal_b, optimal_m)

49.60215351339813 10.463427732364998


# Fitted line evaluated at every month.
y = [optimal_m * month + optimal_b for month in months]

# Observed revenue (markers) with the fitted line drawn over it.
plt.plot(months, revenue, "o")
plt.plot(months, y)
plt.show()


import matplotlib.pyplot as plt
import numpy as np

# Temperatures 60, 62, ..., 98 arranged as a single-feature column.
temperature = np.arange(60, 100, 2).reshape(-1, 1)
sales = [65, 58, 46, 45, 44, 42, 40, 40, 36, 38, 38, 28, 30, 22, 27, 25, 25, 20, 15, 5]

plt.plot(temperature, sales, 'o')

[<matplotlib.lines.Line2D at 0x7f12283b24d0>]


from sklearn.linear_model import LinearRegression


lr = LinearRegression()

lr.fit(temperature, sales)
sales_predict = lr.predict(temperature)

plt.plot(temperature,  sales, "o")
plt.plot(temperature,  sales_predict,)
plt.show()


lr.coef_

array([-1.15225564])


lr.intercept_

125.47819548872182


#scikit-learn에서 제공하는 score함수 사용하여 r-square값 구해보기
print("R-squared:")
print(lr.score(temperature, sales))

# R-squared is the proportion of the variance in sales that the model explains from temperature (a proportion, not a probability).

R-squared:
0.9114088011031334


import numpy as np
import matplotlib.pyplot as plt

passed_exam = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1])
passed_exam = passed_exam.reshape(-1, 1) 
hours_studied = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
hours_studied = hours_studied.reshape(-1, 1) 

#시험에 패스/페일 vs 공부한 시간에 대한 산점도 그려보기
plt.scatter(hours_studied,  passed_exam)
plt.show()


model = LinearRegression()
model.fit(hours_studied, passed_exam)

LinearRegression()


import numpy as np

sample_x = np.linspace(0, 20,100).reshape(-1,1)
probability = model.predict(sample_x).ravel()


plt.plot(hours_studied,  passed_exam, "o")
plt.plot(sample_x, probability)
plt.show()


calculated_coefficients = 0.03
intercept = -0.3


# log_odds 함수를 정의해봅니다.
def log_odds(features, coefficient, intercept):
  """Return the linear score ``features . coefficient + intercept``.

  features: array-like of input values
  coefficient: scalar or array combined with ``features`` via ``np.dot``
  intercept: value added to every score
  """
  weighted_features = np.dot(features, coefficient)
  return weighted_features + intercept


# hours_studied 데이터에 대해서 log-odds를 계산해봅니다.
calculated_log_odds = log_odds(hours_studied, calculated_coefficients, intercept)
calculated_log_odds

array([[-0.3 ],
       [-0.27],
       [-0.24],
       [-0.21],
       [-0.18],
       [-0.15],
       [-0.12],
       [-0.09],
       [-0.06],
       [-0.03],
       [ 0.  ],
       [ 0.03],
       [ 0.06],
       [ 0.09],
       [ 0.12],
       [ 0.15],
       [ 0.18],
       [ 0.21],
       [ 0.24],
       [ 0.27]])


# sigmoid 함수 정의하기
def sigmoid(z):
  """Map z to the open interval (0, 1) with the logistic function 1 / (1 + e^-z).

  z may be a scalar or a NumPy array; arrays are transformed element-wise.
  """
  denominator = 1 + np.exp(-z)
  return 1 / denominator


# 확률 계산해보기
probabilities = sigmoid(calculated_log_odds)
probabilities

array([[0.42555748],
       [0.4329071 ],
       [0.44028635],
       [0.44769209],
       [0.45512111],
       [0.46257015],
       [0.47003595],
       [0.47751518],
       [0.4850045 ],
       [0.49250056],
       [0.5       ],
       [0.50749944],
       [0.5149955 ],
       [0.52248482],
       [0.52996405],
       [0.53742985],
       [0.54487889],
       [0.55230791],
       [0.55971365],
       [0.5670929 ]])


# log_loss 함수 구현해보기
def log_loss(probabilities, actual_class):
  """Return the average binary cross-entropy of the predictions.

  probabilities: predicted probability of class 1 for each sample
  actual_class: NumPy array of true 0/1 labels; its first dimension is used
    as the sample count
  """
  sample_scale = -(1 / actual_class.shape[0])
  # Log-likelihood of each sample under its predicted probability.
  log_likelihood = (actual_class * np.log(probabilities)
                    + (1 - actual_class) * np.log(1 - probabilities))
  return np.sum(sample_scale * log_likelihood)


log_loss(probabilities, passed_exam)

0.6279073897953891


# predict_class함수 구현하기
def predict_class(features, coefficients, intercept, threshold):
  """Classify each sample by thresholding its predicted probability.

  The probability is ``sigmoid(log_odds(features, coefficients, intercept))``.
  A sample is labelled 1 when its probability is at least ``threshold``,
  otherwise 0.

  Returns a list with one single-element list (``[1]`` or ``[0]``) per row of
  the predicted probabilities.
  """
  predicted_probability = sigmoid(log_odds(features, coefficients, intercept))
  return [[1] if prob >= threshold else [0] for prob in predicted_probability]


# threshold=0.5로 최종 예측 해보기
predict_class(hours_studied, calculated_coefficients, intercept, 0.5)

[[0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1]]


#threshold=0.55로 최종 예측 해보기
predict_class(hours_studied, calculated_coefficients, intercept, 0.55)

[[0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [1],
 [1],
 [1]]


from sklearn.linear_model import LogisticRegression

# scikit-learn classifiers expect a 1-D label vector of shape (n_samples,).
# passed_exam was reshaped into a column vector earlier, so flatten it here to
# avoid the DataConversionWarning raised when a column-vector y is passed.
model = LogisticRegression()
model.fit(hours_studied, passed_exam.ravel())

/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py:993: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

LogisticRegression()


probability = model.predict_proba(sample_x)[:, 1]

plt.plot(hours_studied, passed_exam, 'o')
plt.plot(sample_x, probability)
plt.xlabel('hours studied')

plt.show()


# NOTE(review): the cell that defined predicted_class is not present in this
# file. The 100-element 0/1 array shown as output matches hard predictions on
# the 100 points of sample_x, so it is reconstructed that way here — confirm
# against the original notebook.
predicted_class = model.predict(sample_x)
predicted_class

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Split the data into features X and label y.
# NOTE(review): `df` is not defined anywhere in the visible source, and the
# same split is performed on `data` further down, after it has been loaded and
# encoded. This cell is presumably a leftover — confirm before running.
y = df[['Survived']].to_numpy().ravel()
x = df.drop(columns=['Survived'])


import pandas as pd

def onehot(data, feature):
  """Replace one column of a DataFrame with its one-hot encoding.

  data: pandas DataFrame
  feature: string, name of the column in ``data`` to encode

  Returns a new DataFrame holding the original columns followed by the
  indicator columns produced by ``pd.get_dummies`` (prefixed with
  ``feature``), with the original ``feature`` column removed.
  """
  indicator_columns = pd.get_dummies(data[feature], prefix=feature)
  widened = pd.concat([data, indicator_columns], axis=1)
  return widened.drop(columns=[feature])

data_url = "https://raw.githubusercontent.com/inikoreaackr/ml_datasets/main/titanic.csv"


data = pd.read_csv(data_url)


data.columns

data = data [['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]


data = data.dropna()


from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data.Sex = le.fit_transform(data.Sex)


data.head()


onehot?


data = onehot(data, 'Embarked')


y = data[['Survived']].to_numpy().ravel()
x = data.drop(columns=['Survived'])


from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the reported test accuracy, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.3, random_state=42)


# max_iter is raised to 1000 iterations for the solver.
model_titanic = LogisticRegression(max_iter=1000)
model_titanic.fit(x_train, y_train)

LogisticRegression(max_iter=1000)


print(model_titanic.score(x_test,y_test))

0.7990654205607477


coef = model_titanic.coef_
print(coef, "\n")

# Pair each feature column with its learned coefficient from the first row of coef.
for column_name, column_coef in zip(x.columns, coef[0]):
  print(column_name, ":", column_coef)

[[-1.13588865e+00 -2.58221815e+00 -4.56647796e-02 -3.35399948e-01
  -4.22408897e-02  2.20349750e-03  4.47231111e-01 -5.75288314e-01
   1.18618995e-01]] 

Pclass : -1.1358886533804295
Sex : -2.5822181505918125
Age : -0.045664779628737426
SibSp : -0.33539994797221
Parch : -0.04224088972835762
Fare : 0.0022034975000360204
Embarked_C : 0.4472311112889102
Embarked_Q : -0.5752883138433935
Embarked_S : 0.11861899501285339


# coefficient gradient : SUM(x(a(i) - y))/m
def get_coef_gradient(x, y, y_predict):
  """Return the log-loss gradient with respect to the coefficient.

  Evaluates ``(1/len(x)) * x.T . (y_predict - y)`` and unpacks the result as
  a 1x1 matrix, so the inputs must be shaped such that this product holds
  exactly one value (e.g. single-feature column arrays).
  """
  prediction_error = y_predict - y
  scaled_product = (1/len(x)) * np.dot(x.T, prediction_error)
  # Unpack the single row, then its single entry; any other shape raises.
  (only_row,) = scaled_product
  (grad,) = only_row
  return grad


get_coef_gradient(hours_studied, passed_exam, probabilities)

-1.5871075590969737


# intercept gradient : SUM(a(i) - y)/m
def get_intercept_gradient(y, y_predict):
  """Return the log-loss gradient with respect to the intercept.

  This is the sum of the prediction errors ``y_predict - y`` scaled by
  ``1/len(y)``.
  """
  total_error = np.sum(y_predict - y)
  return (1/len(y)) * total_error


get_intercept_gradient(passed_exam, probabilities)

0.046277874159417066


def log_loss_step_gradient(weight_current, intercept_current, x, y, y_predicted, learning_rate):
  """Take one gradient-descent step on the log-loss.

  The gradients are computed from the supplied predictions ``y_predicted``,
  and each parameter moves against its gradient, scaled by ``learning_rate``.

  Returns the updated parameters as the list ``[weight, intercept]``.
  """
  weight_gradient = get_coef_gradient(x, y, y_predicted)
  intercept_gradient = get_intercept_gradient(y, y_predicted)

  return [
      weight_current - (learning_rate * weight_gradient),
      intercept_current - (learning_rate * intercept_gradient),
  ]


def log_loss_gradient_descent(x, y, learning_rate, num_iter):
  """Fit a weight and intercept by gradient descent on the log-loss.

  x: input features, passed to log_odds and the gradient helpers
  y: true 0/1 labels
  learning_rate: step size used for every update
  num_iter: number of update steps to perform

  Returns ``([weight, intercept], trace)``. ``trace`` holds the log-loss of
  the starting parameters followed by the log-loss after each update, so it
  has ``num_iter + 1`` entries.
  """
  opt_weight = 0
  opt_intercept = 0
  # Predict with the local starting parameters, not the module-level
  # `weight`/`intercept`, so the result does not depend on notebook state.
  y_predict = sigmoid(log_odds(x, opt_weight, opt_intercept))
  trace = [log_loss(y_predict, y)]
  for _ in range(num_iter):
    # The step uses the predictions made with the current parameters.
    opt_weight, opt_intercept = log_loss_step_gradient(
        opt_weight, opt_intercept, x, y, y_predict, learning_rate)
    y_predict = sigmoid(log_odds(x, opt_weight, opt_intercept))
    trace.append(log_loss(y_predict, y))
  return [opt_weight, opt_intercept], trace


[weight, intercept], trace = log_loss_gradient_descent(hours_studied, passed_exam, 0.05, 1000)

print("weight : ", weight, "\n")
print("intercept : ", intercept, "\n")

[[0.42555748]
 [0.51673783]
 [0.60681714]
 [0.69017318]
 [0.76276619]
 [0.82271945]
 [0.87010168]
 [0.90626282]
 [0.93313104]
 [0.95269988]
 [0.96674609]
 [0.97672306]
 [0.98375699]
 [0.98868999]
 [0.99213681]
 [0.99453899]
 [0.99621011]
 [0.99737121]
 [0.99817723]
 [0.99873643]]
weight :  0.36710895563627843 

intercept :  -3.6293894250817496


def norm_x(x):
  """Center x by subtracting its mean; x must provide a ``.mean()`` method."""
  center = x.mean()
  return x - center

y_prediction = sigmoid(log_odds(hours_studied, weight, intercept))

plt.plot(hours_studied, passed_exam, "o")
plt.plot(hours_studied, y_prediction)

[<matplotlib.lines.Line2D at 0x7f122808e390>]


# model의 log_loss cost변화
plt.title('Log-loss over iterations')
plt.plot(trace)
plt.xlabel('iteration')
plt.ylabel('Log-loss')
plt.show()

	Survived	Pclass	Sex	Age	SibSp	Fare	Embarked
0	0	3	1	22.0	1	7.2500	S
1	1	1	0	38.0	1	71.2833	C
2	1	3	0	26.0	0	7.9250	S
3	1	1	0	35.0	1	53.1000	S
4	0	3	1	35.0	0	8.0500	S

활동 1: Simple Linear Regression¶

[Example: Sandra’s lemonade stand’s revenue over its first 12 months of being open]¶

[Points and Lines]¶

[Practice 1]¶

Loss¶

[예시]¶

[ Practice 2 ]¶

Gradient Descent for Intercept¶

Gradient Descent for Slope¶

Weight Update¶

Example: Sandra’s lemonade stand’s revenue over its first 12 months of being open¶

Scikit-Learn 라이브러리 사용¶

계수(Coefficients)¶

모델 평가¶

활동 2: Logistic Regression¶

[Linear Regression Approach]¶

[ Log-Odds ]¶

[ Sigmoid Function ]¶

[ Log-Loss ]¶

[ Classification Thresholding ]¶

[ Scikit-Learn ]¶

과제¶

과제 2¶

TAEWON KIM

Recent posts

Comments

활동 1: Simple Linear Regression¶

[Example: Sandra’s lemonade stand’s revenue over its first 12 months of being open]¶

[Points and Lines]¶

[Practice 1]¶

Loss¶

[예시]¶

[ Practice 2 ]¶

Gradient Descent for Intercept¶

Gradient Descent for Slope¶

Weight Update¶

Example: Sandra’s lemonade stand’s revenue over its first 12 months of being open¶

Scikit-Learn 라이브러리 사용¶

계수(Coefficients)¶

모델 평가¶

활동 2: Logistic Regression¶

[Linear Regression Approach]¶

[ Log-Odds ]¶

[ Sigmoid Function ]¶

[ Log-Loss ]¶

[ Classification Thresholding ]¶

[ Scikit-Learn ]¶

과제¶

과제 2¶

TAEWON KIM

Recent posts

Newsletter

Comments