import numpy as np

# Seed NumPy's random generator so the notebook produces the same results on every run
np.random.seed(42)

# Render plots inline and use larger axis/tick label fonts for cleaner figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


import urllib.request

# Download the helper module with the linear-algebra utilities and store it in
# the working directory so that it can be imported as `LA` below.
# NOTE(review): this fetches code from the repository's master branch and the
# notebook then imports (executes) it; consider pinning a commit or vendoring
# the file if reproducibility or trust is a concern.
file_url = "https://raw.githubusercontent.com/codingalzi/pydata/master/notebooks/pydata06_linear_algebra_basics.py"
file_name = "pydata06_linear_algebra_basics.py"
urllib.request.urlretrieve(file_url, file_name)

('pydata06_linear_algebra_basics.py',
 <http.client.HTTPMessage at 0x7fae25084610>)


import pydata06_linear_algebra_basics as LA


def sum_of_squares(v: LA.Vector) -> float:
    """Return the sum of the squares of the elements of ``v``.

    The value is obtained as the dot product of ``v`` with itself,
    delegated to ``LA.dot``.
    """
    dot_with_itself = LA.dot(v, v)
    return dot_with_itself


def square(x: float) -> float:
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared


def square_prime(x: float) -> float:
    """Return the exact derivative of ``square`` at ``x``, i.e. ``2 * x``."""
    slope_at_x = 2 * x
    return slope_at_x


from typing import Callable

def difference_quotient(f: Callable[[float], float],
                        x: float,
                        h: float) -> float:
    """Approximate the derivative of ``f`` at ``x`` with a forward difference.

    Args:
        f: Function whose derivative is estimated.
        x: Point at which the derivative is estimated.
        h: Amount by which ``x`` is shifted.

    Returns:
        The quotient ``(f(x + h) - f(x)) / h``.
    """
    # Evaluate the shifted point first, then the base point.
    value_after_shift = f(x + h)
    value_at_x = f(x)

    change_in_f = value_after_shift - value_at_x
    return change_in_f / h


# Step size for the difference quotient and the integer sample points -10..10
h = 0.001
xs = range(-10, 11)

# Exact derivative values and their forward-difference estimates at each point
actuals = [square_prime(x) for x in xs]
estimates = [difference_quotient(square, x, h) for x in xs]

plt.title("Actual Derivatives vs. Estimates")

# Exact derivative (red solid line)
plt.plot(xs, actuals, 'r-', label='square_prime') 

# Estimated derivative (black dots)
plt.plot(xs, estimates, 'k.', label='Estimates')
plt.legend()
plt.show()


def partial_difference_quotient(f: Callable[[LA.Vector], float],
                                v: LA.Vector,
                                i: int,
                                h: float) -> float:
    """Approximate the ``i``-th partial derivative of ``f`` at ``v``.

    Args:
        f: Function of a vector whose partial derivative is estimated.
        v: Point (vector of arguments) at which to estimate.
        i: Index of the coordinate that is perturbed.
        h: Amount added to the ``i``-th coordinate.

    Returns:
        ``(f(w) - f(v)) / h`` where ``w`` equals ``v`` except that ``h``
        has been added to coordinate ``i``.
    """
    # Build a shifted copy of v; only coordinate i receives the increment.
    shifted = []
    for position, component in enumerate(v):
        increment = h if position == i else 0
        shifted.append(component + increment)

    return (f(shifted) - f(v)) / h


def estimate_gradient(f: Callable[[LA.Vector], float],
                      v: LA.Vector,
                      h: float = 0.0001):
    """Approximate the gradient of ``f`` at ``v``.

    Returns a list with one entry per coordinate of ``v``; entry ``i`` is
    ``partial_difference_quotient(f, v, i, h)``.
    """
    partials = []
    for i in range(len(v)):
        partials.append(partial_difference_quotient(f, v, i, h))
    return partials


def sum_of_squares_gradient(v: LA.Vector) -> LA.Vector:
    """Return the exact gradient of the sum of squares at ``v``.

    Each component of the result is twice the matching component of ``v``.
    """
    doubled = []
    for component in v:
        doubled.append(2 * component)
    return doubled


import random

def gradient_step(v: LA.Vector, gradient: LA.Vector, learning_rate: float) -> LA.Vector:
    """Take one gradient-descent step starting from ``v``.

    The gradient is scaled by ``learning_rate`` with ``LA.scalar_multV`` and
    the result is subtracted from ``v`` with ``LA.subtractV``, i.e. the point
    moves in the direction opposite to the gradient.
    """
    return LA.subtractV(v, LA.scalar_multV(learning_rate, gradient))


# Seed the standard-library RNG so the starting point is reproducible
random.seed(42)

# Randomly chosen starting point (three coordinates drawn from [-10, 10])
v = [random.uniform(-10, 10) for i in range(3)]

# Repeat gradient_step 1000 times
for epoch in range(1000):
    grad = sum_of_squares_gradient(v)  # compute the gradient at the current point
    
    v = gradient_step(v, grad, 0.01)   # update the coordinates
    
    # Report progress every 100 epochs
    if epoch%100 == 0:
        print(epoch, v)

# Note: `grad` below is the gradient evaluated just before the final update of v
print("\n----\n")        
print(f"그레이디언트의 최종 값: {grad}")
print(f"v의 최후 위치와 최솟점 사이의 거리: {LA.distance(v, [0, 0, 0])}")

0 [2.732765249774521, -9.309789197635727, -4.409425359965263]
100 [0.3624181137897112, -1.2346601088642208, -0.5847760329896551]
200 [0.048063729299005625, -0.16374007531854073, -0.07755273779298358]
300 [0.006374190434279768, -0.02171513607091833, -0.010285009644527733]
400 [0.0008453423045827667, -0.002879851701919325, -0.0013639934114305214]
500 [0.00011210892101281368, -0.00038192465375128993, -0.00018089220046728499]
600 [1.4867835316559317e-05, -5.065067796575345e-05, -2.3989843290795995e-05]
700 [1.971765716798422e-06, -6.717270417586384e-06, -3.1815223632100903e-06]
800 [2.6149469369030636e-07, -8.908414196052704e-07, -4.2193208287814765e-07]
900 [3.467931014604295e-08, -1.1814299344070243e-07, -5.5956445449048146e-08]

----

그레이디언트의 최종 값: [9.57758165411209e-09, -3.262822016281278e-08, -1.5453808714914104e-08]
v의 최후 위치와 최솟점 사이의 거리: 1.830234305038648e-08


# x takes the 100 values -0.50, -0.49, ..., 0.49
X = [x/100 for x in range(-50, 50)]

# Add some noise: one draw per sample from random.randrange(-100, 100) / 100,
# i.e. uniform over the multiples of 0.01 in [-1.00, 0.99] (this is NOT Gaussian noise)
error = [random.randrange(-100,100)/100 for _ in range(-50, 50)]

# y = 5 + 20*x + noise
y = [5 + 20*x + e for x, e in zip(X, error)]

# List of (x, y) coordinate pairs
inputs = list(zip(X, y))


# Scatter plot of the noisy training samples (black dots)
plt.plot(X, y, 'k.')
plt.show()


def linear_gradient(x: float, y_x: float, theta: LA.Vector) -> LA.Vector:
    """Gradient of the squared error of a linear model for a single sample.

    Args:
        x: Input value of the sample.
        y_x: Observed target for ``x``.
        theta: Pair ``[intercept, slope]`` describing the model.

    Returns:
        ``[2 * error, 2 * error * x]`` where ``error`` is the prediction
        ``intercept + slope * x`` minus ``y_x``; these are the partial
        derivatives of ``error ** 2`` with respect to intercept and slope.
    """
    intercept, slope = theta
    residual = intercept + slope * x - y_x

    d_intercept = 2 * residual
    d_slope = 2 * residual * x
    return [d_intercept, d_slope]


# Start from a random intercept and slope
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

learning_rate = 0.001

for epoch in range(5000):
    # Gradient for this epoch: the per-sample squared-error gradients over the
    # whole training set, combined with LA.vector_mean
    grad = LA.vector_mean([linear_gradient(x, y_x, theta) for x, y_x in inputs])
    
    # Update theta
    theta = gradient_step(theta, grad, learning_rate)
    
    # Report progress once every 500 epochs
    if epoch % 500 == 0:
        print(epoch, theta)

# theta is ordered as [intercept, slope]
intercept, slope = theta

print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [-0.5498750271006548, -0.348065046729531]
500 [2.8916275137318697, 1.2779917858845682]
1000 [4.161340996373953, 2.78438692444515]
1500 [4.63252303459113, 4.174144289893471]
2000 [4.809879153546342, 5.454184661194932]
2500 [4.878918399660604, 6.6323964362145]
3000 [4.907842395609395, 7.716596269085061]
3500 [4.921739999085596, 8.714181270348421]
4000 [4.929854101282013, 9.632032625512535]
4500 [4.935602366135786, 10.476509096849366]
최종 기울기: 11.252
최종 절편: 4.940


# Start from a random intercept and slope
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

learning_rate = 0.001

# Same learning rate as before, but with 20000 epochs
for epoch in range(20000):
    # Gradient for this epoch: the per-sample squared-error gradients over the
    # entire training set, combined with LA.vector_mean
    grad = LA.vector_mean([linear_gradient(x, y_x, theta) for x, y_x in inputs])
    
    # Update theta: move against the gradient, scaled by the learning rate
    theta = gradient_step(theta, grad, learning_rate)
    
    # Report progress once every 1000 epochs
    if epoch % 1000 == 0:
        print(epoch, theta)

# theta is ordered as [intercept, slope]
intercept, slope = theta

print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [0.5453216196333258, -0.8846541803415932]
1000 [4.3071961393797, 2.3344318287971526]
2000 [4.827835026420881, 5.073878601499392]
3000 [4.908792115820234, 7.394749506518404]
4000 [4.928733502968913, 9.359603724571762]
5000 [4.939051515502755, 11.022864887660349]
6000 [4.9468993258866565, 12.430800212519703]
7000 [4.953422706517003, 13.62260111714172]
8000 [4.958928503638361, 14.631446280141436]
9000 [4.96358691111091, 15.485421540772457]
10000 [4.967529901914979, 16.208301288794544]
11000 [4.97086755619471, 16.82021026697407]
12000 [4.973692834861904, 17.338183828260746]
13000 [4.976084398412164, 17.776642193387854]
14000 [4.97810882798133, 18.147791905104715]
15000 [4.979822483153499, 18.46196565445812]
16000 [4.9812730715626445, 18.727909939652793]
17000 [4.982500977134955, 18.95302856580554]
18000 [4.98354038437498, 19.14358876454719]
19000 [4.98442023005287, 19.30489567177664]
최종 기울기: 19.441
최종 절편: 4.985


# Start from a random intercept and slope
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Ten times larger learning rate than in the previous runs
learning_rate = 0.01

for epoch in range(5000):
    # Gradient for this epoch: the per-sample squared-error gradients over the
    # entire training set, combined with LA.vector_mean
    grad = LA.vector_mean([linear_gradient(x, y_x, theta) for x, y_x in inputs])
    
    # Update theta: move against the gradient, scaled by the learning rate
    theta = gradient_step(theta, grad, learning_rate)
    
    # Report progress once every 500 epochs
    if epoch % 500 == 0:
        print(epoch, theta)

# theta is ordered as [intercept, slope]
intercept, slope = theta

print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [0.7285600934134986, 0.6423040193614589]
500 [4.942726752245414, 11.692237628170421]
1000 [4.9691293209021055, 16.501531174187775]
1500 [4.980523087447129, 18.590411261023977]
2000 [4.985471879329149, 19.497700269800106]
2500 [4.987621349042239, 19.891774276545842]
3000 [4.988554954689417, 20.062937292178695]
3500 [4.988960459125764, 20.137280632310304]
4000 [4.989136586860627, 20.16957109062855]
4500 [4.989213086588395, 20.183596202986426]
최종 기울기: 20.190
최종 절편: 4.989


# Predictions of the fitted line at every training input
y_hat = [intercept + slope * x for x in X]

# Distribution of the actual data (black dots)
plt.plot(X, y, 'k.', label='Actuals')
# Fitted line (red solid line)
plt.plot(X, y_hat, 'r-', label='Estimates')

plt.title("Linear Regression")
plt.legend()

plt.show()


from typing import List, Iterator

# 제너레이터 함수 정의
def minibatches(dataset: List[float],
                batch_size: int,
                shuffle: bool = True) -> Iterator[List[float]]:
    """Generate mini-batches by slicing ``dataset`` into consecutive chunks.

    Args:
        dataset: The full data set (anything supporting ``len`` and slicing).
        batch_size: Number of items per mini-batch; the last chunk may be
            shorter when ``len(dataset)`` is not a multiple of it.
        shuffle: When true, the order in which the chunks are produced is
            randomized with ``random.shuffle``; the items inside each chunk
            stay contiguous and in their original order.

    Yields:
        ``dataset[start:start + batch_size]`` for each chunk start.
    """
    # Chunk start offsets: 0, batch_size, 2 * batch_size, ... below len(dataset)
    offsets = list(range(0, len(dataset), batch_size))

    if shuffle:
        random.shuffle(offsets)

    for offset in offsets:
        yield dataset[offset:offset + batch_size]


# Start from a random intercept and slope
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Learning rate
learning_rate = 0.001

# 1000 epochs
for epoch in range(1000):
    # The mini-batch size is 20, so with the 100 training samples every epoch
    # performs 5 gradient computations, each followed by an update of theta
    
    for batch in minibatches(inputs, batch_size=20):
        grad = LA.vector_mean([linear_gradient(x, y_x, theta) for x, y_x in batch])
        theta = gradient_step(theta, grad, learning_rate)

    # Report progress once every 100 epochs
    if epoch % 100 == 0:
        print(epoch, theta)

# theta is ordered as [intercept, slope]
intercept, slope = theta

print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [-0.14700601161032584, -0.850324957636322]
100 [3.0383003005334195, 0.81879593562066]
200 [4.21458258079334, 2.363808765349366]
300 [4.65037766647808, 3.7887387104243513]
400 [4.814916905651929, 5.100937311842601]
500 [4.880054707558669, 6.308581062740375]
600 [4.906941797916403, 7.419765388918784]
700 [4.921300749140654, 8.442074919728974]
800 [4.928451522266928, 9.382601070352939]
900 [4.934261548296817, 10.247863490712117]
최종 기울기: 11.036
최종 절편: 4.939


# Start from a random intercept and slope
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Learning rate (ten times larger than in the previous mini-batch run)
learning_rate = 0.01

# 1000 epochs
for epoch in range(1000):
    # The mini-batch size is 20, so with the 100 training samples every epoch
    # performs 5 gradient computations, each followed by an update of theta

    for batch in minibatches(inputs, batch_size=20):
        grad = LA.vector_mean([linear_gradient(x, y_x, theta) for x, y_x in batch])
        theta = gradient_step(theta, grad, learning_rate)
    
    # Report progress once every 100 epochs
    if epoch % 100 == 0:
        print(epoch, theta)

# theta is ordered as [intercept, slope]
intercept, slope = theta

print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [1.237825277922001, 0.7249192271332124]
100 [4.952368308578804, 11.798388090708]
200 [4.970608256534244, 16.578080917985734]
300 [4.983392717759511, 18.637033407981054]
400 [4.986056813379833, 19.523825735584676]
500 [4.987595092175629, 19.90597277649338]
600 [4.988266095521663, 20.070316968833907]
700 [4.989040195003638, 20.14103697686222]
800 [4.989672228772879, 20.17161547560229]
900 [4.988703699118148, 20.184765698131514]
최종 기울기: 20.190
최종 절편: 4.989


# Start from a random intercept and slope
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Learning rate
learning_rate = 0.01

# 3000 epochs
for epoch in range(3000):
    # The mini-batch size is 20, so with the 100 training samples every epoch
    # performs 5 gradient computations, each followed by an update of theta
    
    for batch in minibatches(inputs, batch_size=20):
        grad = LA.vector_mean([linear_gradient(x, y_x, theta) for x, y_x in batch])
        theta = gradient_step(theta, grad, learning_rate)
    
    # Report progress once every 300 epochs
    if epoch % 300 == 0:
        print(epoch, theta)

# theta is ordered as [intercept, slope]
intercept, slope = theta

print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [-0.43549987222961334, 0.26254473969499653]
300 [4.978902536543744, 18.599116202912136]
600 [4.988010821486811, 20.067010548132913]
900 [4.989101560704833, 20.18451199354063]
1200 [4.989313445216165, 20.193984886037637]
1500 [4.98927937941379, 20.195015565259936]
1800 [4.988570103335046, 20.195105271575724]
2100 [4.989706094445955, 20.194956727051277]
2400 [4.990361301352804, 20.194971623699445]
2700 [4.989683544148361, 20.19504893855309]
최종 기울기: 20.194
최종 절편: 4.989


# Start from a random intercept and slope
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Learning rate
learning_rate = 0.001

# 1000 epochs
for epoch in range(1000):
    
    # Stochastic gradient descent: update theta immediately after evaluating
    # the gradient for each individual training sample (samples are visited
    # in their stored order, without shuffling)
    for x, y_x in inputs:
        grad = linear_gradient(x, y_x, theta)
        theta = gradient_step(theta, grad, learning_rate)
    
    # Report progress once every 100 epochs
    if epoch % 100 == 0:
        print(epoch, theta)

# theta is ordered as [intercept, slope]
intercept, slope = theta

print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [0.550382182232026, 0.9895536437047407]
100 [5.032490886155949, 16.554658325069674]
200 [4.997519092010245, 19.51236265820966]
300 [4.990918695543546, 20.070584404084553]
400 [4.989672971014022, 20.175940275546683]
500 [4.9894378594398585, 20.19582459523417]
600 [4.9893934857031255, 20.19957745840122]
700 [4.989385110834649, 20.20028575429328]
800 [4.989383530205495, 20.200419434379114]
900 [4.989383231885758, 20.20044466446389]
최종 기울기: 20.200
최종 절편: 4.989


# Start from a random intercept and slope
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Learning rate (ten times larger than in the previous stochastic run)
learning_rate = 0.01

# 1000 epochs
for epoch in range(1000):
    # One theta update per training sample, visited in stored order
    for x, y_x in inputs:
        grad = linear_gradient(x, y_x, theta)
        theta = gradient_step(theta, grad, learning_rate)
    # Report progress once every 100 epochs
    if epoch % 100 == 0:
        print(epoch, theta)

# theta is ordered as [intercept, slope]
intercept, slope = theta

print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [6.8116974087303825, 2.3020744483619793]
100 [4.990193170861476, 20.26103005086055]
200 [4.990192825899716, 20.26103222379059]
300 [4.990192825899663, 20.26103222379092]
400 [4.990192825899663, 20.26103222379092]
500 [4.990192825899663, 20.26103222379092]
600 [4.990192825899663, 20.26103222379092]
700 [4.990192825899663, 20.26103222379092]
800 [4.990192825899663, 20.26103222379092]
900 [4.990192825899663, 20.26103222379092]
최종 기울기: 20.261
최종 절편: 4.990


from sklearn import linear_model

# scikit-learn's SGD-based linear regressor, created with its default hyperparameters
linreg_sgd = linear_model.SGDRegressor()


# Convert the Python lists to NumPy arrays for scikit-learn:
# np.c_ turns the 1-D list of inputs into a single-column 2-D array,
# and the targets become a 1-D array
X = np.c_[X]
y = np.array(y)


# Train the regressor on the training inputs and targets
linreg_sgd.fit(X, y)

SGDRegressor()


# intercept_ and coef_ are indexable; element 0 is the fitted intercept and the
# coefficient (slope) of the single input feature, respectively
t0, t1 = linreg_sgd.intercept_[0], linreg_sgd.coef_[0]

print(f"절편:\t {t0}")
print(f"기울기:\t {t1}")

절편:	 4.982221986296401
기울기:	 18.840822703467992


# Same model, but with the initial learning rate eta0 set to 0.001;
# all other hyperparameters (including max_iter) keep their defaults
linreg_sgd_1 = linear_model.SGDRegressor(eta0=0.001)
linreg_sgd_1.fit(X, y)

# Fitted intercept and slope
t0, t1 = linreg_sgd_1.intercept_[0], linreg_sgd_1.coef_[0]

print(f"절편:\t {t0}")
print(f"기울기:\t {t1}")

절편:	 4.927561524163672
기울기:	 9.365537800747957

/Users/gslee/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_stochastic_gradient.py:1208: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn("Maximum number of iteration reached before "


# eta0=0.001 again, now allowing up to 5000 passes over the data (max_iter)
linreg_sgd_1 = linear_model.SGDRegressor(eta0=0.001, max_iter=5000)
linreg_sgd_1.fit(X, y)

# Fitted intercept and slope
t0, t1 = linreg_sgd_1.intercept_[0], linreg_sgd_1.coef_[0]

print(f"절편:\t {t0}")
print(f"기울기:\t {t1}")

절편:	 4.95832372425737
기울기:	 14.503989381717352


# eta0=0.001 with an even larger iteration budget (max_iter=50000)
linreg_sgd_1 = linear_model.SGDRegressor(eta0=0.001, max_iter=50000)
linreg_sgd_1.fit(X, y)

# Fitted intercept and slope
t0, t1 = linreg_sgd_1.intercept_[0], linreg_sgd_1.coef_[0]

print(f"절편:\t {t0}")
print(f"기울기:\t {t1}")

절편:	 4.95823344002572
기울기:	 14.50321304652143


# Larger initial learning rate (eta0=0.03) with the default max_iter
linreg_sgd_1 = linear_model.SGDRegressor(eta0=0.03)
linreg_sgd_1.fit(X, y)

# Fitted intercept and slope
t0, t1 = linreg_sgd_1.intercept_[0], linreg_sgd_1.coef_[0]

print(f"절편:\t {t0}")
print(f"기울기:\t {t1}")

절편:	 4.983589984969054
기울기:	 19.604285712686572


# eta0=0.03 combined with a larger iteration budget (max_iter=10000)
linreg_sgd_1 = linear_model.SGDRegressor(eta0=0.03, max_iter=10000)
linreg_sgd_1.fit(X, y)

# Fitted intercept and slope
t0, t1 = linreg_sgd_1.intercept_[0], linreg_sgd_1.coef_[0]

print(f"절편:\t {t0}")
print(f"기울기:\t {t1}")

절편:	 4.985259928226336
기울기:	 19.597986815558308


# Exercise data set: 100 inputs drawn uniformly from [0, 2) and targets
# y = 4 + 3*x plus standard-normal noise (np.random.randn)
X = 2 * np.random.rand(100)
y = 4 + 3 * X + np.random.randn(100)


# Exercise: replace the `pass` statement with appropriate code.

pass

# Axis labels, fixed axis ranges (x in [0, 2], y in [0, 15]) and rendering of the figure
plt.xlabel("$x_1$", fontsize=18)             
plt.ylabel("$y$", rotation=0, fontsize=18)   
plt.axis([0, 2, 0, 15])                      
plt.show()


def gradient_step(v, gradient, learning_rate):
    """Take one gradient-descent step using plain arithmetic operators.

    Unlike the earlier list-based version, this relies only on ``*`` and
    ``-``, so it works for operands that support them element-wise
    (e.g. NumPy arrays) as well as for plain numbers.

    Returns:
        ``v - gradient * learning_rate``.
    """
    return v - gradient * learning_rate


# Exercise: replace `pass` and `None` with appropriate code and value, respectively.
def linear_gradient(X, y, theta):
    """Exercise stub for the squared-error gradient of a linear model.

    As written, the function computes the prediction and the error but
    always returns ``None``; the gradient computation is left to the reader.

    theta is indexed as ``theta[0]`` (intercept) and ``theta[1]`` (slope).
    """
    
    intercept, slope = theta[0], theta[1]           
    predicted = intercept + slope * X  
    error = (predicted - y) 
    pass

    return None


# Exercise: replace the `pass` statement with code implementing batch gradient descent.

# Random initial [intercept, slope], each drawn uniformly from [-1, 1)
theta = np.random.uniform(-1, 1, 2)

pass


# Exercise: replace the `pass` statement with code implementing mini-batch gradient descent.

# Fresh random initial parameters
theta = np.random.uniform(-1, 1, 2)

pass


# Exercise: replace the `pass` statement with code implementing stochastic gradient descent.

# Fresh random initial parameters
theta = np.random.uniform(-1, 1, 2)

pass

변수	의미
intercept	$\theta_0$ (절편)
slope	$\theta_1$ (기울기)
predicted	$\hat y$ (예측값)
error	$\hat y - y$ (오차)
grad	샘플 $x$에서 계산한 제곱오차의 $(\theta_0, \theta_1)$에 대한 그레이디언트

머신러닝 맛보기 2편¶

주요 내용¶

기본 설정¶

주의사항¶

준비사항¶

1. 경사하강법¶

경사하강법 기본 아이디어¶

그레이디언트의 정의와 의미¶

경사하강법 작동 방식¶

주의사항¶

2. 파이썬으로 그레이디언트 계산¶

단변수 함수의 도함수 계산¶

도함수 근사치¶

다변수 함수의 그레이디언트 계산¶

편도함수 근사치¶

3. 경사하강법으로 최솟점 구하기¶

에포크와 학습률¶

4. 경사하강법과 선형회귀¶

훈련 세트 준비¶

목표¶

비용함수¶

비용함수의 그레이디언트¶

경사하강법 활용¶

MSE() 함수의 최솟값 지점 찾기¶

5. 배치/미니배치/확률적 경사하강법¶

미니배치 경사하강법¶

확률적 경사하강법(SGD)¶

경사하강법 비교¶

사이킷런의 확률적 경사하강법 모델¶

연습문제¶

문제 1¶

문제 2¶

문제 3¶

문제 4¶

문제 5¶

문제 6¶