# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the (major, minor) numbers numerically. A plain string comparison is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_version and tuple(map(int, _sk_version.groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "unsupervised_learning"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    The file is named ``<fig_id>.<fig_extension>``. When ``tight_layout`` is
    true, ``plt.tight_layout()`` is applied before saving. ``resolution`` is
    forwarded to ``plt.savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, dpi=resolution, format=fig_extension)


from sklearn.datasets import load_iris


data = load_iris()

X = data.data
y = data.target


data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')


plt.figure(figsize=(9, 3.5))

# Left panel: classification -- one marker style per labelled class
plt.subplot(121)
species_styles = (
    (0, "yo", "Iris setosa"),
    (1, "bs", "Iris versicolor"),
    (2, "g^", "Iris virginica"),
)
for species_id, marker_fmt, species_name in species_styles:
    is_species = y == species_id
    plt.plot(X[is_species, 2], X[is_species, 3], marker_fmt, label=species_name)
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(fontsize=12)

# Right panel: clustering -- the same two columns without any label information
plt.subplot(122)
plt.scatter(X[:, 2], X[:, 3], c="k", marker=".")
plt.xlabel("Petal length", fontsize=14)
plt.tick_params(labelleft=False)

save_fig("classification_vs_clustering_plot")
plt.show()

Saving figure classification_vs_clustering_plot


from sklearn.mixture import GaussianMixture

y_pred = GaussianMixture(n_components=3, random_state=42).fit(X).predict(X)


plt.figure(figsize=(9, 3.5))

# Left panel: the true classes
plt.subplot(121)
true_class_styles = (
    (0, "yo", "Iris setosa"),
    (1, "bs", "Iris versicolor"),
    (2, "g^", "Iris virginica"),
)
for species_id, marker_fmt, species_name in true_class_styles:
    is_species = y == species_id
    plt.plot(X[is_species, 2], X[is_species, 3], marker_fmt, label=species_name)
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(fontsize=12)

# Right panel: the three predicted clusters.
# The cluster indices are hard-coded so that the marker styles line up with
# the left panel: cluster 2 ~ setosa, cluster 0 ~ versicolor, cluster 1 ~ virginica.
plt.subplot(122)
cluster_styles = (
    (2, "yo", "Cluster 1"),
    (0, "bs", "Cluster 2"),
    (1, "g^", "Cluster 3"),
)
for cluster_index, marker_fmt, cluster_name in cluster_styles:
    in_cluster = y_pred == cluster_index
    plt.plot(X[in_cluster, 2], X[in_cluster, 3], marker_fmt, label=cluster_name)
plt.xlabel("Petal length", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
plt.tick_params(labelleft=False)

# NOTE(review): this fig id is also used by the earlier classification-vs-clustering
# figure, so that PNG gets overwritten here -- confirm this is intended.
save_fig("classification_vs_clustering_plot")
plt.show()

Saving figure classification_vs_clustering_plot


from scipy import stats

# Maps each cluster index to the class id whose samples fall mostly into it.
mapping = {}

for class_id in np.unique(y):                   # class ids: 0, 1, 2
    # Most frequent cluster index among the samples of this class.
    mode, _ = stats.mode(y_pred[y==class_id])
    # Older SciPy returns a length-1 array here, SciPy >= 1.11 returns a
    # scalar; np.atleast_1d makes the [0] lookup valid in both cases.
    mapping[np.atleast_1d(mode)[0]] = class_id  # link cluster index -> class id


mapping

{2: 0, 0: 1, 1: 2}


y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])

np.sum(y_pred==y) / len(y_pred)

0.9666666666666667


blob_centers = np.array(
    [[ 0.2,  2.3],
     [-1.5 ,  2.3],
     [-2.8,  1.8],
     [-2.8,  2.8],
     [-2.8,  1.3]])

blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])


from sklearn.datasets import make_blobs


X, _ = make_blobs(n_samples=2000, centers=blob_centers,
                  cluster_std=blob_std, random_state=7)


def plot_clusters(X, y=None):
    """Scatter-plot the first two columns of ``X`` on the current axes.

    ``y`` is passed to ``plt.scatter`` as the colour argument ``c``; with the
    default ``None`` matplotlib picks the colour itself.
    """
    first_feature, second_feature = X[:, 0], X[:, 1]
    plt.scatter(first_feature, second_feature, s=1, c=y)
    plt.xlabel("$x_1$", fontsize=14)
    plt.ylabel("$x_2$", rotation=0, fontsize=14)


plt.figure(figsize=(8, 4))
plot_clusters(X)
save_fig("blobs_plot")
plt.show()

Saving figure blobs_plot


from sklearn.cluster import KMeans


k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X)

KMeans(n_clusters=5, random_state=42)


kmeans.labels_

array([0, 4, 1, ..., 2, 1, 4])


kmeans.cluster_centers_

array([[-2.80037642,  1.30082566],
       [ 0.20876306,  2.25551336],
       [-2.79290307,  2.79641063],
       [-1.46679593,  2.28585348],
       [-2.80389616,  1.80117999]])


X_new = np.array([[0, 2], [3, 2], [-3, 3], [-3, 2.5]])

kmeans.predict(X_new)

array([1, 1, 2, 2])


# Scatter plot of the samples

def plot_data(X):
    """Plot the first two columns of ``X`` as small black dots."""
    horizontal, vertical = X[:, 0], X[:, 1]
    plt.plot(horizontal, vertical, 'k.', markersize=2)


# Draw the centroids
def plot_centroids(centroids, weights=None, circle_color='w', cross_color='k'):
    """Mark each centroid with a filled circle and a cross drawn on top of it.

    When ``weights`` is given, only the centroids whose weight is greater than
    one tenth of the largest weight are drawn.
    """
    if weights is not None:
        heavy_enough = weights > weights.max() / 10
        centroids = centroids[heavy_enough]

    cx, cy = centroids[:, 0], centroids[:, 1]

    # Circle layer first ...
    plt.scatter(cx, cy, marker='o', s=35, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)

    # ... then the cross, at a higher zorder so it sits above the circle.
    plt.scatter(cx, cy, marker='x', s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)


def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    """Draw the regions a fitted clusterer assigns to each cluster.

    A ``resolution`` x ``resolution`` grid spanning the range of ``X`` (padded
    by 0.1 on every side) is labelled with ``clusterer.predict`` and rendered
    as filled contours with black boundary lines. The samples in ``X`` are
    plotted on top, followed by ``clusterer.cluster_centers_`` when
    ``show_centroids`` is true. ``show_xlabels`` / ``show_ylabels`` choose
    between an axis label and hiding that axis' tick labels.
    """
    # Background: predicted cluster for every point of a regular grid
    lower = X.min(axis=0) - 0.1
    upper = X.max(axis=0) + 0.1
    grid_x, grid_y = np.meshgrid(np.linspace(lower[0], upper[0], resolution),
                                 np.linspace(lower[1], upper[1], resolution))

    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    regions = clusterer.predict(grid_points).reshape(grid_x.shape)

    bounds = (lower[0], upper[0], lower[1], upper[1])
    plt.contourf(regions, extent=bounds, cmap="Pastel2")
    plt.contour(regions, extent=bounds, linewidths=1, colors='k')

    # Scatter plot of the training samples
    plot_data(X)

    # Centroids
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)

    # Axis labels, or hidden tick labels when a label is not wanted
    if not show_xlabels:
        plt.tick_params(labelbottom=False)
    else:
        plt.xlabel("$x_1$", fontsize=14)

    if not show_ylabels:
        plt.tick_params(labelleft=False)
    else:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)


plt.figure(figsize=(8, 4))

plot_decision_boundaries(kmeans, X)

save_fig("voronoi_plot")
plt.show()

Saving figure voronoi_plot


kmeans.transform(X_new)

array([[2.88633901, 0.32995317, 2.9042344 , 1.49439034, 2.81093633],
       [5.84236351, 2.80290755, 5.84739223, 4.4759332 , 5.80730058],
       [1.71086031, 3.29399768, 0.29040966, 1.69136631, 1.21475352],
       [1.21567622, 3.21806371, 0.36159148, 1.54808703, 0.72581411]])


np.linalg.norm(np.tile(X_new, (1, k)).reshape(-1, k, 2) - kmeans.cluster_centers_, axis=2)

array([[2.88633901, 0.32995317, 2.9042344 , 1.49439034, 2.81093633],
       [5.84236351, 2.80290755, 5.84739223, 4.4759332 , 5.80730058],
       [1.71086031, 3.29399768, 0.29040966, 1.69136631, 1.21475352],
       [1.21567622, 3.21806371, 0.36159148, 1.54808703, 0.72581411]])


# Three models that differ only in max_iter (1, 2 and 3): same random
# initialisation (random_state=0, a single init), stopped after that many
# iterations.
# NOTE(review): algorithm="full" is the older scikit-learn spelling; newer
# releases appear to call it "lloyd" -- confirm against the installed version.
kmeans_iter1, kmeans_iter2, kmeans_iter3 = [
    KMeans(n_clusters=5, init="random", n_init=1,
           algorithm="full", max_iter=iteration_cap, random_state=0)
    for iteration_cap in (1, 2, 3)
]

for early_model in (kmeans_iter1, kmeans_iter2):
    early_model.fit(X)
kmeans_iter3.fit(X)

KMeans(algorithm='full', init='random', max_iter=3, n_clusters=5, n_init=1,
       random_state=0)


# 3x2 grid that walks through the first three k-means iterations.
# Left column: samples/regions with the next centroid positions drawn on top;
# right column: the regions produced by the model fitted for that many iterations.
plt.figure(figsize=(10, 8))

# Top left: raw samples with the centroids of the 1-iteration model
plt.subplot(321)
plot_data(X)
plot_centroids(kmeans_iter1.cluster_centers_, circle_color='r', cross_color='w')
plt.ylabel("$x_2$", fontsize=14, rotation=0)
plt.tick_params(labelbottom=False)
plt.title("Update the centroids (initially randomly)", fontsize=14)

# Top right: regions of the 1-iteration model
plt.subplot(322)
plot_decision_boundaries(kmeans_iter1, X, show_xlabels=False, show_ylabels=False)
plt.title("Label the instances", fontsize=14)

# Middle left: regions of the 1-iteration model, overlaid with the 2-iteration centroids
plt.subplot(323)
plot_decision_boundaries(kmeans_iter1, X, show_centroids=False, show_xlabels=False)
plot_centroids(kmeans_iter2.cluster_centers_)

# Middle right: regions of the 2-iteration model
plt.subplot(324)
plot_decision_boundaries(kmeans_iter2, X, show_xlabels=False, show_ylabels=False)

# Bottom left: regions of the 2-iteration model, overlaid with the 3-iteration centroids
plt.subplot(325)
plot_decision_boundaries(kmeans_iter2, X, show_centroids=False)
plot_centroids(kmeans_iter3.cluster_centers_)

# Bottom right: regions of the 3-iteration model
plt.subplot(326)
plot_decision_boundaries(kmeans_iter3, X, show_ylabels=False)

save_fig("kmeans_algorithm_plot")
plt.show()

Saving figure kmeans_algorithm_plot


def plot_clusterer_comparison(clusterer1, clusterer2, X, title1=None, title2=None):
    """Fit both clusterers on ``X`` and draw their decision regions side by side.

    ``clusterer1`` goes in the left panel (with a y-axis label) and
    ``clusterer2`` in the right panel (y tick labels hidden). A panel gets a
    title only when the matching ``title1`` / ``title2`` is truthy.
    """
    for model in (clusterer1, clusterer2):
        model.fit(X)

    plt.figure(figsize=(10, 3.2))

    panels = (
        (121, clusterer1, title1, True),
        (122, clusterer2, title2, False),
    )
    for position, model, title, with_ylabels in panels:
        plt.subplot(position)
        plot_decision_boundaries(model, X, show_ylabels=with_ylabels)
        if title:
            plt.title(title, fontsize=14)


kmeans_rnd_init1 = KMeans(n_clusters=5, init="random", n_init=1,
                         algorithm="full", random_state=2)
kmeans_rnd_init2 = KMeans(n_clusters=5, init="random", n_init=1,
                         algorithm="full", random_state=5)

plot_clusterer_comparison(kmeans_rnd_init1, kmeans_rnd_init2, X,
                          "Solution 1", "Solution 2 (with a different random init)")

save_fig("kmeans_variability_plot")
plt.show()

Saving figure kmeans_variability_plot


kmeans.inertia_

211.5985372581683


X_dist = kmeans.transform(X)  # distances between each sample and every centroid

# Fancy indexing picks, for each row, the column given by that sample's label;
# the squared values are then summed.
np.sum(X_dist[np.arange(len(X_dist)), kmeans.labels_]**2)   # uses fancy indexing

211.59853725816873


kmeans.score(X)

-211.59853725816828


kmeans_rnd_init1.inertia_

219.84385402233193


kmeans_rnd_init2.inertia_

236.95563196978736


kmeans_rnd_10_inits = KMeans(n_clusters=5, init="random", n_init=10,
                              algorithm="full", random_state=2)
kmeans_rnd_10_inits.fit(X)

KMeans(algorithm='full', init='random', n_clusters=5, random_state=2)


plt.figure(figsize=(8, 4))
plot_decision_boundaries(kmeans_rnd_10_inits, X)
plt.show()


kmeans_rnd_10_inits.inertia_

211.60576489487653


blob_centers

array([[ 0.2,  2.3],
       [-1.5,  2.3],
       [-2.8,  1.8],
       [-2.8,  2.8],
       [-2.8,  1.3]])


kmeans.cluster_centers_

array([[-2.80037642,  1.30082566],
       [ 0.20876306,  2.25551336],
       [-2.79290307,  2.79641063],
       [-1.46679593,  2.28585348],
       [-2.80389616,  1.80117999]])


good_init = np.array([[-3, 3], [-3, 2], [-3, 1], [-1, 2], [0, 2]])

kmeans = KMeans(n_clusters=5, init=good_init, n_init=1, random_state=42)
kmeans.fit(X)
kmeans.inertia_

211.62337889822365


%timeit -n 50 KMeans(algorithm="elkan", random_state=42).fit(X)

54 ms ± 905 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


%timeit -n 50 KMeans(algorithm="full", random_state=42).fit(X)

97.8 ms ± 908 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


from sklearn.cluster import MiniBatchKMeans


minibatch_kmeans = MiniBatchKMeans(n_clusters=5, random_state=42)
minibatch_kmeans.fit(X)

MiniBatchKMeans(n_clusters=5, random_state=42)


minibatch_kmeans.inertia_

211.93186531476786


import urllib.request
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1)
mnist.target = mnist.target.astype(np.int64)   # the target's dtype needs to be converted


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    mnist["data"], mnist["target"], random_state=42)


filename = "my_mnist.data"
X_mm = np.memmap(filename, dtype='float32', mode='write', shape=X_train.shape)
X_mm[:] = X_train


minibatch_kmeans = MiniBatchKMeans(n_clusters=10, batch_size=10, random_state=42)
minibatch_kmeans.fit(X_mm)

MiniBatchKMeans(batch_size=10, n_clusters=10, random_state=42)


def load_next_batch(batch_size):
    """Return ``batch_size`` distinct entries of the module-level ``X``.

    The indices are drawn with ``np.random.choice`` without replacement, so
    the result depends on NumPy's global random state.
    """
    picked = np.random.choice(len(X), batch_size, replace=False)
    return X[picked]


k = 5               # number of centroids
n_init = 10         # number of centroid initialisations
n_iterations = 100  # number of centroid adjustment steps
batch_size = 100    # batch size
init_size = 500     # size of the data set used for the k-means++ initialisation candidates


# Hand-rolled mini-batch k-means: n_init independent runs, each trained with
# partial_fit() on random batches; the run with the lowest inertia summed over
# its last few iterations is kept in best_kmeans.
np.random.seed(42)

evaluate_on_last_n_iters = 10  # inertia is accumulated over the last 10 adjustment steps
best_kmeans = None             # best model found so far

for init in range(n_init):     # repeat the initialisation
    
    # Initialise a mini-batch k-means model and train it once with partial_fit()
    minibatch_kmeans = MiniBatchKMeans(n_clusters=k, init_size=init_size)
    X_init = load_next_batch(init_size)
    minibatch_kmeans.partial_fit(X_init)

    # Running sum of the inertia over the last adjustment steps (ad-hoc attribute set on the model)
    minibatch_kmeans.sum_inertia_ = 0
    
    # Adjust the centroids
    for iteration in range(n_iterations):
        X_batch = load_next_batch(batch_size)
        minibatch_kmeans.partial_fit(X_batch)
        
        # Accumulate the inertia
        if iteration >= n_iterations - evaluate_on_last_n_iters:
            minibatch_kmeans.sum_inertia_ += minibatch_kmeans.inertia_

    # Keep the model with the lowest accumulated inertia
    if (best_kmeans is None or
            minibatch_kmeans.sum_inertia_ < best_kmeans.sum_inertia_):
        best_kmeans = minibatch_kmeans


best_kmeans.score(X)

-211.70999744411446


%timeit KMeans(n_clusters=5, random_state=42).fit(X)

27.1 ms ± 370 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


%timeit MiniBatchKMeans(n_clusters=5, random_state=42).fit(X)

12.2 ms ± 825 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


from timeit import timeit


# Row k-1 holds the results for k clusters; column 0 is KMeans, column 1 is
# MiniBatchKMeans.
times = np.empty((100, 2))
inertias = np.empty((100, 2))

# NOTE: this loop rebinds the module-level name k (it is 100 afterwards).
for k in range(1, 101):
    kmeans_ = KMeans(n_clusters=k, random_state=42)
    minibatch_kmeans = MiniBatchKMeans(n_clusters=k, random_state=42)

    # "\r" rewrites the same console line to show progress
    print("\r{}/{}".format(k, 100), end="")
    # timeit() returns the total time for `number` runs (10 fits here). The
    # snippets are resolved through globals(), so kmeans_ and minibatch_kmeans
    # must stay module-level names.
    times[k-1, 0] = timeit("kmeans_.fit(X)", number=10, globals=globals())
    times[k-1, 1]  = timeit("minibatch_kmeans.fit(X)", number=10, globals=globals())
    # Inertia of each model after its last timed fit
    inertias[k-1, 0] = kmeans_.inertia_
    inertias[k-1, 1] = minibatch_kmeans.inertia_

100/100


plt.figure(figsize=(10,4))

k_values = range(1, 101)
# Column 0 -> K-Means (red dashed line), column 1 -> Mini-batch K-Means
# (blue solid line with point markers)
curve_styles = (("r--", "K-Means"), ("b.-", "Mini-batch K-Means"))

# Left panel: inertia as a function of k
plt.subplot(121)
for column, (line_fmt, model_name) in enumerate(curve_styles):
    plt.plot(k_values, inertias[:, column], line_fmt, label=model_name)

plt.xlabel("$k$", fontsize=16)
plt.title("Inertia", fontsize=14)
plt.legend(fontsize=14)
plt.axis([1, 100, 0, 100])

# Right panel: training time as a function of k
plt.subplot(122)
for column, (line_fmt, model_name) in enumerate(curve_styles):
    plt.plot(k_values, times[:, column], line_fmt, label=model_name)

plt.xlabel("$k$", fontsize=16)
plt.title("Training time (seconds)", fontsize=14)
plt.axis([1, 100, 0, 6])

save_fig("minibatch_kmeans_vs_kmeans")
plt.show()

Saving figure minibatch_kmeans_vs_kmeans


kmeans_k3 = KMeans(n_clusters=3, random_state=42)  # 3 clusters
kmeans_k8 = KMeans(n_clusters=8, random_state=42)  # 8 clusters

# plot_clusterer_comparison() fits both models on X before plotting them
plot_clusterer_comparison(kmeans_k3, kmeans_k8, X, "$k=3$", "$k=8$")
save_fig("bad_n_clusters_plot")
plt.show()

Saving figure bad_n_clusters_plot


kmeans_k3.inertia_

653.2223267580945


kmeans_k8.inertia_

118.44108623570082


kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X)
                for k in range(1, 10)]
inertias = [model.inertia_ for model in kmeans_per_k]


plt.figure(figsize=(8, 3.5))

# Inertia as a function of the number of clusters
plt.plot(range(1, 10), inertias, "bo-")

plt.xlabel("$k$", fontsize=14)
plt.ylabel("Inertia", fontsize=14)

# Annotation: the word "Elbow" with an arrow pointing at the k=4 point
# (inertias[0] belongs to k=1, hence the -1 offset)
elbow_k = 4
arrow_style = {"facecolor": "black", "shrink": 0.1}
plt.annotate(
    'Elbow',
    xy=(elbow_k, inertias[elbow_k - 1]),
    xytext=(0.55, 0.55),
    textcoords='figure fraction',
    fontsize=16,
    arrowprops=arrow_style,
)

plt.axis([1, 8.5, 0, 1300])
save_fig("inertia_vs_k_plot")
plt.show()

Saving figure inertia_vs_k_plot


plot_decision_boundaries(kmeans_per_k[4-1], X)
plt.show()


from sklearn.metrics import silhouette_score


silhouette_scores = [silhouette_score(X, model.labels_)
                     for model in kmeans_per_k[1:]]


plt.figure(figsize=(8, 3))
plt.plot(range(2, 10), silhouette_scores, "bo-")

plt.xlabel("$k$", fontsize=14)
plt.ylabel("Silhouette score", fontsize=14)
plt.axis([1.8, 8.5, 0.55, 0.7])
save_fig("silhouette_score_vs_k_plot")
plt.show()

Saving figure silhouette_score_vs_k_plot


from sklearn.metrics import silhouette_samples
from matplotlib.ticker import FixedLocator, FixedFormatter

# Silhouette diagrams for k = 3, 4, 5, 6 on a 2x2 grid: one horizontal band
# per cluster, made of that cluster's sorted per-sample silhouette coefficients.
plt.figure(figsize=(11, 9))

# NOTE: this loop rebinds the module-level name k.
for k in (3, 4, 5, 6):
    plt.subplot(2, 2, k - 2)   # k=3..6 -> subplot 1..4
    
    # kmeans_per_k[0] is the k=1 model, so index k - 1 is the model with k clusters
    y_pred = kmeans_per_k[k - 1].labels_
    silhouette_coefficients = silhouette_samples(X, y_pred)

    padding = len(X) // 30     # vertical gap between the bands of consecutive clusters
    pos = padding              # y position where the next band starts
    ticks = []                 # y position of the middle of each band
    for i in range(k):
        coeffs = silhouette_coefficients[y_pred == i]
        coeffs.sort()          # ascending, in place

        color = mpl.cm.Spectral(i / k)
        # One row per sample, filled from 0 to the sample's coefficient
        plt.fill_betweenx(np.arange(pos, pos + len(coeffs)), 0, coeffs,
                          facecolor=color, edgecolor=color, alpha=0.7)
        ticks.append(pos + len(coeffs) // 2)
        pos += len(coeffs) + padding

    # Label the middle of each band with its cluster index
    plt.gca().yaxis.set_major_locator(FixedLocator(ticks))
    plt.gca().yaxis.set_major_formatter(FixedFormatter(range(k)))
    
    # y label only on the left column
    if k in (3, 5):
        plt.ylabel("Cluster")
    
    # x ticks and label only on the bottom row
    if k in (5, 6):
        plt.gca().set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        plt.xlabel("Silhouette Coefficient")
    else:
        plt.tick_params(labelbottom=False)

    # silhouette_scores[0] belongs to k=2, so index k - 2 is the score for this k
    plt.axvline(x=silhouette_scores[k - 2], color="red", linestyle="--")
    plt.title("$k={}$".format(k), fontsize=16)

save_fig("silhouette_analysis_plot")
plt.show()

Saving figure silhouette_analysis_plot


X1, y1 = make_blobs(n_samples=1000, centers=((4, -4), (0, 0)), random_state=42)
X1 = X1.dot(np.array([[0.374, 0.95], [0.732, 0.598]]))

X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + [6, -8]

X = np.r_[X1, X2]
y = np.r_[y1, y2]


plot_clusters(X)


kmeans_good = KMeans(n_clusters=3, init=np.array([[-1.5, 2.5], [0.5, 0], [4, 0]]), n_init=1, random_state=42)
kmeans_good.fit(X)

KMeans(init=array([[-1.5,  2.5],
       [ 0.5,  0. ],
       [ 4. ,  0. ]]),
       n_clusters=3, n_init=1, random_state=42)


kmeans_bad = KMeans(n_clusters=3, random_state=42)
kmeans_bad.fit(X)

KMeans(n_clusters=3, random_state=42)


plt.figure(figsize=(10, 3.2))

plt.subplot(121)
plot_decision_boundaries(kmeans_good, X)
plt.title("Inertia = {:.1f}".format(kmeans_good.inertia_), fontsize=14)

plt.subplot(122)
plot_decision_boundaries(kmeans_bad, X, show_ylabels=False)
plt.title("Inertia = {:.1f}".format(kmeans_bad.inertia_), fontsize=14)

save_fig("bad_kmeans_plot")
plt.show()

Saving figure bad_kmeans_plot


# Download the ladybug image
images_path = os.path.join(PROJECT_ROOT_DIR, "images", "unsupervised_learning")
os.makedirs(images_path, exist_ok=True)
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"

filename = "ladybug.png"
print("Downloading", filename)

# Fetch the file over HTTP and store it under images_path with the same name
url = DOWNLOAD_ROOT + "images/unsupervised_learning/" + filename
urllib.request.urlretrieve(url, os.path.join(images_path, filename))

Downloading ladybug.png

('.\\images\\unsupervised_learning\\ladybug.png',
 <http.client.HTTPMessage at 0x2bfbf3f1370>)


from matplotlib.image import imread
image = imread(os.path.join(images_path, filename))

image.shape

(533, 800, 3)


X = image.reshape(-1, 3)
X.shape

(426400, 3)


kmeans = KMeans(n_clusters=8, random_state=42).fit(X)


kmeans.cluster_centers_

array([[0.6125396 , 0.38346282, 0.09190764],
       [0.0210796 , 0.10577151, 0.00556033],
       [0.9840446 , 0.9394471 , 0.02596566],
       [0.20604077, 0.37261233, 0.05183628],
       [0.60494506, 0.63328826, 0.3915232 ],
       [0.89790714, 0.7377076 , 0.03252882],
       [0.3422731 , 0.52337915, 0.15179527],
       [0.09099391, 0.2427601 , 0.01513617]], dtype=float32)


segmented_img = kmeans.cluster_centers_[kmeans.labels_]


segmented_img = segmented_img.reshape(image.shape)


segmented_imgs = []
n_colors = (10, 8, 6, 4, 2)

for n_clusters in n_colors:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    segmented_img = kmeans.cluster_centers_[kmeans.labels_]
    segmented_imgs.append(segmented_img.reshape(image.shape))


plt.figure(figsize=(10,5))
plt.subplots_adjust(wspace=0.05, hspace=0.1)

# Original image
plt.subplot(231)
plt.imshow(image)
plt.title("Original image")
plt.axis('off')

# The five colour-segmented images, in subplots 232..236
for idx, n_clusters in enumerate(n_colors):
    plt.subplot(232 + idx)
    plt.imshow(segmented_imgs[idx])
    plt.title("{} colors".format(n_clusters))
    plt.axis('off')

# tight_layout is disabled, so the spacing set by subplots_adjust above is not re-laid-out
save_fig('image_segmentation_diagram', tight_layout=False)
plt.show()

Saving figure image_segmentation_diagram


from sklearn.datasets import load_digits

X_digits, y_digits = load_digits(return_X_y=True)


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=42)


from sklearn.linear_model import LogisticRegression


log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg.fit(X_train, y_train)

LogisticRegression(max_iter=5000, multi_class='ovr', random_state=42)


log_reg.score(X_test, y_test)

0.9688888888888889


from sklearn.pipeline import Pipeline


pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=50, random_state=42)),
    ("log_reg", LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)),
])
pipeline.fit(X_train, y_train)

Pipeline(steps=[('kmeans', KMeans(n_clusters=50, random_state=42)),
                ('log_reg',
                 LogisticRegression(max_iter=5000, multi_class='ovr',
                                    random_state=42))])


pipeline.score(X_test, y_test)

0.98


# Relative drop in error rate obtained by adding the k-means step, computed
# from the fitted models rather than from hard-coded accuracies (the literal
# 0.977777 no longer matched the pipeline score of 0.98 reported above).
1 - (1 - pipeline.score(X_test, y_test)) / (1 - log_reg.score(X_test, y_test))

0.28570969400874346


from sklearn.model_selection import GridSearchCV


# Tune the number of clusters used by the pipeline's "kmeans" step: every
# value from 2 to 99 is evaluated with 3-fold cross-validation (98 candidates,
# 294 fits). verbose=2 prints one log line per fit.
param_grid = {"kmeans__n_clusters": range(2, 100)}
grid_clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, verbose=2)
grid_clf.fit(X_train, y_train)

Fitting 3 folds for each of 98 candidates, totalling 294 fits
[CV] kmeans__n_clusters=2 ............................................
[CV] ............................. kmeans__n_clusters=2, total=   0.1s
[CV] kmeans__n_clusters=2 ............................................

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s

[CV] ............................. kmeans__n_clusters=2, total=   0.1s
[CV] kmeans__n_clusters=2 ............................................
[CV] ............................. kmeans__n_clusters=2, total=   0.1s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.1s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.1s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.1s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.1s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.2s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.2s
[CV] kmeans__n_clusters=5 ............................................
[CV] ............................. kmeans__n_clusters=5, total=   0.2s
[CV] kmeans__n_clusters=5 ............................................
[CV] ............................. kmeans__n_clusters=5, total=   0.2s
[CV] kmeans__n_clusters=5 ............................................
[CV] ............................. kmeans__n_clusters=5, total=   0.2s
[CV] kmeans__n_clusters=6 ............................................
[CV] ............................. kmeans__n_clusters=6, total=   0.2s
[CV] kmeans__n_clusters=6 ............................................
[CV] ............................. kmeans__n_clusters=6, total=   0.2s
[CV] kmeans__n_clusters=6 ............................................
[CV] ............................. kmeans__n_clusters=6, total=   0.2s
[CV] kmeans__n_clusters=7 ............................................
[CV] ............................. kmeans__n_clusters=7, total=   0.2s
[CV] kmeans__n_clusters=7 ............................................
[CV] ............................. kmeans__n_clusters=7, total=   0.2s
[CV] kmeans__n_clusters=7 ............................................
[CV] ............................. kmeans__n_clusters=7, total=   0.3s
[CV] kmeans__n_clusters=8 ............................................
[CV] ............................. kmeans__n_clusters=8, total=   0.3s
[CV] kmeans__n_clusters=8 ............................................
[CV] ............................. kmeans__n_clusters=8, total=   0.3s
[CV] kmeans__n_clusters=8 ............................................
[CV] ............................. kmeans__n_clusters=8, total=   0.3s
[CV] kmeans__n_clusters=9 ............................................
[CV] ............................. kmeans__n_clusters=9, total=   0.4s
[CV] kmeans__n_clusters=9 ............................................
[CV] ............................. kmeans__n_clusters=9, total=   0.3s
[CV] kmeans__n_clusters=9 ............................................
[CV] ............................. kmeans__n_clusters=9, total=   0.4s
[CV] kmeans__n_clusters=10 ...........................................
[CV] ............................ kmeans__n_clusters=10, total=   0.5s
[CV] kmeans__n_clusters=10 ...........................................
[CV] ............................ kmeans__n_clusters=10, total=   0.5s
[CV] kmeans__n_clusters=10 ...........................................
[CV] ............................ kmeans__n_clusters=10, total=   0.5s
[CV] kmeans__n_clusters=11 ...........................................
[CV] ............................ kmeans__n_clusters=11, total=   0.6s
[CV] kmeans__n_clusters=11 ...........................................
[CV] ............................ kmeans__n_clusters=11, total=   0.6s
[CV] kmeans__n_clusters=11 ...........................................
[CV] ............................ kmeans__n_clusters=11, total=   0.5s
[CV] kmeans__n_clusters=12 ...........................................
[CV] ............................ kmeans__n_clusters=12, total=   0.7s
[CV] kmeans__n_clusters=12 ...........................................
[CV] ............................ kmeans__n_clusters=12, total=   0.7s
[CV] kmeans__n_clusters=12 ...........................................
[CV] ............................ kmeans__n_clusters=12, total=   0.8s
[CV] kmeans__n_clusters=13 ...........................................
[CV] ............................ kmeans__n_clusters=13, total=   0.8s
[CV] kmeans__n_clusters=13 ...........................................
[CV] ............................ kmeans__n_clusters=13, total=   0.8s
[CV] kmeans__n_clusters=13 ...........................................
[CV] ............................ kmeans__n_clusters=13, total=   0.9s
[CV] kmeans__n_clusters=14 ...........................................
[CV] ............................ kmeans__n_clusters=14, total=   0.9s
[CV] kmeans__n_clusters=14 ...........................................
[CV] ............................ kmeans__n_clusters=14, total=   1.0s
[CV] kmeans__n_clusters=14 ...........................................
[CV] ............................ kmeans__n_clusters=14, total=   0.9s
[CV] kmeans__n_clusters=15 ...........................................
[CV] ............................ kmeans__n_clusters=15, total=   1.1s
[CV] kmeans__n_clusters=15 ...........................................
[CV] ............................ kmeans__n_clusters=15, total=   1.0s
[CV] kmeans__n_clusters=15 ...........................................
[CV] ............................ kmeans__n_clusters=15, total=   1.0s
[CV] kmeans__n_clusters=16 ...........................................
[CV] ............................ kmeans__n_clusters=16, total=   1.2s
[CV] kmeans__n_clusters=16 ...........................................
[CV] ............................ kmeans__n_clusters=16, total=   1.1s
[CV] kmeans__n_clusters=16 ...........................................
[CV] ............................ kmeans__n_clusters=16, total=   1.1s
[CV] kmeans__n_clusters=17 ...........................................
[CV] ............................ kmeans__n_clusters=17, total=   1.4s
[CV] kmeans__n_clusters=17 ...........................................
[CV] ............................ kmeans__n_clusters=17, total=   1.2s
[CV] kmeans__n_clusters=17 ...........................................
[CV] ............................ kmeans__n_clusters=17, total=   1.2s
[CV] kmeans__n_clusters=18 ...........................................
[CV] ............................ kmeans__n_clusters=18, total=   1.4s
[CV] kmeans__n_clusters=18 ...........................................
[CV] ............................ kmeans__n_clusters=18, total=   1.3s
[CV] kmeans__n_clusters=18 ...........................................
[CV] ............................ kmeans__n_clusters=18, total=   1.4s
[CV] kmeans__n_clusters=19 ...........................................
[CV] ............................ kmeans__n_clusters=19, total=   1.5s
[CV] kmeans__n_clusters=19 ...........................................
[CV] ............................ kmeans__n_clusters=19, total=   1.3s
[CV] kmeans__n_clusters=19 ...........................................
[CV] ............................ kmeans__n_clusters=19, total=   1.3s
[CV] kmeans__n_clusters=20 ...........................................
[CV] ............................ kmeans__n_clusters=20, total=   1.6s
[CV] kmeans__n_clusters=20 ...........................................
[CV] ............................ kmeans__n_clusters=20, total=   1.4s
[CV] kmeans__n_clusters=20 ...........................................
[CV] ............................ kmeans__n_clusters=20, total=   1.5s
[CV] kmeans__n_clusters=21 ...........................................
[CV] ............................ kmeans__n_clusters=21, total=   1.7s
[CV] kmeans__n_clusters=21 ...........................................
[CV] ............................ kmeans__n_clusters=21, total=   1.7s
[CV] kmeans__n_clusters=21 ...........................................
[CV] ............................ kmeans__n_clusters=21, total=   1.6s
[CV] kmeans__n_clusters=22 ...........................................
[CV] ............................ kmeans__n_clusters=22, total=   1.6s
[CV] kmeans__n_clusters=22 ...........................................
[CV] ............................ kmeans__n_clusters=22, total=   1.8s
[CV] kmeans__n_clusters=22 ...........................................
[CV] ............................ kmeans__n_clusters=22, total=   1.7s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   1.8s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   1.9s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   1.7s
[CV] kmeans__n_clusters=24 ...........................................
[CV] ............................ kmeans__n_clusters=24, total=   1.9s
[CV] kmeans__n_clusters=24 ...........................................
[CV] ............................ kmeans__n_clusters=24, total=   1.8s
[CV] kmeans__n_clusters=24 ...........................................
[CV] ............................ kmeans__n_clusters=24, total=   1.9s
[CV] kmeans__n_clusters=25 ...........................................
[CV] ............................ kmeans__n_clusters=25, total=   1.8s
[CV] kmeans__n_clusters=25 ...........................................
[CV] ............................ kmeans__n_clusters=25, total=   2.0s
[CV] kmeans__n_clusters=25 ...........................................
[CV] ............................ kmeans__n_clusters=25, total=   1.8s
[CV] kmeans__n_clusters=26 ...........................................
[CV] ............................ kmeans__n_clusters=26, total=   2.1s
[CV] kmeans__n_clusters=26 ...........................................
[CV] ............................ kmeans__n_clusters=26, total=   2.0s
[CV] kmeans__n_clusters=26 ...........................................
[CV] ............................ kmeans__n_clusters=26, total=   1.9s
[CV] kmeans__n_clusters=27 ...........................................
[CV] ............................ kmeans__n_clusters=27, total=   2.0s
[CV] kmeans__n_clusters=27 ...........................................
[CV] ............................ kmeans__n_clusters=27, total=   2.1s
[CV] kmeans__n_clusters=27 ...........................................
[CV] ............................ kmeans__n_clusters=27, total=   2.0s
[CV] kmeans__n_clusters=28 ...........................................
[CV] ............................ kmeans__n_clusters=28, total=   2.2s
[CV] kmeans__n_clusters=28 ...........................................
[CV] ............................ kmeans__n_clusters=28, total=   2.0s
[CV] kmeans__n_clusters=28 ...........................................
[CV] ............................ kmeans__n_clusters=28, total=   2.2s
[CV] kmeans__n_clusters=29 ...........................................
[CV] ............................ kmeans__n_clusters=29, total=   2.5s
[CV] kmeans__n_clusters=29 ...........................................
[CV] ............................ kmeans__n_clusters=29, total=   2.3s
[CV] kmeans__n_clusters=29 ...........................................
[CV] ............................ kmeans__n_clusters=29, total=   2.0s
[CV] kmeans__n_clusters=30 ...........................................
[CV] ............................ kmeans__n_clusters=30, total=   2.1s
[CV] kmeans__n_clusters=30 ...........................................
[CV] ............................ kmeans__n_clusters=30, total=   2.1s
[CV] kmeans__n_clusters=30 ...........................................
[CV] ............................ kmeans__n_clusters=30, total=   1.9s
[CV] kmeans__n_clusters=31 ...........................................
[CV] ............................ kmeans__n_clusters=31, total=   2.2s
[CV] kmeans__n_clusters=31 ...........................................
[CV] ............................ kmeans__n_clusters=31, total=   2.2s
[CV] kmeans__n_clusters=31 ...........................................
[CV] ............................ kmeans__n_clusters=31, total=   2.0s
[CV] kmeans__n_clusters=32 ...........................................
[CV] ............................ kmeans__n_clusters=32, total=   2.5s
[CV] kmeans__n_clusters=32 ...........................................
[CV] ............................ kmeans__n_clusters=32, total=   2.1s
[CV] kmeans__n_clusters=32 ...........................................
[CV] ............................ kmeans__n_clusters=32, total=   2.2s
[CV] kmeans__n_clusters=33 ...........................................
[CV] ............................ kmeans__n_clusters=33, total=   2.2s
[CV] kmeans__n_clusters=33 ...........................................
[CV] ............................ kmeans__n_clusters=33, total=   2.3s
[CV] kmeans__n_clusters=33 ...........................................
[CV] ............................ kmeans__n_clusters=33, total=   2.2s
[CV] kmeans__n_clusters=34 ...........................................
[CV] ............................ kmeans__n_clusters=34, total=   2.4s
[CV] kmeans__n_clusters=34 ...........................................
[CV] ............................ kmeans__n_clusters=34, total=   2.4s
[CV] kmeans__n_clusters=34 ...........................................
[CV] ............................ kmeans__n_clusters=34, total=   2.2s
[CV] kmeans__n_clusters=35 ...........................................
[CV] ............................ kmeans__n_clusters=35, total=   2.3s
[CV] kmeans__n_clusters=35 ...........................................
[CV] ............................ kmeans__n_clusters=35, total=   2.4s
[CV] kmeans__n_clusters=35 ...........................................
[CV] ............................ kmeans__n_clusters=35, total=   2.1s
[CV] kmeans__n_clusters=36 ...........................................
[CV] ............................ kmeans__n_clusters=36, total=   2.3s
[CV] kmeans__n_clusters=36 ...........................................
[CV] ............................ kmeans__n_clusters=36, total=   2.5s
[CV] kmeans__n_clusters=36 ...........................................
[CV] ............................ kmeans__n_clusters=36, total=   2.3s
[CV] kmeans__n_clusters=37 ...........................................
[CV] ............................ kmeans__n_clusters=37, total=   2.7s
[CV] kmeans__n_clusters=37 ...........................................
[CV] ............................ kmeans__n_clusters=37, total=   2.3s
[CV] kmeans__n_clusters=37 ...........................................
[CV] ............................ kmeans__n_clusters=37, total=   2.5s
[CV] kmeans__n_clusters=38 ...........................................
[CV] ............................ kmeans__n_clusters=38, total=   2.2s
[CV] kmeans__n_clusters=38 ...........................................
[CV] ............................ kmeans__n_clusters=38, total=   2.1s
[CV] kmeans__n_clusters=38 ...........................................
[CV] ............................ kmeans__n_clusters=38, total=   2.3s
[CV] kmeans__n_clusters=39 ...........................................
[CV] ............................ kmeans__n_clusters=39, total=   2.3s
[CV] kmeans__n_clusters=39 ...........................................
[CV] ............................ kmeans__n_clusters=39, total=   2.5s
[CV] kmeans__n_clusters=39 ...........................................
[CV] ............................ kmeans__n_clusters=39, total=   2.1s
[CV] kmeans__n_clusters=40 ...........................................
[CV] ............................ kmeans__n_clusters=40, total=   2.2s
[CV] kmeans__n_clusters=40 ...........................................
[CV] ............................ kmeans__n_clusters=40, total=   2.5s
[CV] kmeans__n_clusters=40 ...........................................
[CV] ............................ kmeans__n_clusters=40, total=   2.4s
[CV] kmeans__n_clusters=41 ...........................................
[CV] ............................ kmeans__n_clusters=41, total=   2.3s
[CV] kmeans__n_clusters=41 ...........................................
[CV] ............................ kmeans__n_clusters=41, total=   2.3s
[CV] kmeans__n_clusters=41 ...........................................
[CV] ............................ kmeans__n_clusters=41, total=   2.3s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   2.3s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   2.5s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   2.4s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   2.6s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   2.7s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   2.4s
[CV] kmeans__n_clusters=44 ...........................................
[CV] ............................ kmeans__n_clusters=44, total=   2.8s
[CV] kmeans__n_clusters=44 ...........................................
[CV] ............................ kmeans__n_clusters=44, total=   2.6s
[CV] kmeans__n_clusters=44 ...........................................
[CV] ............................ kmeans__n_clusters=44, total=   2.4s
[CV] kmeans__n_clusters=45 ...........................................
[CV] ............................ kmeans__n_clusters=45, total=   2.5s
[CV] kmeans__n_clusters=45 ...........................................
[CV] ............................ kmeans__n_clusters=45, total=   2.6s
[CV] kmeans__n_clusters=45 ...........................................
[CV] ............................ kmeans__n_clusters=45, total=   2.7s
[CV] kmeans__n_clusters=46 ...........................................
[CV] ............................ kmeans__n_clusters=46, total=   2.4s
[CV] kmeans__n_clusters=46 ...........................................
[CV] ............................ kmeans__n_clusters=46, total=   2.6s
[CV] kmeans__n_clusters=46 ...........................................
[CV] ............................ kmeans__n_clusters=46, total=   2.6s
[CV] kmeans__n_clusters=47 ...........................................
[CV] ............................ kmeans__n_clusters=47, total=   2.5s
[CV] kmeans__n_clusters=47 ...........................................
[CV] ............................ kmeans__n_clusters=47, total=   2.4s
[CV] kmeans__n_clusters=47 ...........................................
[CV] ............................ kmeans__n_clusters=47, total=   2.8s
[CV] kmeans__n_clusters=48 ...........................................
[CV] ............................ kmeans__n_clusters=48, total=   2.4s
[CV] kmeans__n_clusters=48 ...........................................
[CV] ............................ kmeans__n_clusters=48, total=   2.7s
[CV] kmeans__n_clusters=48 ...........................................
[CV] ............................ kmeans__n_clusters=48, total=   2.7s
[CV] kmeans__n_clusters=49 ...........................................
[CV] ............................ kmeans__n_clusters=49, total=   2.5s
[CV] kmeans__n_clusters=49 ...........................................
[CV] ............................ kmeans__n_clusters=49, total=   2.5s
[CV] kmeans__n_clusters=49 ...........................................
[CV] ............................ kmeans__n_clusters=49, total=   2.8s
[CV] kmeans__n_clusters=50 ...........................................
[CV] ............................ kmeans__n_clusters=50, total=   2.6s
[CV] kmeans__n_clusters=50 ...........................................
[CV] ............................ kmeans__n_clusters=50, total=   2.5s
[CV] kmeans__n_clusters=50 ...........................................
[CV] ............................ kmeans__n_clusters=50, total=   2.6s
[CV] kmeans__n_clusters=51 ...........................................
[CV] ............................ kmeans__n_clusters=51, total=   2.7s
[CV] kmeans__n_clusters=51 ...........................................
[CV] ............................ kmeans__n_clusters=51, total=   2.6s
[CV] kmeans__n_clusters=51 ...........................................
[CV] ............................ kmeans__n_clusters=51, total=   2.9s
[CV] kmeans__n_clusters=52 ...........................................
[CV] ............................ kmeans__n_clusters=52, total=   2.5s
[CV] kmeans__n_clusters=52 ...........................................
[CV] ............................ kmeans__n_clusters=52, total=   2.6s
[CV] kmeans__n_clusters=52 ...........................................
[CV] ............................ kmeans__n_clusters=52, total=   2.7s
[CV] kmeans__n_clusters=53 ...........................................
[CV] ............................ kmeans__n_clusters=53, total=   2.7s
[CV] kmeans__n_clusters=53 ...........................................
[CV] ............................ kmeans__n_clusters=53, total=   2.6s
[CV] kmeans__n_clusters=53 ...........................................
[CV] ............................ kmeans__n_clusters=53, total=   2.7s
[CV] kmeans__n_clusters=54 ...........................................
[CV] ............................ kmeans__n_clusters=54, total=   2.5s
[CV] kmeans__n_clusters=54 ...........................................
[CV] ............................ kmeans__n_clusters=54, total=   2.5s
[CV] kmeans__n_clusters=54 ...........................................
[CV] ............................ kmeans__n_clusters=54, total=   3.0s
[CV] kmeans__n_clusters=55 ...........................................
[CV] ............................ kmeans__n_clusters=55, total=   2.4s
[CV] kmeans__n_clusters=55 ...........................................
[CV] ............................ kmeans__n_clusters=55, total=   3.0s
[CV] kmeans__n_clusters=55 ...........................................
[CV] ............................ kmeans__n_clusters=55, total=   2.8s
[CV] kmeans__n_clusters=56 ...........................................
[CV] ............................ kmeans__n_clusters=56, total=   2.8s
[CV] kmeans__n_clusters=56 ...........................................
[CV] ............................ kmeans__n_clusters=56, total=   2.7s
[CV] kmeans__n_clusters=56 ...........................................
[CV] ............................ kmeans__n_clusters=56, total=   2.7s
[CV] kmeans__n_clusters=57 ...........................................
[CV] ............................ kmeans__n_clusters=57, total=   2.6s
[CV] kmeans__n_clusters=57 ...........................................
[CV] ............................ kmeans__n_clusters=57, total=   3.2s
[CV] kmeans__n_clusters=57 ...........................................
[CV] ............................ kmeans__n_clusters=57, total=   2.7s
[CV] kmeans__n_clusters=58 ...........................................
[CV] ............................ kmeans__n_clusters=58, total=   2.8s
[CV] kmeans__n_clusters=58 ...........................................
[CV] ............................ kmeans__n_clusters=58, total=   2.7s
[CV] kmeans__n_clusters=58 ...........................................
[CV] ............................ kmeans__n_clusters=58, total=   2.7s
[CV] kmeans__n_clusters=59 ...........................................
[CV] ............................ kmeans__n_clusters=59, total=   2.8s
[CV] kmeans__n_clusters=59 ...........................................
[CV] ............................ kmeans__n_clusters=59, total=   2.8s
[CV] kmeans__n_clusters=59 ...........................................
[CV] ............................ kmeans__n_clusters=59, total=   2.8s
[CV] kmeans__n_clusters=60 ...........................................
[CV] ............................ kmeans__n_clusters=60, total=   2.9s
[CV] kmeans__n_clusters=60 ...........................................
[CV] ............................ kmeans__n_clusters=60, total=   3.0s
[CV] kmeans__n_clusters=60 ...........................................
[CV] ............................ kmeans__n_clusters=60, total=   2.9s
[CV] kmeans__n_clusters=61 ...........................................
[CV] ............................ kmeans__n_clusters=61, total=   3.0s
[CV] kmeans__n_clusters=61 ...........................................
[CV] ............................ kmeans__n_clusters=61, total=   3.0s
[CV] kmeans__n_clusters=61 ...........................................
[CV] ............................ kmeans__n_clusters=61, total=   2.4s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   2.9s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   3.0s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   2.7s
[CV] kmeans__n_clusters=63 ...........................................
[CV] ............................ kmeans__n_clusters=63, total=   3.0s
[CV] kmeans__n_clusters=63 ...........................................
[CV] ............................ kmeans__n_clusters=63, total=   3.0s
[CV] kmeans__n_clusters=63 ...........................................
[CV] ............................ kmeans__n_clusters=63, total=   2.8s
[CV] kmeans__n_clusters=64 ...........................................
[CV] ............................ kmeans__n_clusters=64, total=   3.3s
[CV] kmeans__n_clusters=64 ...........................................
[CV] ............................ kmeans__n_clusters=64, total=   3.3s
[CV] kmeans__n_clusters=64 ...........................................
[CV] ............................ kmeans__n_clusters=64, total=   2.7s
[CV] kmeans__n_clusters=65 ...........................................
[CV] ............................ kmeans__n_clusters=65, total=   2.8s
[CV] kmeans__n_clusters=65 ...........................................
[CV] ............................ kmeans__n_clusters=65, total=   3.0s
[CV] kmeans__n_clusters=65 ...........................................
[CV] ............................ kmeans__n_clusters=65, total=   2.7s
[CV] kmeans__n_clusters=66 ...........................................
[CV] ............................ kmeans__n_clusters=66, total=   3.2s
[CV] kmeans__n_clusters=66 ...........................................
[CV] ............................ kmeans__n_clusters=66, total=   2.8s
[CV] kmeans__n_clusters=66 ...........................................
[CV] ............................ kmeans__n_clusters=66, total=   2.6s
[CV] kmeans__n_clusters=67 ...........................................
[CV] ............................ kmeans__n_clusters=67, total=   2.7s
[CV] kmeans__n_clusters=67 ...........................................
[CV] ............................ kmeans__n_clusters=67, total=   2.7s
[CV] kmeans__n_clusters=67 ...........................................
[CV] ............................ kmeans__n_clusters=67, total=   2.8s
[CV] kmeans__n_clusters=68 ...........................................
[CV] ............................ kmeans__n_clusters=68, total=   2.9s
[CV] kmeans__n_clusters=68 ...........................................
[CV] ............................ kmeans__n_clusters=68, total=   2.8s
[CV] kmeans__n_clusters=68 ...........................................
[CV] ............................ kmeans__n_clusters=68, total=   2.8s
[CV] kmeans__n_clusters=69 ...........................................
[CV] ............................ kmeans__n_clusters=69, total=   2.8s
[CV] kmeans__n_clusters=69 ...........................................
[CV] ............................ kmeans__n_clusters=69, total=   2.8s
[CV] kmeans__n_clusters=69 ...........................................
[CV] ............................ kmeans__n_clusters=69, total=   3.0s
[CV] kmeans__n_clusters=70 ...........................................
[CV] ............................ kmeans__n_clusters=70, total=   2.6s
[CV] kmeans__n_clusters=70 ...........................................
[CV] ............................ kmeans__n_clusters=70, total=   2.8s
[CV] kmeans__n_clusters=70 ...........................................
[CV] ............................ kmeans__n_clusters=70, total=   2.9s
[CV] kmeans__n_clusters=71 ...........................................
[CV] ............................ kmeans__n_clusters=71, total=   2.6s
[CV] kmeans__n_clusters=71 ...........................................
[CV] ............................ kmeans__n_clusters=71, total=   3.3s
[CV] kmeans__n_clusters=71 ...........................................
[CV] ............................ kmeans__n_clusters=71, total=   2.9s
[CV] kmeans__n_clusters=72 ...........................................
[CV] ............................ kmeans__n_clusters=72, total=   2.6s
[CV] kmeans__n_clusters=72 ...........................................
[CV] ............................ kmeans__n_clusters=72, total=   2.6s
[CV] kmeans__n_clusters=72 ...........................................
[CV] ............................ kmeans__n_clusters=72, total=   2.8s
[CV] kmeans__n_clusters=73 ...........................................
[CV] ............................ kmeans__n_clusters=73, total=   2.6s
[CV] kmeans__n_clusters=73 ...........................................
[CV] ............................ kmeans__n_clusters=73, total=   2.8s
[CV] kmeans__n_clusters=73 ...........................................
[CV] ............................ kmeans__n_clusters=73, total=   2.6s
[CV] kmeans__n_clusters=74 ...........................................
[CV] ............................ kmeans__n_clusters=74, total=   2.8s
[CV] kmeans__n_clusters=74 ...........................................
[CV] ............................ kmeans__n_clusters=74, total=   3.2s
[CV] kmeans__n_clusters=74 ...........................................
[CV] ............................ kmeans__n_clusters=74, total=   2.9s
[CV] kmeans__n_clusters=75 ...........................................
[CV] ............................ kmeans__n_clusters=75, total=   2.7s
[CV] kmeans__n_clusters=75 ...........................................
[CV] ............................ kmeans__n_clusters=75, total=   3.0s
[CV] kmeans__n_clusters=75 ...........................................
[CV] ............................ kmeans__n_clusters=75, total=   2.7s
[CV] kmeans__n_clusters=76 ...........................................
[CV] ............................ kmeans__n_clusters=76, total=   2.4s
[CV] kmeans__n_clusters=76 ...........................................
[CV] ............................ kmeans__n_clusters=76, total=   2.8s
[CV] kmeans__n_clusters=76 ...........................................
[CV] ............................ kmeans__n_clusters=76, total=   2.8s
[CV] kmeans__n_clusters=77 ...........................................
[CV] ............................ kmeans__n_clusters=77, total=   2.6s
[CV] kmeans__n_clusters=77 ...........................................
[CV] ............................ kmeans__n_clusters=77, total=   3.0s
[CV] kmeans__n_clusters=77 ...........................................
[CV] ............................ kmeans__n_clusters=77, total=   3.1s
[CV] kmeans__n_clusters=78 ...........................................
[CV] ............................ kmeans__n_clusters=78, total=   2.7s
[CV] kmeans__n_clusters=78 ...........................................
[CV] ............................ kmeans__n_clusters=78, total=   2.8s
[CV] kmeans__n_clusters=78 ...........................................
[CV] ............................ kmeans__n_clusters=78, total=   2.7s
[CV] kmeans__n_clusters=79 ...........................................
[CV] ............................ kmeans__n_clusters=79, total=   2.9s
[CV] kmeans__n_clusters=79 ...........................................
[CV] ............................ kmeans__n_clusters=79, total=   2.8s
[CV] kmeans__n_clusters=79 ...........................................
[CV] ............................ kmeans__n_clusters=79, total=   3.0s
[CV] kmeans__n_clusters=80 ...........................................
[CV] ............................ kmeans__n_clusters=80, total=   2.7s
[CV] kmeans__n_clusters=80 ...........................................
[CV] ............................ kmeans__n_clusters=80, total=   3.0s
[CV] kmeans__n_clusters=80 ...........................................
[CV] ............................ kmeans__n_clusters=80, total=   2.6s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   2.4s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   2.7s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   2.8s
[CV] kmeans__n_clusters=82 ...........................................
[CV] ............................ kmeans__n_clusters=82, total=   2.5s
[CV] kmeans__n_clusters=82 ...........................................
[CV] ............................ kmeans__n_clusters=82, total=   2.9s
[CV] kmeans__n_clusters=82 ...........................................
[CV] ............................ kmeans__n_clusters=82, total=   2.9s
[CV] kmeans__n_clusters=83 ...........................................
[CV] ............................ kmeans__n_clusters=83, total=   2.7s
[CV] kmeans__n_clusters=83 ...........................................
[CV] ............................ kmeans__n_clusters=83, total=   2.8s
[CV] kmeans__n_clusters=83 ...........................................
[CV] ............................ kmeans__n_clusters=83, total=   2.9s
[CV] kmeans__n_clusters=84 ...........................................
[CV] ............................ kmeans__n_clusters=84, total=   2.7s
[CV] kmeans__n_clusters=84 ...........................................
[CV] ............................ kmeans__n_clusters=84, total=   2.8s
[CV] kmeans__n_clusters=84 ...........................................
[CV] ............................ kmeans__n_clusters=84, total=   2.8s
[CV] kmeans__n_clusters=85 ...........................................
[CV] ............................ kmeans__n_clusters=85, total=   2.6s
[CV] kmeans__n_clusters=85 ...........................................
[CV] ............................ kmeans__n_clusters=85, total=   3.1s
[CV] kmeans__n_clusters=85 ...........................................
[CV] ............................ kmeans__n_clusters=85, total=   2.8s
[CV] kmeans__n_clusters=86 ...........................................
[CV] ............................ kmeans__n_clusters=86, total=   2.8s
[CV] kmeans__n_clusters=86 ...........................................
[CV] ............................ kmeans__n_clusters=86, total=   2.9s
[CV] kmeans__n_clusters=86 ...........................................
[CV] ............................ kmeans__n_clusters=86, total=   2.7s
[CV] kmeans__n_clusters=87 ...........................................
[CV] ............................ kmeans__n_clusters=87, total=   2.8s
[CV] kmeans__n_clusters=87 ...........................................
[CV] ............................ kmeans__n_clusters=87, total=   2.8s
[CV] kmeans__n_clusters=87 ...........................................
[CV] ............................ kmeans__n_clusters=87, total=   2.9s
[CV] kmeans__n_clusters=88 ...........................................
[CV] ............................ kmeans__n_clusters=88, total=   2.8s
[CV] kmeans__n_clusters=88 ...........................................
[CV] ............................ kmeans__n_clusters=88, total=   2.6s
[CV] kmeans__n_clusters=88 ...........................................
[CV] ............................ kmeans__n_clusters=88, total=   3.0s
[CV] kmeans__n_clusters=89 ...........................................
[CV] ............................ kmeans__n_clusters=89, total=   2.9s
[CV] kmeans__n_clusters=89 ...........................................
[CV] ............................ kmeans__n_clusters=89, total=   3.0s
[CV] kmeans__n_clusters=89 ...........................................
[CV] ............................ kmeans__n_clusters=89, total=   2.8s
[CV] kmeans__n_clusters=90 ...........................................
[CV] ............................ kmeans__n_clusters=90, total=   3.3s
[CV] kmeans__n_clusters=90 ...........................................
[CV] ............................ kmeans__n_clusters=90, total=   2.8s
[CV] kmeans__n_clusters=90 ...........................................
[CV] ............................ kmeans__n_clusters=90, total=   2.9s
[CV] kmeans__n_clusters=91 ...........................................
[CV] ............................ kmeans__n_clusters=91, total=   2.8s
[CV] kmeans__n_clusters=91 ...........................................
[CV] ............................ kmeans__n_clusters=91, total=   3.0s
[CV] kmeans__n_clusters=91 ...........................................
[CV] ............................ kmeans__n_clusters=91, total=   3.0s
[CV] kmeans__n_clusters=92 ...........................................
[CV] ............................ kmeans__n_clusters=92, total=   2.6s
[CV] kmeans__n_clusters=92 ...........................................
[CV] ............................ kmeans__n_clusters=92, total=   2.6s
[CV] kmeans__n_clusters=92 ...........................................
[CV] ............................ kmeans__n_clusters=92, total=   2.7s
[CV] kmeans__n_clusters=93 ...........................................
[CV] ............................ kmeans__n_clusters=93, total=   2.7s
[CV] kmeans__n_clusters=93 ...........................................
[CV] ............................ kmeans__n_clusters=93, total=   2.9s
[CV] kmeans__n_clusters=93 ...........................................
[CV] ............................ kmeans__n_clusters=93, total=   2.6s
[CV] kmeans__n_clusters=94 ...........................................
[CV] ............................ kmeans__n_clusters=94, total=   2.7s
[CV] kmeans__n_clusters=94 ...........................................
[CV] ............................ kmeans__n_clusters=94, total=   3.1s
[CV] kmeans__n_clusters=94 ...........................................
[CV] ............................ kmeans__n_clusters=94, total=   2.5s
[CV] kmeans__n_clusters=95 ...........................................
[CV] ............................ kmeans__n_clusters=95, total=   2.8s
[CV] kmeans__n_clusters=95 ...........................................
[CV] ............................ kmeans__n_clusters=95, total=   2.8s
[CV] kmeans__n_clusters=95 ...........................................
[CV] ............................ kmeans__n_clusters=95, total=   2.9s
[CV] kmeans__n_clusters=96 ...........................................
[CV] ............................ kmeans__n_clusters=96, total=   2.7s
[CV] kmeans__n_clusters=96 ...........................................
[CV] ............................ kmeans__n_clusters=96, total=   2.7s
[CV] kmeans__n_clusters=96 ...........................................
[CV] ............................ kmeans__n_clusters=96, total=   3.1s
[CV] kmeans__n_clusters=97 ...........................................
[CV] ............................ kmeans__n_clusters=97, total=   2.9s
[CV] kmeans__n_clusters=97 ...........................................
[CV] ............................ kmeans__n_clusters=97, total=   3.1s
[CV] kmeans__n_clusters=97 ...........................................
[CV] ............................ kmeans__n_clusters=97, total=   2.6s
[CV] kmeans__n_clusters=98 ...........................................
[CV] ............................ kmeans__n_clusters=98, total=   2.8s
[CV] kmeans__n_clusters=98 ...........................................
[CV] ............................ kmeans__n_clusters=98, total=   2.8s
[CV] kmeans__n_clusters=98 ...........................................
[CV] ............................ kmeans__n_clusters=98, total=   3.1s
[CV] kmeans__n_clusters=99 ...........................................
[CV] ............................ kmeans__n_clusters=99, total=   2.6s
[CV] kmeans__n_clusters=99 ...........................................
[CV] ............................ kmeans__n_clusters=99, total=   2.7s
[CV] kmeans__n_clusters=99 ...........................................
[CV] ............................ kmeans__n_clusters=99, total=   2.7s

[Parallel(n_jobs=1)]: Done 294 out of 294 | elapsed: 10.9min finished

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('kmeans',
                                        KMeans(n_clusters=50, random_state=42)),
                                       ('log_reg',
                                        LogisticRegression(max_iter=5000,
                                                           multi_class='ovr',
                                                           random_state=42))]),
             param_grid={'kmeans__n_clusters': range(2, 100)}, verbose=2)


grid_clf.best_params_

{'kmeans__n_clusters': 57}


grid_clf.score(X_test, y_test)

0.98


n_labeled = 50


log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", random_state=42)
log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])

log_reg.score(X_test, y_test)

0.8333333333333334


k = 50

kmeans = KMeans(n_clusters=k, random_state=42)
X_digits_dist = kmeans.fit_transform(X_train)


# X_digits_dist comes from kmeans.fit_transform(X_train): one column per cluster.
# Taking argmin along axis 0 gives, for each of the k=50 clusters, the index of the
# training sample with the smallest value in that cluster's column.
representative_digit_idx = np.argmin(X_digits_dist, axis=0)  # indices of the 50 representative images
X_representative_digits = X_train[representative_digit_idx]  # the 50 representative images themselves


plt.figure(figsize=(8, 2))

for index, X_representative_digit in enumerate(X_representative_digits):
    plt.subplot(k // 10, 10, index + 1)
    plt.imshow(X_representative_digit.reshape(8, 8), cmap="binary", interpolation="bilinear")
    plt.axis('off')

save_fig("representative_images_diagram", tight_layout=False)
plt.show()

Saving figure representative_images_diagram


y_train[representative_digit_idx]

array([0, 1, 3, 2, 7, 6, 4, 6, 9, 5, 1, 2, 9, 5, 2, 7, 8, 1, 8, 6, 3, 1,
       5, 4, 5, 4, 0, 3, 2, 6, 1, 7, 7, 9, 1, 8, 6, 5, 4, 8, 5, 3, 3, 6,
       7, 9, 7, 8, 4, 9])


y_representative_digits = y_train[representative_digit_idx]


log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg.fit(X_representative_digits, y_representative_digits)
log_reg.score(X_test, y_test)

0.9244444444444444


# Propagate labels: every training sample receives the label of the
# representative digit of the cluster it was assigned to.
y_train_propagated = np.empty(len(X_train), dtype=np.int32)

for i in range(k):
    # All samples assigned to cluster i get the label of representative i.
    y_train_propagated[kmeans.labels_==i] = y_representative_digits[i]


log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg.fit(X_train, y_train_propagated)

log_reg.score(X_test, y_test)

0.9377777777777778


X_cluster_dist = X_digits_dist[np.arange(len(X_train)), kmeans.labels_]


# Keep only the samples closest to their centroid in each cluster; every other
# sample is flagged by overwriting its entry in X_cluster_dist with -1.
percentile_closest = 20

for i in range(k):
    in_cluster = (kmeans.labels_ == i)                 # samples assigned to cluster i
    cluster_dist = X_cluster_dist[in_cluster]
    cutoff_distance = np.percentile(cluster_dist, percentile_closest)   # 20th-percentile value of X_cluster_dist within cluster i
    above_cutoff = (X_cluster_dist > cutoff_distance)  # samples whose value exceeds the cutoff, i.e. NOT among the closest 20%
    X_cluster_dist[in_cluster & above_cutoff] = -1     # -1 is the sentinel for "do not propagate a label to this sample"


partially_propagated = (X_cluster_dist != -1)
X_train_partially_propagated = X_train[partially_propagated]
y_train_partially_propagated = y_train_propagated[partially_propagated]


log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)

log_reg.score(X_test, y_test)

0.9222222222222223


np.mean(y_train_partially_propagated == y_train[partially_propagated])

0.9896193771626297


np.mean(y_train_propagated == y_train)

0.9435783221974758


from sklearn.datasets import make_moons


X, y = make_moons(n_samples=1000, noise=0.05, random_state=42)


from sklearn.cluster import DBSCAN


dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)

DBSCAN(eps=0.05)


np.unique(dbscan.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6], dtype=int64)


dbscan.labels_[:10]

array([ 0,  2, -1, -1,  1,  0,  0,  0,  2,  5], dtype=int64)


len(dbscan.core_sample_indices_)

808


dbscan.core_sample_indices_[:10]

array([ 0,  4,  5,  6,  7,  8, 10, 11, 12, 13], dtype=int64)


dbscan.components_[:3]

array([[-0.02137124,  0.40618608],
       [-0.84192557,  0.53058695],
       [ 0.58930337, -0.32137599]])


dbscan2 = DBSCAN(eps=0.2)
dbscan2.fit(X)

DBSCAN(eps=0.2)


np.unique(dbscan2.labels_)

array([0, 1], dtype=int64)


def plot_dbscan(dbscan, X, size, show_xlabels=True, show_ylabels=True):
    """Scatter-plot a fitted DBSCAN model on the current matplotlib axes.

    Core samples are drawn twice (a large 'o' marker of area ``size`` and a
    small '*' marker), anomalies (label -1) as red 'x' markers, and the
    remaining non-core, non-anomaly samples as black dots.

    Parameters
    ----------
    dbscan : fitted estimator exposing ``labels_``, ``core_sample_indices_``,
        ``components_``, ``eps`` and ``min_samples``.
    X : array indexed as ``X[mask]`` and ``[:, 0]`` / ``[:, 1]``; the data the
        model was fitted on.
    size : marker size (``s``) for the large core-sample markers.
    show_xlabels, show_ylabels : when False, the corresponding axis label is
        omitted and its tick labels are hidden.
    """
    labels = dbscan.labels_

    # Boolean masks over the training samples: core / anomaly / everything else.
    is_core = np.zeros_like(labels, dtype=bool)
    is_core[dbscan.core_sample_indices_] = True
    is_anomaly = labels == -1
    is_other = ~is_core & ~is_anomaly

    core_points = dbscan.components_
    core_labels = labels[is_core]
    anomaly_points = X[is_anomaly]
    other_points = X[is_other]

    # Core samples: a large coloured disc plus a small star on each sample.
    plt.scatter(core_points[:, 0], core_points[:, 1],
                c=core_labels, marker='o', s=size, cmap="Paired")
    plt.scatter(core_points[:, 0], core_points[:, 1],
                marker='*', s=20, c=core_labels)

    # Anomalies: red crosses.
    plt.scatter(anomaly_points[:, 0], anomaly_points[:, 1],
                c="r", marker="x", s=100)

    # Remaining samples: plain black dots so they stand out from the cores.
    plt.scatter(other_points[:, 0], other_points[:, 1], c="k", marker=".")

    if not show_xlabels:
        plt.tick_params(labelbottom=False)
    else:
        plt.xlabel("$x_1$", fontsize=14)

    if not show_ylabels:
        plt.tick_params(labelleft=False)
    else:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)

    plt.title("eps={:.2f}, min_samples={}".format(dbscan.eps, dbscan.min_samples), fontsize=14)


# Number of core samples
len(dbscan.components_)

808


# Number of anomalies (samples labelled -1)
(dbscan.labels_==-1).sum()

77


plt.figure(figsize=(9, 3.2))

# Left plot: the model fitted with eps=0.05
plt.subplot(121)
plot_dbscan(dbscan, X, size=100)

# Right plot: the model fitted with eps=0.2
plt.subplot(122)
plot_dbscan(dbscan2, X, size=600, show_ylabels=False)

save_fig("dbscan_plot")
plt.show()

Saving figure dbscan_plot


dbscan = dbscan2


from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, dbscan.labels_[dbscan.core_sample_indices_])

KNeighborsClassifier(n_neighbors=50)


X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])
knn.predict(X_new)

array([1, 0, 1, 0], dtype=int64)


knn.predict_proba(X_new)

array([[0.18, 0.82],
       [1.  , 0.  ],
       [0.12, 0.88],
       [1.  , 0.  ]])


plt.figure(figsize=(6, 3))

# Draw the decision boundaries of the knn classifier
# (plot_decision_boundaries is defined outside this section of the file).
plot_decision_boundaries(knn, X, show_centroids=False)

# Mark the four new samples with blue '+' symbols
plt.scatter(X_new[:, 0], X_new[:, 1], c="b", marker="+", s=200, zorder=10)

save_fig("cluster_classification_plot")
plt.show()

Saving figure cluster_classification_plot


# Look up the single nearest training point of the knn model for each new sample.
# knn was fitted on dbscan.components_ (the core samples only), so the indices
# returned by kneighbors() are positions within the core-sample array, NOT
# within the full training set. The labels must therefore be restricted to the
# core samples before indexing; indexing dbscan.labels_ directly would read the
# label of an unrelated sample whenever some samples are not core samples.
y_dist, y_pred_idx = knn.kneighbors(X_new, n_neighbors=1)
y_pred = dbscan.labels_[dbscan.core_sample_indices_][y_pred_idx]   # (cluster) label of the nearest core sample
y_pred[y_dist > 0.2] = -1             # farther than 0.2 from the nearest core sample: treat as an anomaly
y_pred.ravel()

array([-1,  0,  1, -1], dtype=int64)


# Training set
data, _ = make_blobs(1000, centers=5)

# Train an HDBSCAN model
from hdbscan import HDBSCAN

clusterer = HDBSCAN(min_cluster_size=10, gen_min_span_tree=True)
clusterer.fit(data)

# Distinct cluster labels that were produced
np.unique(clusterer.labels_)

array([-1,  0,  1,  2,  3,  4], dtype=int64)


from sklearn.cluster import SpectralClustering


sc1 = SpectralClustering(n_clusters=2, gamma=100, random_state=42)
sc1.fit(X)

SpectralClustering(gamma=100, n_clusters=2, random_state=42)


sc2 = SpectralClustering(n_clusters=2, gamma=1, random_state=42)
sc2.fit(X)

SpectralClustering(gamma=1, n_clusters=2, random_state=42)


def plot_spectral_clustering(sc, X, size, alpha, show_xlabels=True, show_ylabels=True):
    """Scatter-plot a fitted spectral clustering model on the current axes.

    Each sample is drawn three times: a large translucent gray disc of area
    ``size`` and opacity ``alpha``, a white disc on top of it, and finally a
    small dot coloured by the sample's cluster label.

    Parameters
    ----------
    sc : fitted estimator exposing ``labels_`` and ``gamma``.
    X : array indexed as ``[:, 0]`` / ``[:, 1]``; the data ``sc`` was fitted on.
    size : marker size (``s``) of the gray background discs.
    alpha : opacity of the gray background discs.
    show_xlabels, show_ylabels : when False, the corresponding axis label is
        omitted and its tick labels are hidden.
    """
    # The background discs use a single literal colour, so no colormap is
    # passed: with c='gray' a cmap would be ignored and matplotlib warns about it.
    plt.scatter(X[:, 0], X[:, 1], marker='o', s=size, c='gray', alpha=alpha)
    plt.scatter(X[:, 0], X[:, 1], marker='o', s=30, c='w')
    # Only this layer is colour-mapped: one colour per cluster label.
    plt.scatter(X[:, 0], X[:, 1], marker='.', s=10, c=sc.labels_, cmap="Paired")

    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom=False)

    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)

    plt.title("RBF gamma={}".format(sc.gamma), fontsize=14)


plt.figure(figsize=(9, 3.2))

plt.subplot(121)
plot_spectral_clustering(sc1, X, size=500, alpha=0.1)

plt.subplot(122)
plot_spectral_clustering(sc2, X, size=4000, alpha=0.01, show_ylabels=False)

plt.show()


from sklearn.cluster import AgglomerativeClustering


X = np.array([0, 2, 5, 8.5]).reshape(-1, 1)
X

array([[0. ],
       [2. ],
       [5. ],
       [8.5]])


agg = AgglomerativeClustering(linkage="complete").fit(X)


def learned_parameters(estimator):
    """Return the public attribute names of *estimator* that end with "_".

    Names are taken from ``dir(estimator)`` in the order it yields them;
    anything starting with an underscore (private or dunder) is excluded.
    """
    names = []
    for name in dir(estimator):
        if name.startswith("_"):
            continue
        if name.endswith("_"):
            names.append(name)
    return names


learned_parameters(agg)

['children_',
 'labels_',
 'n_clusters_',
 'n_connected_components_',
 'n_features_in_',
 'n_leaves_']


agg.children_

array([[0, 1],
       [2, 3],
       [4, 5]])


agg.labels_

array([1, 1, 0, 0], dtype=int64)


# Two clusters, then transformed by a fixed 2x2 matrix
X1, y1 = make_blobs(n_samples=1000, centers=((4, -4), (0, 0)), random_state=42)
X1 = X1.dot(np.array([[0.374, 0.95], [0.732, 0.598]]))

# Third cluster, shifted by the offset [6, -8]
X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + [6, -8]

# Concatenate both parts into a single dataset
X = np.r_[X1, X2]
y = np.r_[y1, y2]


from sklearn.mixture import GaussianMixture


gm = GaussianMixture(n_components=3, n_init=10, random_state=42)
gm.fit(X)

GaussianMixture(n_components=3, n_init=10, random_state=42)


gm.weights_

array([0.39054348, 0.2093669 , 0.40008962])


gm.means_

array([[ 0.05224874,  0.07631976],
       [ 3.40196611,  1.05838748],
       [-1.40754214,  1.42716873]])


gm.covariances_

array([[[ 0.6890309 ,  0.79717058],
        [ 0.79717058,  1.21367348]],

       [[ 1.14296668, -0.03114176],
        [-0.03114176,  0.9545003 ]],

       [[ 0.63496849,  0.7298512 ],
        [ 0.7298512 ,  1.16112807]]])


gm.converged_

True


gm.n_iter_

4


gm.predict(X)

array([0, 0, 2, ..., 1, 1, 1], dtype=int64)


gm.predict_proba(X)

array([[9.77227791e-01, 2.27715290e-02, 6.79898914e-07],
       [9.83288385e-01, 1.60345103e-02, 6.77104389e-04],
       [7.51824662e-05, 1.90251273e-06, 9.99922915e-01],
       ...,
       [4.35053542e-07, 9.99999565e-01, 2.17938894e-26],
       [5.27837047e-16, 1.00000000e+00, 1.50679490e-41],
       [2.32355608e-15, 1.00000000e+00, 8.21915701e-41]])


X_new, y_new = gm.sample(6)


X_new

array([[-0.8690223 , -0.32680051],
       [ 0.29945755,  0.2841852 ],
       [ 1.85027284,  2.06556913],
       [ 3.98260019,  1.50041446],
       [ 3.82006355,  0.53143606],
       [-1.04015332,  0.7864941 ]])


y_new

array([0, 0, 1, 1, 1, 2])


from  collections import Counter

X_new, y_new = gm.sample(1000)
Counter(y_new)

Counter({0: 377, 1: 207, 2: 416})


gm.score_samples(X)

array([-2.60674489, -3.57074133, -3.33007348, ..., -3.51379355,
       -4.39643283, -3.8055665 ])


np.exp(gm.score_samples(X_new))

array([1.31639021e-01, 2.96960925e-02, 1.07634613e-01, 1.12503697e-01,
       2.17324126e-02, 3.22192929e-02, 8.05586623e-02, 3.71389975e-02,
       4.61959839e-02, 5.03675976e-02, 1.19512749e-01, 6.84431993e-02,
       1.11923123e-01, 2.11008615e-02, 7.95154261e-02, 4.69492918e-02,
       2.00451856e-02, 5.88440087e-02, 1.04078068e-01, 1.31913878e-01,
       4.09799771e-02, 7.16252879e-02, 2.77195832e-02, 1.22260905e-01,
       9.21260157e-02, 5.28444834e-02, 9.45852830e-02, 8.16135629e-02,
       1.22196182e-01, 4.09359870e-02, 3.97613059e-02, 8.37120648e-02,
       1.05571293e-01, 3.98534829e-02, 4.08397392e-02, 4.69898979e-03,
       1.32262839e-01, 1.94025472e-02, 1.27262102e-01, 4.06930201e-02,
       8.98815435e-02, 8.64395294e-02, 1.06151021e-01, 8.64172062e-02,
       1.03952537e-01, 4.49821725e-02, 1.28355206e-01, 1.35095683e-01,
       5.06827431e-02, 9.54188832e-02, 1.26366536e-01, 2.30658745e-02,
       1.33871902e-01, 2.61271081e-02, 7.32032328e-03, 1.30327530e-01,
       7.02991897e-02, 5.44200670e-02, 6.71002120e-02, 1.94130708e-02,
       1.08225865e-02, 7.48081004e-02, 1.21717881e-01, 4.49531906e-02,
       7.19993064e-02, 2.78941892e-02, 9.83459656e-02, 4.67202404e-02,
       5.75600875e-02, 4.07065545e-02, 9.88258160e-02, 3.23333144e-02,
       1.15844074e-01, 1.26643682e-01, 1.07968142e-01, 1.03022451e-01,
       2.17502834e-02, 5.65677690e-02, 6.43129760e-02, 5.06291792e-02,
       8.00954728e-02, 1.64389983e-02, 1.01760062e-01, 6.92906929e-02,
       1.30624545e-01, 9.48543873e-02, 4.82939527e-02, 4.70852576e-03,
       7.90077230e-02, 7.10714710e-02, 1.05029000e-01, 1.23714632e-01,
       3.37920638e-02, 8.73148986e-02, 6.25110496e-02, 1.26930080e-01,
       9.40790293e-02, 7.22539193e-02, 1.11208350e-01, 4.42892368e-02,
       3.97957413e-02, 1.06477900e-01, 1.72069430e-02, 6.18610325e-02,
       7.11826269e-02, 9.91845008e-02, 1.01391815e-01, 1.23443829e-01,
       1.66575678e-03, 2.98764885e-02, 6.88699387e-02, 7.76281260e-02,
       8.59185100e-02, 1.04705030e-01, 1.12224901e-01, 1.13960863e-02,
       2.11069854e-02, 7.80096361e-02, 1.01958807e-01, 8.39568310e-02,
       6.68137224e-02, 1.00922912e-01, 2.68972249e-02, 4.25403227e-02,
       8.56519721e-03, 4.35069292e-02, 4.84010225e-02, 9.31380291e-02,
       9.17448913e-02, 2.12547553e-03, 6.22606428e-02, 1.31630095e-02,
       1.25319848e-01, 1.75134585e-02, 7.05218374e-02, 7.83073716e-02,
       1.13664116e-01, 1.28604059e-01, 1.10753453e-01, 1.83316178e-02,
       2.12166899e-03, 1.15488101e-01, 1.09908719e-01, 1.20554088e-01,
       8.17153845e-02, 1.02422586e-01, 8.82884605e-02, 8.17683046e-02,
       8.58936070e-02, 8.70940585e-02, 1.37865548e-01, 5.14832817e-02,
       1.17035495e-01, 7.42110145e-02, 7.08757813e-02, 5.91506493e-02,
       1.04780054e-01, 1.30547428e-01, 1.15720611e-01, 1.15354087e-02,
       4.35641955e-02, 5.18566197e-02, 9.37231999e-02, 9.29608832e-02,
       1.10121201e-01, 8.53039368e-02, 9.52599426e-02, 1.08702053e-01,
       9.71384989e-02, 1.18452491e-01, 1.21338447e-01, 2.51502386e-02,
       1.05916386e-01, 4.45451847e-02, 9.48121166e-02, 8.41433491e-02,
       1.33196408e-01, 9.35812882e-02, 3.74735263e-02, 1.27357125e-01,
       1.00875977e-01, 1.35322561e-01, 1.02538337e-01, 1.22590951e-01,
       3.05528091e-02, 1.01749888e-02, 9.32413121e-02, 9.09886424e-03,
       8.33510327e-02, 1.81403942e-02, 1.03844965e-01, 2.26027105e-02,
       7.47698416e-02, 8.31849143e-02, 1.16350920e-02, 6.19823739e-02,
       1.42712741e-02, 6.41501433e-02, 3.77177176e-02, 1.38685829e-01,
       1.03610964e-01, 7.98530239e-02, 1.20709617e-01, 5.72274564e-02,
       2.03317810e-02, 9.92505808e-02, 4.13766220e-02, 6.90477744e-02,
       3.56119471e-03, 8.40472925e-02, 4.41641619e-02, 6.99056859e-02,
       1.68756618e-02, 1.54625757e-02, 1.03611883e-01, 4.47530472e-02,
       1.42095097e-02, 3.64447614e-02, 1.28815601e-01, 2.05421191e-02,
       4.91330168e-02, 8.34431863e-02, 7.04841435e-02, 8.78251541e-02,
       7.64239844e-02, 8.90343484e-02, 8.10324615e-02, 5.93969807e-02,
       2.01206597e-02, 1.07000853e-01, 1.34736012e-01, 8.62297909e-02,
       1.03006184e-01, 1.03915331e-02, 1.86091088e-02, 3.47602055e-02,
       1.11931430e-01, 6.47917099e-04, 8.74925695e-02, 4.05904004e-02,
       4.02904621e-02, 6.42864775e-02, 2.26597123e-02, 4.64683905e-02,
       9.55988292e-02, 3.76103481e-02, 8.16934549e-02, 3.93575057e-02,
       1.46560698e-02, 4.66527457e-02, 1.00861294e-01, 8.13732923e-02,
       9.82729761e-02, 1.33614898e-01, 8.97721629e-02, 8.87590266e-02,
       7.59104278e-02, 1.04143098e-01, 3.23411263e-02, 9.52732795e-02,
       2.32085553e-02, 4.04335315e-02, 8.91830892e-02, 2.84723677e-02,
       6.67905178e-02, 1.12898178e-01, 9.07123373e-02, 7.07925600e-02,
       5.16893314e-02, 4.26703291e-02, 9.94777271e-03, 1.15058527e-01,
       1.07938202e-01, 1.36636997e-01, 6.79129433e-02, 1.20421450e-01,
       1.12918340e-01, 1.17486929e-01, 1.56164971e-02, 9.00086014e-02,
       6.20369764e-02, 1.78125505e-02, 2.76899438e-02, 3.41035654e-02,
       1.28421893e-01, 8.54256073e-03, 1.00273936e-01, 2.56847632e-02,
       1.20215355e-01, 8.89702352e-03, 1.32359998e-01, 8.78589986e-02,
       6.81257309e-02, 1.23575492e-01, 1.17003103e-01, 7.11052050e-02,
       8.33831506e-02, 1.17457639e-01, 6.80915895e-02, 3.79281060e-02,
       2.19712151e-02, 4.96755971e-02, 1.06691983e-02, 1.11818638e-01,
       7.08814715e-02, 6.91187409e-03, 1.06018864e-01, 1.20522063e-01,
       9.17108814e-02, 9.54992116e-03, 3.42145214e-02, 8.62424441e-02,
       7.61724069e-02, 1.15667945e-02, 1.25346928e-01, 2.55467277e-02,
       9.85915088e-02, 3.17427198e-02, 3.27255709e-02, 4.39964384e-02,
       1.30019164e-01, 6.17028045e-03, 1.06178909e-01, 1.33111496e-02,
       7.31017334e-02, 5.05105640e-03, 1.38788219e-01, 1.34679645e-01,
       1.02326025e-01, 1.19807019e-01, 1.07936784e-01, 4.34636220e-02,
       3.44853575e-03, 7.43894606e-03, 1.22570274e-01, 4.29889144e-02,
       1.67758762e-02, 3.99017096e-02, 5.65344645e-02, 1.07721239e-01,
       5.25533760e-02, 1.25981788e-01, 9.76194968e-02, 6.65688991e-02,
       6.53436746e-02, 1.10915549e-01, 6.02060166e-02, 8.89844399e-02,
       7.01237131e-02, 4.22554600e-02, 3.67334571e-02, 1.39018249e-02,
       4.91362824e-03, 1.35092315e-01, 1.10952175e-01, 7.54981007e-02,
       1.36345545e-01, 1.30062106e-01, 3.07440887e-02, 4.41975669e-02,
       1.34489194e-01, 1.30662178e-01, 8.96135287e-02, 8.44271552e-02,
       3.00146460e-02, 7.30541669e-02, 8.28615531e-02, 2.07199295e-02,
       1.96451639e-02, 1.14893610e-01, 9.32568265e-02, 1.02579796e-01,
       4.44703865e-02, 3.95923427e-02, 5.03137783e-02, 3.67909355e-03,
       1.20760795e-01, 1.72794833e-02, 2.68429499e-02, 5.73899254e-02,
       1.28985106e-02, 6.65032614e-03, 5.07383597e-03, 2.12103710e-02,
       1.63129163e-02, 7.66756593e-03, 2.37629796e-02, 2.62519109e-02,
       1.28811469e-02, 2.46421527e-02, 1.52345785e-02, 2.03384377e-02,
       8.01463013e-03, 2.84821293e-02, 1.36154458e-02, 1.02186939e-02,
       2.52827411e-02, 3.18896717e-02, 1.79930893e-02, 2.85500030e-02,
       2.76863269e-02, 2.77671603e-02, 9.75423653e-03, 2.84737798e-02,
       1.57250870e-02, 6.21731590e-03, 2.07336766e-02, 1.80314714e-02,
       1.03418843e-02, 8.29548291e-02, 1.00515455e-02, 1.07172340e-02,
       2.80824858e-02, 8.47729638e-03, 1.43567465e-02, 3.10343940e-02,
       6.29971689e-03, 1.92410987e-02, 2.53863566e-02, 1.57959707e-02,
       2.78830477e-02, 1.51850839e-03, 2.26190130e-02, 2.25265900e-02,
       1.28470541e-02, 1.05037325e-02, 2.01983648e-02, 2.24288791e-02,
       9.91250286e-03, 5.54050909e-03, 2.45377960e-02, 2.89721941e-02,
       3.58588034e-03, 2.28472659e-02, 9.43792112e-03, 6.37497931e-03,
       2.72246990e-02, 1.57501958e-02, 1.00378285e-01, 1.64153096e-02,
       4.78847168e-03, 2.52315838e-02, 1.45253547e-02, 2.00944105e-02,
       1.05131855e-02, 1.54419938e-02, 3.11370735e-02, 1.10656507e-02,
       2.75997909e-02, 1.40904406e-02, 3.13053275e-02, 9.58380130e-03,
       2.66856093e-03, 2.97178748e-02, 8.91835309e-03, 3.05589846e-02,
       2.60754154e-02, 2.83503369e-02, 1.05916646e-02, 2.47592911e-02,
       4.89939058e-03, 2.54263147e-02, 1.58959056e-02, 4.66284914e-03,
       9.97383347e-04, 2.55411370e-02, 1.47432183e-02, 2.64250948e-02,
       9.61135577e-03, 2.57092140e-02, 7.92612591e-03, 7.84709933e-03,
       3.09533050e-02, 2.91384015e-02, 5.14344085e-03, 1.70402250e-02,
       2.71150199e-02, 2.45492475e-02, 1.08037154e-02, 2.43485959e-02,
       1.12698650e-02, 5.01623358e-03, 1.14588326e-02, 2.51034381e-02,
       1.81170251e-02, 4.84721323e-03, 1.52690050e-03, 6.81847992e-03,
       2.14378600e-02, 1.99746955e-02, 1.63171222e-02, 1.70390902e-02,
       3.11355999e-02, 3.88480787e-03, 2.45331748e-02, 1.85783301e-02,
       2.90343737e-02, 2.30134223e-02, 1.45790605e-02, 2.58441527e-02,
       2.31580548e-02, 1.76283937e-02, 1.59578209e-02, 1.51001648e-02,
       4.08569872e-02, 2.76048534e-02, 2.29629123e-02, 2.77117439e-02,
       2.36149923e-02, 3.21653864e-03, 2.79168060e-02, 3.16646202e-02,
       1.56719341e-02, 7.67461490e-03, 2.39035969e-02, 8.22907921e-03,
       5.15351104e-03, 1.79887201e-02, 1.36443615e-02, 1.79542382e-02,
       2.12806528e-02, 7.44734195e-03, 1.09747196e-02, 2.42673372e-02,
       2.73066728e-02, 2.18930450e-02, 1.60410897e-02, 2.57053427e-02,
       4.09945474e-04, 3.63282212e-03, 1.59684326e-02, 7.03759425e-03,
       2.69429370e-02, 2.84526450e-02, 1.39731906e-02, 1.72091858e-02,
       8.67123273e-03, 1.71676337e-02, 2.82574245e-02, 3.50770635e-03,
       2.70510784e-02, 1.84341112e-02, 3.02742590e-02, 3.18128325e-02,
       2.44636287e-02, 2.43265005e-02, 4.32162344e-03, 1.98369545e-02,
       2.92575117e-04, 5.86934354e-02, 1.00605837e-02, 1.51884888e-02,
       4.20731215e-03, 3.06094091e-02, 1.94149336e-02, 2.95243335e-02,
       6.70359576e-02, 1.80109475e-02, 1.03319056e-02, 1.35435126e-02,
       4.95690715e-03, 2.29396240e-02, 3.01753885e-02, 2.77291582e-02,
       2.21334827e-02, 1.94644087e-02, 7.40785679e-03, 2.08897966e-02,
       1.18528609e-02, 3.13114173e-02, 3.14383216e-02, 6.20492330e-03,
       7.44911116e-03, 1.92158810e-03, 1.08450256e-02, 1.84652724e-02,
       1.99295191e-03, 1.87132571e-02, 1.01815031e-04, 3.13930527e-03,
       1.58168965e-03, 1.56031138e-02, 1.85936079e-02, 2.30146654e-02,
       1.39740537e-01, 1.01864214e-01, 3.34398889e-02, 4.70083915e-02,
       5.25704506e-02, 2.69198896e-02, 5.54237492e-02, 8.47275320e-02,
       7.57217419e-02, 5.75110699e-02, 1.11636357e-01, 3.59294343e-02,
       1.20826800e-01, 5.41447887e-02, 1.27376189e-01, 1.20555113e-01,
       1.40493046e-01, 8.83416372e-02, 1.13942264e-02, 5.11140617e-02,
       7.28713202e-02, 6.18831926e-02, 1.07930999e-01, 9.94899027e-03,
       6.62934793e-02, 1.05711907e-01, 1.37221085e-01, 1.39130616e-01,
       7.68623392e-03, 4.71906262e-02, 5.03206161e-03, 2.65996590e-02,
       2.95652635e-02, 3.17998050e-02, 1.28330591e-01, 7.10493942e-02,
       5.39394464e-02, 1.04604693e-01, 4.08503208e-02, 9.41645334e-02,
       1.79959084e-02, 3.18821324e-02, 3.18370433e-02, 1.09634528e-01,
       1.20111052e-01, 7.96973805e-02, 5.10992637e-02, 9.29925008e-02,
       5.00352279e-02, 6.44677284e-02, 7.37547786e-02, 6.67746446e-02,
       8.84784498e-03, 1.31756847e-01, 1.37914710e-01, 6.93259973e-02,
       4.26517521e-02, 2.29338306e-02, 7.73955773e-03, 5.83661011e-03,
       9.90495900e-02, 1.39621968e-01, 8.84722239e-02, 1.28982521e-01,
       6.88509576e-02, 1.04202712e-01, 8.14215821e-02, 1.40715169e-01,
       6.97497964e-02, 1.00734536e-01, 4.58444205e-02, 5.95510857e-02,
       7.62141300e-02, 1.14892146e-01, 1.22153777e-01, 1.24754488e-01,
       2.00618936e-02, 8.96333985e-02, 5.65780677e-02, 2.07569032e-02,
       4.52980329e-02, 1.31540277e-01, 1.11426105e-01, 1.28679849e-01,
       1.38972636e-01, 6.29847351e-02, 2.97919581e-02, 3.49033156e-03,
       5.39492308e-02, 1.41080480e-02, 9.11420950e-02, 4.82881876e-03,
       1.12308130e-01, 4.63858161e-02, 1.19017058e-01, 8.70628271e-02,
       6.79458449e-02, 9.10450467e-02, 8.62813473e-02, 3.20483646e-02,
       1.50378537e-02, 4.66694213e-02, 6.55399700e-02, 2.21236317e-02,
       9.93203588e-02, 1.14356650e-02, 1.02209842e-01, 1.78275388e-02,
       1.05939298e-01, 1.00659175e-01, 1.83748544e-02, 1.29464170e-01,
       3.54798435e-02, 5.51976035e-02, 7.04728423e-02, 5.38269422e-02,
       6.43765186e-02, 1.66014925e-02, 1.29499741e-01, 4.99200360e-02,
       1.21780477e-02, 7.44172950e-02, 1.35568567e-01, 3.84789648e-02,
       6.85217742e-02, 9.26228037e-02, 1.19324307e-02, 9.25074326e-03,
       1.32211506e-01, 9.62748221e-02, 8.20208954e-02, 1.33391429e-01,
       3.75049136e-02, 9.52607889e-02, 5.67662747e-02, 5.97418544e-02,
       1.05126987e-01, 1.03085542e-01, 6.53834784e-02, 3.15228573e-02,
       4.63581978e-03, 2.55007754e-02, 9.85699100e-02, 4.88546983e-02,
       6.59102426e-02, 9.59376681e-02, 1.00030488e-01, 3.22976613e-02,
       6.23875101e-02, 5.71277272e-02, 1.39503917e-01, 8.38506467e-02,
       1.27977970e-01, 8.62995706e-02, 1.01164752e-01, 2.92568093e-02,
       8.88603293e-02, 1.10407344e-02, 1.98145370e-02, 4.58472521e-03,
       1.18035197e-01, 5.73112741e-03, 8.10268237e-03, 6.99144393e-02,
       8.93307000e-02, 1.00736767e-01, 1.25211835e-01, 6.99924127e-02,
       4.70737508e-02, 7.20154614e-02, 1.91621161e-02, 1.39261305e-01,
       3.67060433e-02, 1.49379960e-02, 5.63897675e-02, 8.66044708e-02,
       7.26187312e-02, 7.24661941e-03, 6.67643131e-02, 2.92274616e-02,
       1.32166221e-01, 3.54503641e-02, 1.07060650e-01, 2.15550070e-03,
       2.92639372e-02, 3.19064975e-02, 1.05213367e-01, 8.17216502e-02,
       4.94541717e-02, 1.37198227e-01, 6.80473875e-02, 1.28259328e-01,
       6.71755247e-02, 4.45385428e-02, 1.06464398e-01, 5.04517020e-03,
       1.03144157e-01, 1.22956660e-01, 3.61322078e-02, 4.67684028e-02,
       5.37363118e-02, 1.26185976e-01, 1.93884514e-02, 7.74265583e-02,
       7.13339499e-02, 8.38830264e-02, 1.00990851e-01, 1.29574599e-01,
       9.95909996e-02, 2.34301151e-03, 8.50575359e-02, 5.21786401e-03,
       1.39970839e-02, 1.07984369e-01, 5.38017544e-02, 8.30431736e-02,
       7.95511619e-02, 3.73322188e-02, 1.29430208e-01, 3.97877518e-02,
       9.50305142e-02, 8.46147414e-04, 1.01490143e-01, 6.64749906e-02,
       4.97350663e-02, 7.38084203e-02, 1.35894526e-02, 5.60716539e-02,
       9.76371395e-02, 8.74913062e-02, 6.73364148e-02, 1.22695402e-01,
       8.15635650e-02, 9.49155351e-03, 5.19195199e-02, 1.26682447e-01,
       7.30528023e-02, 8.51003802e-02, 2.43343914e-03, 1.12855728e-01,
       1.29790393e-01, 1.27263092e-01, 5.82619500e-02, 4.43001535e-02,
       5.87212831e-02, 1.23465276e-02, 6.06918497e-02, 7.52412937e-02,
       8.51478100e-02, 1.27915903e-01, 8.52116301e-02, 3.62523460e-02,
       2.10990815e-02, 7.79220569e-02, 1.29720117e-01, 8.87910777e-02,
       3.47046495e-02, 1.23844336e-01, 1.30782003e-01, 1.05879342e-01,
       3.89357118e-02, 2.13160926e-02, 6.78628083e-02, 1.21388265e-01,
       1.17454059e-01, 6.21789267e-02, 1.25724571e-01, 1.84712467e-02,
       8.67990256e-02, 7.57250412e-02, 9.28348832e-03, 1.21488991e-01,
       1.30832811e-01, 9.95525232e-02, 1.19324889e-01, 8.86689340e-02,
       5.04007729e-02, 8.79773000e-02, 2.42947406e-02, 1.29181163e-01,
       1.34428846e-01, 2.77079242e-02, 4.57300796e-02, 2.07496009e-02,
       1.38030947e-01, 7.15028263e-02, 8.90178221e-02, 7.71780800e-02,
       8.37765540e-02, 2.89921848e-03, 2.13562920e-02, 4.80881593e-02,
       8.85655574e-02, 1.61666026e-02, 2.78560371e-02, 3.89322002e-02,
       9.57208641e-02, 1.31340008e-02, 1.20002662e-01, 3.34356235e-02,
       1.13797718e-01, 4.47497027e-02, 1.16675726e-01, 1.01026261e-01,
       5.22022869e-02, 3.40005319e-02, 4.85141320e-02, 1.01145975e-01,
       5.44797839e-02, 2.41163856e-02, 1.10941766e-01, 1.03723775e-01,
       9.74678053e-02, 7.88893048e-02, 2.30592128e-02, 3.48783321e-02,
       7.99527193e-03, 1.19353734e-01, 2.70230434e-02, 1.23933773e-01,
       2.56714658e-02, 9.12943831e-02, 1.07626721e-02, 8.45813835e-02,
       1.39158355e-01, 1.11128038e-01, 1.25875532e-01, 2.87135750e-02,
       1.40069978e-01, 1.06611371e-01, 8.57737987e-02, 1.09324001e-01,
       7.27733347e-02, 1.37685583e-01, 2.38374008e-02, 1.20583535e-01,
       8.19722413e-02, 4.34252539e-02, 8.03182806e-02, 2.36300568e-02,
       4.52604723e-02, 6.81508481e-02, 5.17501368e-02, 1.30531583e-01,
       1.00428893e-01, 2.23152558e-02, 1.51163407e-02, 4.37880176e-02,
       7.13731731e-02, 7.46615160e-02, 9.90674018e-02, 2.25277307e-02,
       1.17603357e-01, 2.47052764e-02, 2.39662130e-02, 5.51633250e-02,
       1.35311745e-01, 9.27187093e-02, 1.25817726e-01, 1.21259874e-01,
       1.27227661e-01, 1.30951832e-01, 3.79341435e-02, 1.15146027e-01,
       1.04436538e-01, 6.60716583e-02, 1.26620386e-01, 2.17821380e-02,
       6.83307843e-02, 2.60390922e-03, 1.01310665e-01, 2.40263389e-02,
       8.51419253e-02, 1.24166830e-01, 1.50139087e-02, 7.69576860e-02,
       1.07364694e-02, 5.51270131e-03, 1.12858511e-02, 5.84411374e-02,
       1.25629173e-01, 1.38041295e-03, 1.22194051e-01, 7.77036376e-02,
       1.24341623e-01, 1.28005052e-01, 1.08384921e-01, 1.31058087e-01,
       7.45785700e-02, 1.31879683e-02, 7.54549781e-02, 4.58437682e-02,
       1.04862247e-04, 7.86197212e-02, 1.34949200e-01, 9.98064938e-02,
       1.20158187e-01, 1.36493087e-01, 7.26361476e-02, 4.16200759e-03,
       1.11397521e-01, 1.30521296e-01, 4.77725085e-02, 9.61293215e-02,
       3.39511486e-02, 7.97535456e-02, 7.83602190e-02, 9.47334089e-02,
       1.04393814e-01, 7.68890205e-02, 5.40313026e-02, 7.46492753e-02,
       9.53936240e-02, 1.05264993e-01, 1.10925161e-01, 9.80903996e-02])


# Build a square evaluation grid covering [-10, 10) on both axes,
# with a spacing of 1/resolution between neighbouring points.
resolution = 100
grid = np.arange(-10, 10, 1 / resolution)
xx, yy = np.meshgrid(grid, grid)

# Flatten the grid into one (x, y) coordinate pair per row.
X_full = np.vstack([xx.ravel(), yy.ravel()]).T


# score_samples() returns log-densities; exp() undoes the log to get the density.
pdf = np.exp(gm.score_samples(X_full))

# Multiply each density by the area of one grid cell and sum: a Riemann-sum
# approximation of the integral of the density over the grid.
pdf_probas = pdf * (1 / resolution) ** 2
pdf_probas.sum()

0.9999999999271592


from matplotlib.colors import LogNorm

def plot_gaussian_mixture(clusterer, X, resolution=1000, show_ylabels=True):
    """Plot density contours, decision boundaries, data and centroids.

    Parameters
    ----------
    clusterer : fitted mixture model
        Must provide ``score_samples``, ``predict``, ``means_`` and
        ``weights_``.
    X : array whose first two columns are plotted as x_1 and x_2.
    resolution : int
        Number of grid points along each axis.
    show_ylabels : bool
        When true, label the y axis; otherwise hide the left tick labels.
    """
    # Evaluation grid: bounding box of the data padded by 0.1 on each side.
    lower = X.min(axis=0) - 0.1
    upper = X.max(axis=0) + 0.1
    x1_axis = np.linspace(lower[0], upper[0], resolution)
    x2_axis = np.linspace(lower[1], upper[1], resolution)
    xx, yy = np.meshgrid(x1_axis, x2_axis)
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # score_samples() values are typically negative, so flip the sign to get
    # positive values that can be drawn on a logarithmic colour scale.
    neg_log_density = (-clusterer.score_samples(grid_points)).reshape(xx.shape)
    contour_levels = np.logspace(0, 2, 12)

    # Filled contours plus thin black contour lines at the same levels.
    plt.contourf(xx, yy, neg_log_density,
                 norm=LogNorm(vmin=1.0, vmax=30.0),
                 levels=contour_levels)
    plt.contour(xx, yy, neg_log_density,
                norm=LogNorm(vmin=1.0, vmax=30.0),
                levels=contour_levels,
                linewidths=1, colors='k')

    # Decision boundaries: red dashed contours of the predicted cluster index.
    assignments = clusterer.predict(grid_points).reshape(xx.shape)
    plt.contour(xx, yy, assignments, linewidths=2, colors='r', linestyles='dashed')

    # Scatter the data and overlay the component centroids.
    plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)
    plot_centroids(clusterer.means_, clusterer.weights_)

    plt.xlabel("$x_1$", fontsize=14)
    if not show_ylabels:
        plt.tick_params(labelleft=False)
    else:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)


# Plot the fitted mixture `gm` over the data and save the figure.
plt.figure(figsize=(8, 4))

plot_gaussian_mixture(gm, X)

save_fig("gaussian_mixtures_plot")
plt.show()

Saving figure gaussian_mixtures_plot


# Fit one 3-component mixture per covariance_type so the resulting cluster
# shapes can be compared; all other settings are identical.
gm_full = GaussianMixture(n_components=3, n_init=10, covariance_type="full", random_state=42)
gm_tied = GaussianMixture(n_components=3, n_init=10, covariance_type="tied", random_state=42)
gm_spherical = GaussianMixture(n_components=3, n_init=10, covariance_type="spherical", random_state=42)
gm_diag = GaussianMixture(n_components=3, n_init=10, covariance_type="diag", random_state=42)
gm_full.fit(X)
gm_tied.fit(X)
gm_spherical.fit(X)
gm_diag.fit(X)

GaussianMixture(covariance_type='diag', n_components=3, n_init=10,
                random_state=42)


def compare_gaussian_mixtures(gm1, gm2, X):
    """Draw two fitted mixtures side by side, titled by their covariance_type.

    ``gm1`` goes in the left panel with y labels, ``gm2`` in the right panel
    without them. Both are plotted over the same data ``X``.
    """
    plt.figure(figsize=(9, 4))

    panels = ((121, gm1, True), (122, gm2, False))
    for position, model, with_ylabels in panels:
        plt.subplot(position)
        plot_gaussian_mixture(model, X, show_ylabels=with_ylabels)
        plt.title('covariance_type="{}"'.format(model.covariance_type), fontsize=14)


# Compare the "tied" and "spherical" covariance models and save the figure.
compare_gaussian_mixtures(gm_tied, gm_spherical, X)

save_fig("covariance_type_plot")
plt.show()

Saving figure covariance_type_plot


# Compare the "full" and "diag" covariance models (shown only, not saved).
compare_gaussian_mixtures(gm_full, gm_diag, X)
plt.tight_layout()
plt.show()


# Flag as anomalies the instances whose score falls below the 4th percentile.
densities = gm.score_samples(X)
density_threshold = np.percentile(densities, 4)  # density threshold: the lowest 4% are treated as anomalies
anomalies = X[densities < density_threshold]     # the anomalous instances


# Plot the mixture and highlight the detected anomalies.
plt.figure(figsize=(8, 4))

plot_gaussian_mixture(gm, X)
plt.scatter(anomalies[:, 0], anomalies[:, 1], color='r', marker='*')  # mark anomalies with red stars
plt.ylim(top=5.1)

save_fig("mixture_anomaly_detection_plot")
plt.show()

Saving figure mixture_anomaly_detection_plot


gm.bic(X)

8189.662685850679


gm.aic(X)

8102.437405735641


# Recompute BIC and AIC by hand from the number of free parameters:
# p = (n_clusters - 1) + (n_clusters * n_dims) + (n_clusters * n_dims * (n_dims + 1) // 2)
n_clusters = 3
n_dims = 2
n_params_for_weights = n_clusters - 1      # per-cluster weights
n_params_for_means = n_clusters * n_dims   # per-cluster means
n_params_for_covariance = n_clusters * n_dims * (n_dims + 1) // 2   # per-cluster covariances
n_params = n_params_for_weights + n_params_for_means + n_params_for_covariance

# Total maximised log-likelihood: gm.score(X) multiplied by the number of samples.
max_log_likelihood = gm.score(X) * len(X)

bic = np.log(len(X)) * n_params - 2 * max_log_likelihood
aic = 2 * n_params - 2 * max_log_likelihood


bic, aic

(8189.662685850679, 8102.437405735641)


# Fit one mixture for every number of components k = 1..10.
gms_per_k = [GaussianMixture(n_components=k, n_init=10, random_state=42).fit(X)
             for k in range(1, 11)]


# BIC and AIC of each fitted model, in the same order as gms_per_k.
bics = [model.bic(X) for model in gms_per_k]
aics = [model.aic(X) for model in gms_per_k]


# Plot BIC and AIC as functions of the number of components k.
plt.figure(figsize=(8, 3))

plt.plot(range(1, 11), bics, "bo-", label="BIC")
plt.plot(range(1, 11), aics, "go--", label="AIC")

plt.xlabel("$k$", fontsize=14)
plt.ylabel("Information Criterion", fontsize=14)
# The y range is derived from the AIC values, padded by 50 on each side.
plt.axis([1, 9.5, np.min(aics) - 50, np.max(aics) + 50])

# The arrow points at the BIC value for k=3 (index 2); the position is hard-coded.
plt.annotate('Minimum',
             xy=(3, bics[2]),
             xytext=(0.35, 0.6),
             textcoords='figure fraction',
             fontsize=14,
             arrowprops=dict(facecolor='black', shrink=0.1)
            )

plt.legend()
save_fig("aic_bic_vs_k_plot")
plt.show()

Saving figure aic_bic_vs_k_plot


# Grid search over the number of components and the covariance structure,
# keeping the combination with the lowest BIC in best_k / best_covariance_type.
# np.inf is used instead of the alias np.infty, which was removed in NumPy 2.0.
min_bic = np.inf

for k in range(1, 11):
    for covariance_type in ("full", "tied", "spherical", "diag"):
        bic = GaussianMixture(n_components=k, n_init=10,
                              covariance_type=covariance_type,
                              random_state=42).fit(X).bic(X)
        # Strict comparison: on ties the earliest combination is kept.
        if bic < min_bic:
            min_bic = bic
            best_k = k
            best_covariance_type = covariance_type


best_k

3


best_covariance_type

'full'


from scipy.stats import norm


# x axis: 101 points from -6 to 4.
xx = np.linspace(-6, 4, 101)
# Parameter axis (used as the normal scale below): 101 points from 1 to 2.
ss = np.linspace(1, 2, 101)

# 101 x 101 grid over the two ranges above.
XX, SS = np.meshgrid(xx, ss)

# Unnormalised density: a mix of two normals centred at 1 and -4 with scale SS,
# the first weighted twice as much as the second.
ZZ = 2 * norm.pdf(XX, 1, SS) + norm.pdf(XX, -4, SS)
# Normalise each row (fixed parameter value) so that sum(row) * dx == 1.
ZZ = ZZ / ZZ.sum(axis=1)[:,np.newaxis] / (xx[1] - xx[0])


from matplotlib.patches import Polygon

# Four-panel figure relating the model f(x; theta), its PDF at a fixed theta,
# and the likelihood / log-likelihood at a fixed x.
plt.figure(figsize=(8, 4.5))

x_idx = 85   # index of x=2.5
s_idx = 30   # index of theta=1.3

# Top-left: the parametric model f(x; theta)
plt.subplot(221)
plt.contourf(XX, SS, ZZ, cmap="GnBu")
plt.plot([-6, 4], [ss[s_idx], ss[s_idx]], "k-", linewidth=2)
plt.plot([xx[x_idx], xx[x_idx]], [1, 2], "b-", linewidth=2)
plt.xlabel(r"$x$")
plt.ylabel(r"$\theta$", fontsize=14, rotation=0)
plt.title(r"Model $f(x; \theta)$", fontsize=14)

# Top-right: likelihood function at x=2.5, L(theta|x=2.5) = f(x=2.5; theta)
plt.subplot(222)
plt.plot(ss, ZZ[:, x_idx], "b-")
max_idx = np.argmax(ZZ[:, x_idx])     # MLE
max_val = np.max(ZZ[:, x_idx])
plt.plot(ss[max_idx], max_val, "r.")  # mark the MLE (red dot)
plt.plot([ss[max_idx], ss[max_idx]], [0, max_val], "r:")
plt.plot([0, ss[max_idx]], [max_val, max_val], "r:")
plt.text(1.01, max_val + 0.005, r"$\hat{L}$", fontsize=14)
plt.text(ss[max_idx]+ 0.01, 0.055, r"$\hat{\theta}$", fontsize=14)
plt.text(ss[max_idx]+ 0.01, max_val - 0.012, r"$Max$", fontsize=12)
plt.axis([1, 2, 0.05, 0.15])
plt.xlabel(r"$\theta$", fontsize=14)
plt.grid(True)
plt.text(1.99, 0.135, r"$=f(x=2.5; \theta)$", fontsize=14, ha="right")
plt.title(r"Likelihood function $\mathcal{L}(\theta|x=2.5)$", fontsize=14)

# Bottom-left: probability density function at theta=1.3, f(x; theta=1.3)
plt.subplot(223)
plt.plot(xx, ZZ[s_idx], "k-")
plt.axis([-6, 4, 0, 0.25])
plt.xlabel(r"$x$", fontsize=14)
plt.grid(True)
plt.title(r"PDF $f(x; \theta=1.3)$", fontsize=14)
# Shade the area under the curve between xx[41] and xx[80].
verts = [(xx[41], 0)] + list(zip(xx[41:81], ZZ[s_idx, 41:81])) + [(xx[80], 0)]
poly = Polygon(verts, facecolor='0.9', edgecolor='0.5')
plt.gca().add_patch(poly)

# Bottom-right: log-likelihood function at x=2.5, log L(theta|x=2.5)
plt.subplot(224)
plt.plot(ss, np.log(ZZ[:, x_idx]), "b-")
max_idx = np.argmax(np.log(ZZ[:, x_idx]))
max_val = np.max(np.log(ZZ[:, x_idx]))
plt.plot(ss[max_idx], max_val, "r.")
plt.plot([ss[max_idx], ss[max_idx]], [-5, max_val], "r:")
plt.plot([0, ss[max_idx]], [max_val, max_val], "r:")
plt.axis([1, 2, -2.4, -2])
plt.xlabel(r"$\theta$", fontsize=14)
plt.text(ss[max_idx]+ 0.01, max_val - 0.05, r"$Max$", fontsize=12)
plt.text(ss[max_idx]+ 0.01, -2.39, r"$\hat{\theta}$", fontsize=14)
plt.text(1.01, max_val + 0.02, r"$\log \, \hat{L}$", fontsize=14)
plt.grid(True)
plt.title(r"$\log \, \mathcal{L}(\theta|x=2.5)$", fontsize=14)

save_fig("likelihood_function_plot")
plt.show()

Saving figure likelihood_function_plot


from sklearn.mixture import BayesianGaussianMixture


# Bayesian mixture with up to 10 components, default max_iter.
bgm = BayesianGaussianMixture(n_components=10, n_init=10, random_state=42)
bgm.fit(X)

C:\Users\gslee\anaconda3\lib\site-packages\sklearn\mixture\_base.py:265: ConvergenceWarning: Initialization 10 did not converge. Try different init parameters, or increase max_iter, tol or check for degenerate data.
  warnings.warn('Initialization %d did not converge. '

BayesianGaussianMixture(n_components=10, n_init=10, random_state=42)


# Same model with max_iter raised to 150 to give each initialisation more
# iterations to converge.
bgm = BayesianGaussianMixture(n_components=10, n_init=10, max_iter=150, random_state=42)

bgm.fit(X)

BayesianGaussianMixture(max_iter=150, n_components=10, n_init=10,
                        random_state=42)


# Alternative: keep the default max_iter but try 15 initialisations.
bgm = BayesianGaussianMixture(n_components=10, n_init=15, random_state=42)
bgm.fit(X)

BayesianGaussianMixture(n_components=10, n_init=15, random_state=42)


np.round(bgm.weights_, 2)

array([0.  , 0.39, 0.2 , 0.4 , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])


# Plot the fitted Bayesian mixture over the data.
plt.figure(figsize=(8, 5))
plot_gaussian_mixture(bgm, X)
plt.show()


# Prior belief (weight_concentration_prior): 0.01
bgm_low = BayesianGaussianMixture(n_components=10, max_iter=1000, n_init=1,
                                  weight_concentration_prior=0.01, random_state=42)
# Prior belief (weight_concentration_prior): 10,000
bgm_high = BayesianGaussianMixture(n_components=10, max_iter=1000, n_init=1,
                                  weight_concentration_prior=10000, random_state=42)
# Training set size: only the first 73 instances are used.
nn = 73
bgm_low.fit(X[:nn])
bgm_high.fit(X[:nn])

BayesianGaussianMixture(max_iter=1000, n_components=10, random_state=42,
                        weight_concentration_prior=10000)


np.round(bgm_low.weights_, 2)

array([0.49, 0.51, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])


np.round(bgm_high.weights_, 2)

array([0.43, 0.01, 0.01, 0.11, 0.01, 0.01, 0.01, 0.37, 0.01, 0.01])


# Side-by-side comparison of the two concentration priors on the same subset.
plt.figure(figsize=(9, 4))

# Left panel: low prior
plt.subplot(121)
plot_gaussian_mixture(bgm_low, X[:nn])
plt.title("weight_concentration_prior = 0.01", fontsize=14)

# Right panel: high prior
plt.subplot(122)
plot_gaussian_mixture(bgm_high, X[:nn], show_ylabels=False)
plt.title("weight_concentration_prior = 10000", fontsize=14)

save_fig("mixture_concentration_prior_plot")
plt.show()

Saving figure mixture_concentration_prior_plot


# Generate the two-moons dataset (1000 samples, noise 0.05).
X_moons, y_moons = make_moons(n_samples=1000, noise=0.05, random_state=42)


# Fit a Bayesian mixture with up to 10 components to the moons data.
bgm = BayesianGaussianMixture(n_components=10, n_init=10, random_state=42)
bgm.fit(X_moons)

BayesianGaussianMixture(n_components=10, n_init=10, random_state=42)


# Left: the raw moons data. Right: the mixture fitted to it.
plt.figure(figsize=(9, 3.2))

plt.subplot(121)
plot_data(X_moons)
plt.xlabel("$x_1$", fontsize=14)
plt.ylabel("$x_2$", fontsize=14, rotation=0)

plt.subplot(122)
plot_gaussian_mixture(bgm, X_moons, show_ylabels=False)

save_fig("moons_vs_bgm_plot")
plt.show()

Saving figure moons_vs_bgm_plot


from sklearn.datasets import fetch_olivetti_faces

# Load the Olivetti faces dataset.
olivetti = fetch_olivetti_faces()


# Print the dataset's bundled description.
print(olivetti.DESCR)

.. _olivetti_faces_dataset:

The Olivetti faces dataset
--------------------------

`This dataset contains a set of face images`_ taken between April 1992 and 
April 1994 at AT&T Laboratories Cambridge. The
:func:`sklearn.datasets.fetch_olivetti_faces` function is the data
fetching / caching function that downloads the data
archive from AT&T.

.. _This dataset contains a set of face images: http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html

As described on the original website:

    There are ten different images of each of 40 distinct subjects. For some
    subjects, the images were taken at different times, varying the lighting,
    facial expressions (open / closed eyes, smiling / not smiling) and facial
    details (glasses / no glasses). All the images were taken against a dark
    homogeneous background with the subjects in an upright, frontal position 
    (with tolerance for some side movement).

**Data Set Characteristics:**

    =================   =====================
    Classes                                40
    Samples total                         400
    Dimensionality                       4096
    Features            real, between 0 and 1
    =================   =====================

The image is quantized to 256 grey levels and stored as unsigned 8-bit 
integers; the loader will convert these to floating point values on the 
interval [0, 1], which are easier to work with for many algorithms.

The "target" for this database is an integer from 0 to 39 indicating the
identity of the person pictured; however, with only 10 examples per class, this
relatively small dataset is more interesting from an unsupervised or
semi-supervised perspective.

The original dataset consisted of 92 x 112, while the version available here
consists of 64x64 images.

When using these images, please give credit to AT&T Laboratories Cambridge.


olivetti.target

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,
        8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13,
       13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15,
       15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
       18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
       20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22,
       22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
       23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25,
       25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27,
       27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28,
       28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30,
       30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
       32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
       34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35,
       35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37,
       37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39,
       39, 39, 39, 39, 39, 39, 39, 39, 39])


from sklearn.model_selection import StratifiedShuffleSplit

# First split: hold out 40 images as the test set, stratified on the target.
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=40, random_state=42)
train_valid_idx, test_idx = next(strat_split.split(olivetti.data, olivetti.target))
X_train_valid = olivetti.data[train_valid_idx]
y_train_valid = olivetti.target[train_valid_idx]
X_test = olivetti.data[test_idx]
y_test = olivetti.target[test_idx]

# Second split: carve an 80-image validation set out of the remaining data.
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=80, random_state=43)
train_idx, valid_idx = next(strat_split.split(X_train_valid, y_train_valid))
X_train = X_train_valid[train_idx]
y_train = y_train_valid[train_idx]
X_valid = X_train_valid[valid_idx]
y_valid = y_train_valid[valid_idx]


# Sanity check: print the shapes of the three subsets.
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(X_test.shape, y_test.shape)

(280, 4096) (280,)
(80, 4096) (80,)
(40, 4096) (40,)


from sklearn.decomposition import PCA

# A float n_components asks PCA to keep enough components to preserve that
# fraction of the variance (here 99%).
pca = PCA(0.99)
# Fit on the training set only; reuse the same projection for validation and test.
X_train_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)
X_test_pca = pca.transform(X_test)

# Number of components actually retained.
pca.n_components_

199


from sklearn.cluster import KMeans

# Fit one KMeans model for each k in 5, 10, ..., 145, printing progress.
k_range = range(5, 150, 5)
kmeans_per_k = []
for k in k_range:
    print("k={}".format(k))
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_train_pca)
    kmeans_per_k.append(kmeans)

k=5
k=10
k=15
k=20
k=25
k=30
k=35
k=40
k=45
k=50
k=55
k=60
k=65
k=70
k=75
k=80
k=85
k=90
k=95
k=100
k=105
k=110
k=115
k=120
k=125
k=130
k=135
k=140
k=145


from sklearn.metrics import silhouette_score

# Silhouette score of every fitted model; the best k is the one that maximises it.
silhouette_scores = [silhouette_score(X_train_pca, model.labels_)
                     for model in kmeans_per_k]
best_index = np.argmax(silhouette_scores)
best_k = k_range[best_index]
best_score = silhouette_scores[best_index]

# Plot score versus k and mark the best k with a red square.
plt.figure(figsize=(8, 3))
plt.plot(k_range, silhouette_scores, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Silhouette score", fontsize=14)
plt.plot(best_k, best_score, "rs")
plt.show()


best_k

100


# Inertia of every fitted model; best_index was chosen by silhouette score.
inertias = [model.inertia_ for model in kmeans_per_k]
best_inertia = inertias[best_index]

# Plot inertia versus k and mark the silhouette-selected k with a red square.
plt.figure(figsize=(8, 3.5))
plt.plot(k_range, inertias, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Inertia", fontsize=14)
plt.plot(best_k, best_inertia, "rs")
plt.show()


# Keep the KMeans model with the best silhouette score.
best_model = kmeans_per_k[best_index]


def plot_faces(faces, labels, n_cols=5):
    """Show each face as a 64x64 grayscale image titled with its label.

    Images are laid out row by row in a grid with ``n_cols`` columns, using as
    many rows as needed. ``faces`` and ``labels`` are iterated in lockstep, so
    the shorter of the two determines how many images are drawn.
    """
    # Rows needed to fit every face (ceiling division).
    n_rows = (len(faces) - 1) // n_cols + 1
    plt.figure(figsize=(n_cols, n_rows * 1.1))
    # Subplot positions are 1-based.
    for position, (image, caption) in enumerate(zip(faces, labels), start=1):
        plt.subplot(n_rows, n_cols, position)
        plt.imshow(image.reshape(64, 64), cmap="gray")
        plt.axis("off")
        plt.title(caption)
    plt.show()

# Show the training faces assigned to each cluster, titled with their true labels.
for cluster_id in np.unique(best_model.labels_):
    print("Cluster", cluster_id)
    in_cluster = best_model.labels_==cluster_id  # boolean mask of this cluster's members
    faces = X_train[in_cluster].reshape(-1, 64, 64)
    labels = y_train[in_cluster]
    plot_faces(faces, labels)

Cluster 0

Cluster 1

Cluster 2

Cluster 3

Cluster 4

Cluster 5

Cluster 6

Cluster 7

Cluster 8

Cluster 9


from sklearn.ensemble import RandomForestClassifier

# Baseline: random forest trained directly on the PCA features,
# scored on the validation set.
clf = RandomForestClassifier(n_estimators=150, random_state=42)
clf.fit(X_train_pca, y_train)
clf.score(X_valid_pca, y_valid)

0.9


# Replace the PCA features with the output of the best KMeans model's
# transform() (one feature per cluster).
X_train_reduced = best_model.transform(X_train_pca)
X_valid_reduced = best_model.transform(X_valid_pca)
X_test_reduced = best_model.transform(X_test_pca)

# Same classifier as the baseline, trained on the KMeans-derived features.
clf = RandomForestClassifier(n_estimators=150, random_state=42)
clf.fit(X_train_reduced, y_train)

clf.score(X_valid_reduced, y_valid)

0.75


from sklearn.pipeline import Pipeline

# For each candidate number of clusters, train a KMeans -> random forest
# pipeline and print its validation score.
for n_clusters in k_range:
    pipeline = Pipeline([
        ("kmeans", KMeans(n_clusters=n_clusters, random_state=42)),
        ("forest_clf", RandomForestClassifier(n_estimators=150, random_state=42))
    ])
    pipeline.fit(X_train_pca, y_train)
    print(n_clusters, pipeline.score(X_valid_pca, y_valid))

5 0.4125
10 0.525
15 0.5375
20 0.6375
25 0.65
30 0.6375
35 0.675
40 0.7375
45 0.725
50 0.75
55 0.7375
60 0.725
65 0.7375
70 0.725
75 0.725
80 0.775
85 0.7375
90 0.7375
95 0.75
100 0.75
105 0.75
110 0.7375
115 0.7375
120 0.75
125 0.75
130 0.725
135 0.75
140 0.7625
145 0.7375


# Append the KMeans-derived features to the PCA features (column-wise).
X_train_extended = np.c_[X_train_pca, X_train_reduced]
X_valid_extended = np.c_[X_valid_pca, X_valid_reduced]
X_test_extended = np.c_[X_test_pca, X_test_reduced]


# Train and score the same classifier on the extended feature set.
clf = RandomForestClassifier(n_estimators=150, random_state=42)
clf.fit(X_train_extended, y_train)
clf.score(X_valid_extended, y_valid)

0.825


from sklearn.mixture import GaussianMixture

# Fit a 40-component Gaussian mixture to the PCA-reduced training faces.
gm = GaussianMixture(n_components=40, random_state=42)
y_pred = gm.fit_predict(X_train_pca)


# Sample new points in the reduced space, then map them back to pixel space.
# gm.sample() also returns the index of the component each sample came from.
n_gen_faces = 20
gen_faces_reduced, y_gen_faces = gm.sample(n_samples=n_gen_faces)
gen_faces = pca.inverse_transform(gen_faces_reduced)


# Display the generated faces, titled with their component index.
plot_faces(gen_faces, y_gen_faces)


# Build a small set of deliberately altered faces from the first training images.

# "Rotated": swap the two image axes (transpose) of the first 4 faces.
n_rotated = 4
rotated = np.transpose(X_train[:n_rotated].reshape(-1, 64, 64), axes=[0, 2, 1])
rotated = rotated.reshape(-1, 64*64)
y_rotated = y_train[:n_rotated]

# "Flipped": reverse the first image axis of the first 3 faces.
n_flipped = 3
flipped = X_train[:n_flipped].reshape(-1, 64, 64)[:, ::-1]
flipped = flipped.reshape(-1, 64*64)
y_flipped = y_train[:n_flipped]

# "Darkened": scale every pixel except the first and last of each flattened
# image by 0.3 (on a copy, so X_train is untouched).
n_darkened = 3
darkened = X_train[:n_darkened].copy()
darkened[:, 1:-1] *= 0.3
darkened = darkened.reshape(-1, 64*64)
y_darkened = y_train[:n_darkened]

# Stack the altered faces and their true labels in the same order.
X_bad_faces = np.r_[rotated, flipped, darkened]
y_bad = np.concatenate([y_rotated, y_flipped, y_darkened])

plot_faces(X_bad_faces, y_bad)


# Project the altered faces with the PCA fitted on the training set.
X_bad_faces_pca = pca.transform(X_bad_faces)


# Score the altered faces under the mixture fitted on the training faces.
gm.score_samples(X_bad_faces_pca)

array([-1.79997468e+07, -2.26427421e+07, -3.96415646e+07, -4.60254380e+07,
       -3.13975227e+07, -1.39330251e+07, -2.90882963e+07, -1.06298693e+08,
       -1.20931144e+08, -7.49740718e+07])


gm.score_samples(X_train_pca[:10])

array([1163.02020938, 1149.16682072, 1148.47710555, 1170.67602773,
       1088.46009527, 1075.7170049 , 1075.71700925, 1088.46008902,
       1096.42609678, 1119.68627018])


X_train_pca

array([[ 3.7807992e+00, -1.8547927e+00, -5.1440420e+00, ...,
        -1.3563001e-01, -2.1408510e-01,  6.1194517e-02],
       [ 1.0148863e+01, -1.5275445e+00, -7.6698363e-01, ...,
         1.2393168e-01, -1.3526660e-01, -2.3265788e-02],
       [-1.0015284e+01,  2.8772824e+00, -9.1987586e-01, ...,
         7.2610505e-02, -2.9626514e-03,  1.2489169e-01],
       ...,
       [ 2.4758759e+00,  2.9559698e+00,  1.2998563e+00, ...,
        -2.0908976e-02,  3.4845721e-02, -1.5432714e-01],
       [-3.2203169e+00,  5.3489785e+00,  1.3942686e+00, ...,
         5.7551935e-02, -2.2830766e-01,  1.5557502e-01],
       [-9.2287689e-01, -3.6470294e+00,  2.2608802e+00, ...,
         1.3684936e-01, -6.9123939e-02,  6.2689997e-02]], dtype=float32)


def reconstruction_errors(pca, X):
    """Return the per-sample mean squared PCA reconstruction error.

    Each row of ``X`` is projected with ``pca.transform`` and mapped back with
    ``pca.inverse_transform``; the squared difference to the original row is
    averaged over the last axis.
    """
    restored = pca.inverse_transform(pca.transform(X))
    squared_residual = np.square(restored - X)
    return squared_residual.mean(axis=-1)


reconstruction_errors(pca, X_train).mean()

0.00019205351


reconstruction_errors(pca, X_bad_faces).mean()

0.004707354


# Show the altered faces with their true labels. y_bad is the label array built
# for X_bad_faces; y_gen_faces holds the component indices of the samples drawn
# with gm.sample() and does not correspond to these images.
plot_faces(X_bad_faces, y_bad)


# Reconstruct the altered faces from their PCA projection and show them with
# the same labels.
X_bad_faces_reconstructed = pca.inverse_transform(X_bad_faces_pca)
plot_faces(X_bad_faces_reconstructed, y_bad)

9장 비지도학습¶

주요 내용¶

기본 설정¶

9.1 군집화¶

분류 대 군집화¶

9.1.1 k-평균¶

군집화 훈련과 예측¶

결정경계¶

군집화와 차원축소¶

k-평균 알고리즘¶

사이킷런의 KMeans 클래스¶

센트로이드 초기화 문제와 해결법¶

초기화 문제¶

관성(inertia, 이너셔)¶

초기화 반복¶

K-Means++ 알고리즘¶

init 하이퍼파라미터 활용¶

개선된 k-평균 알고리즘과 미니배치 k-평균¶

개선된 k-평균 알고리즘¶

미니배치 k-평균¶

훈련 세트가 많이 큰 경우: memmap 클래스 활용¶

훈련 세트가 너무 큰 경우: partial_fit() 활용¶

최적의 군집수¶

관성 활용¶

실루엣 점수 활용¶

9.1.2 k-평균의 한계¶

특성 스케일링 활용¶

9.1.3 군집화 활용: 이미지 색상 분할¶

이미지 색상 분할 과정¶

군집수에 따른 비교¶

9.1.4 군집화 활용: 차원축소¶

로지스틱 회귀 적용¶

군집화 전처리 후 성능 평가¶

그리드 탐색 활용¶

9.1.5 군집화 활용: 준지도학습¶

준지도학습¶

대표이미지 활용¶

레이블 전파 1¶

레이블 전파 2¶

레이블 전파 정확도¶

9.1.6 DBSCAN¶

DBSCAN과 예측하기¶

k-최근접 이웃 분류기¶

이상치 처리¶

계층적 DBSCAN: HDBSCAN¶

9.1.7 기타 군집화 알고리즘¶

스펙트럼 군집화(Spectral Clustering)¶

병합 군집화(Agglomerative Clustering)¶

9.2 가우시안 혼합(Gaussian Mixtures)¶

predict() 와 predict_proba()¶

생성모델¶

✋ score_samples() 와 확률밀도¶

결정경계와 밀도 등고선¶

covariance_type 속성과 군집 형태¶

9.2.1 가우시안 혼합과 이상치 탐지¶

9.2.2 군집수 선택¶

✋ BIC와 AIC¶

군집수에 따른 BIC와 AIC¶

✋ 가능도(likelihood) 함수¶

9.2.3 베이즈 가우시안 혼합 모델¶

사전 확률¶

가우시안 혼합 모델의 한계¶

연습문제¶

1. to 9.¶

10. Cluster the Olivetti Faces Dataset¶

11. Using Clustering as Preprocessing for Classification¶

12. A Gaussian Mixture Model for the Olivetti Faces Dataset¶

13. Using Dimensionality Reduction Techniques for Anomaly Detection¶

사이킷런의 `KMeans` 클래스¶

`init` 하이퍼파라미터 활용¶

훈련 세트가 많이 큰 경우: `memmap` 클래스 활용¶

훈련 세트가 너무 큰 경우: `partial_fit()` 활용¶

`predict()` 와 `predict_proba()`¶

✋ `score_samples()` 와 확률밀도¶

`covariance_type` 속성과 군집 형태¶