# 머신 러닝 (20) GMM Clustering
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
# Load the wine dataset and expose its feature matrix as a DataFrame whose
# columns carry the dataset's own feature names.
wine = load_wine()
X = pd.DataFrame(data=wine.data, columns=wine.feature_names)

# Report the raw (unscaled) table: its shape and the first rows.
print(f"데이터 크기: {X.shape}")
print(f"\n데이터 샘플:\n{X.head()}")

# Standardize the features; the fitted scaler object is kept in `scaler`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# PCA 적용
# Project the standardized features onto two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Print the variance ratio explained by each retained component, then their
# total formatted as a percentage.
variance_ratio = pca.explained_variance_ratio_
print(f"설명된 분산 비율: {variance_ratio}")
print(f"총 설명된 분산: {sum(variance_ratio):.2%}")
# Configure and fit a 3-component Gaussian mixture on the 2-D PCA projection.
# random_state pins the seed used by the estimator.
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(X_pca)

# Hard assignment: the component index predicted for each sample.
labels = gmm.predict(X_pca)
print(f"발견된 군집 개수: {len(np.unique(labels))}")
print(f"\n군집 중심점 (평균):\n{gmm.means_}")

# Number of samples assigned to each cluster. The cluster count is taken from
# the model instead of repeating the literal 3, and minlength makes a cluster
# that received no samples still show up with a count of 0.
cluster_sizes = np.bincount(labels, minlength=gmm.n_components)
for k, count in enumerate(cluster_sizes):
    print(f"클러스터 {k}: {count}개")
# Soft assignment: per-sample membership probability for every cluster
# (rows are samples, columns are clusters).
probs = gmm.predict_proba(X_pca)
print("첫 5개 샘플의 군집별 소속 확률:")

# Build the header from the actual number of probability columns so the table
# stays aligned if the number of mixture components changes.
header = " | ".join(
    ["Sample"] + [f"Cluster {k}" for k in range(probs.shape[1])]
)
print(header)
print("-" * max(45, len(header)))

# Slicing (rather than range(5)) avoids an IndexError when fewer than five
# samples are available.
for i, row in enumerate(probs[:5]):
    cells = " | ".join(f"{p:9.6f}" for p in row)
    print(f" {i:2d} | {cells}")
plt.figure(figsize=(6, 4))

# Means of the fitted mixture components, used as the cluster centers.
centers = gmm.means_
n_clusters = len(centers)

# One viridis color per cluster; the count follows the fitted model instead of
# a hard-coded 3.
colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))
for k, col in enumerate(colors):
    my_members = (labels == k)
    # Samples assigned to cluster k.
    plt.plot(X_pca[my_members, 0], X_pca[my_members, 1], 'o',
             markerfacecolor=col, markeredgecolor='k', markersize=8, alpha=0.6)
    # Mean of component k, drawn as a large red star.
    plt.plot(centers[k, 0], centers[k, 1], '*',
             markerfacecolor='red', markeredgecolor='k', markersize=20)

plt.title(
    f'GMM Clustering Results (Wine Dataset + PCA)\nComponents: {n_clusters}'
)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.show()
# Two side-by-side panels: GMM assignments (left) and true labels (right).
fig, axes = plt.subplots(1, 2, figsize=(8, 4))

# Left panel: samples colored by their GMM cluster, plus each cluster center.
# The number of clusters is read from `centers` instead of a hard-coded 3.
n_clusters = len(centers)
colors_gmm = plt.cm.viridis(np.linspace(0, 1, n_clusters))
for k, col in enumerate(colors_gmm):
    my_members = (labels == k)
    axes[0].scatter(X_pca[my_members, 0], X_pca[my_members, 1],
                    c=[col], label=f'Cluster {k}', s=60, edgecolors='k',
                    alpha=0.6)
    # Center of cluster k as a red star (no label, so it stays out of the
    # legend).
    axes[0].scatter(centers[k, 0], centers[k, 1],
                    marker='*', c='red', s=300, edgecolors='k')

axes[0].set_title(f'GMM ({n_clusters} components)')
axes[0].set_xlabel('PC1')
axes[0].set_ylabel('PC2')
axes[0].legend()
axes[0].grid(True)
# Right panel: the same PCA projection colored by the dataset's own class
# labels, for visual comparison with the GMM panel.
# The distinct class values are computed once and reused for the colors, the
# loop and the title.
classes = np.unique(wine.target)
colors_true = plt.cm.Set1(np.linspace(0, 1, len(classes)))
for k, col in zip(classes, colors_true):
    my_members = (wine.target == k)
    axes[1].scatter(X_pca[my_members, 0], X_pca[my_members, 1],
                    c=[col], label=f'Class {k}', s=60, edgecolors='k',
                    alpha=0.6)

axes[1].set_title(f'True Labels ({len(classes)} classes)')
axes[1].set_xlabel('PC1')
axes[1].set_ylabel('PC2')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.show()
# 댓글
# 댓글 쓰기