머신 러닝 (19) Hierarchical Clustering

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import silhouette_score # silhouette_score 추가
import warnings
warnings.filterwarnings('ignore')

# Load the scikit-learn diabetes dataset (features + disease-progression target).
# NOTE: every top-level statement must start at column 0; a stray leading
# space on the first statement raises IndentationError before anything runs.
diabetes = load_diabetes()

X = diabetes.data
y = diabetes.target
feature_names = diabetes.feature_names

# Keep features and target side by side in one DataFrame so that cluster
# labels can be attached and summarized against the target afterwards.
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# Only the feature columns are clustered; the target column is left out.
X_features = df[feature_names].values

# Standardize the features before computing Euclidean distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_features)

# Candidate cluster counts to evaluate: 2 through 10.
k_range = range(2, 11)
sil_scores = []

# Estimator settings shared by every candidate model in the sweep.
sweep_params = {
    'metric': 'euclidean',  # distance measure (keyword renamed from `affinity`)
    'linkage': 'ward',      # Ward linkage (suited to continuous variables)
}

# Fit one hierarchical model per k and record its silhouette score.
for k in k_range:
    hc = AgglomerativeClustering(n_clusters=k, **sweep_params)
    labels = hc.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    sil_scores.append(score)
    print(f"k = {k:2d}, 실루엣 점수 = {score:.4f}")

# Select the k whose silhouette score is the highest.
best_idx = int(np.argmax(sil_scores))
best_k = k_range[best_idx]
best_score = max(sil_scores)

# Report the winner: a blank line, a header line, then the result line.
print(
    "\n=== 실루엣 기준 최적 k 선택 결과 (계층적 군집) ===",
    f"최적 군집 수 k = {best_k}, 실루엣 점수 = {best_score:.4f}",
    sep="\n",
)

# Refit the hierarchical model with the selected number of clusters.
# `metric` is the keyword that replaced the older `affinity` argument.
final_params = dict(n_clusters=best_k, metric='euclidean', linkage='ward')
best_hc = AgglomerativeClustering(**final_params)
cluster_labels = best_hc.fit(X_scaled).labels_

# Attach each sample's cluster assignment to the DataFrame.
df['cluster'] = cluster_labels

# `display` is only defined when IPython/Jupyter injects it. Fall back to
# print() so these summaries also work under a plain Python interpreter
# instead of raising NameError.
try:
    display
except NameError:
    display = print

# Number of samples assigned to each cluster, ordered by cluster id.
print("군집별 샘플 수 ")
display(df['cluster'].value_counts().sort_index())


# Mean of the target (disease progression) within each cluster.
print("군집별 타깃(질병 진행도) 평균")
display(df.groupby('cluster')['target'].mean())

# Project the standardized features onto their first two principal
# components so the clusters can be drawn in 2-D.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Scatter the projected samples, colored by cluster assignment.
fig, ax = plt.subplots(figsize=(6, 4))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, alpha=0.7)

ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.set_title(f"Diabetes 데이터셋 계층적 군집 (k={best_k}, linkage='ward')")
fig.colorbar(scatter, ax=ax, label="Cluster")
ax.grid(True)
fig.tight_layout()
plt.show()


댓글

이 블로그의 인기 게시물

베이스 캠프에서 (1)

베이스 캠프에서 (2)

Database 분석 (4)