# 머신 러닝 (12) DBSCAN Clustering
# wine 데이터셋을 이용한 DBSCAN 클러스터링
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
# Load the wine dataset and wrap its feature matrix in a DataFrame whose
# columns carry the dataset's feature names.
wine = load_wine()
X = pd.DataFrame(data=wine.data, columns=wine.feature_names)

# Report the raw (unscaled) data: its shape and the first few rows.
print(f"데이터 크기: {X.shape}")
print(f"\n데이터 샘플:\n{X.head()}")

# Standardise the features with a default StandardScaler. Fitting and
# transforming are written as two explicit steps on the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Print the share of variance explained by each kept component, then the
# combined share formatted as a percentage.
variance_ratio = pca.explained_variance_ratio_
print(f"설명된 분산 비율: {variance_ratio}")
print(f"총 설명된 분산: {sum(variance_ratio):.2%}")
# K-distance plot: a visual aid for picking DBSCAN's eps on the PCA projection.

# Build and fit a 5-nearest-neighbour index; fit() hands back the estimator
# itself, so both names refer to the same fitted object.
neighbors = neighbors_fit = NearestNeighbors(n_neighbors=5).fit(X_pca)

# Query the index with the very points it was fitted on.
knn_distances, indices = neighbors_fit.kneighbors(X_pca)

# Keep the last of the 5 neighbour columns and sort it in ascending order.
# NOTE(review): because the query set equals the fitted set, each point is
# presumably returned as its own nearest neighbour (distance 0), so this
# column would be the 4th *other* point -- confirm whether that is intended.
distances = np.sort(knn_distances[:, -1])

plt.figure(figsize=(6, 3))
plt.plot(distances)
plt.xlabel("Points sorted by distance")
plt.ylabel("Distance to 5th Nearest Neighbor")
plt.title("K-Distance Plot (Wine Dataset)")
plt.grid(True)

# Tell the reader what to look for before the figure window opens.
print("그래프가 급격히 꺾이는 지점(Elbow)의 Y축 값을 확인하세요.")
plt.show()
# DBSCAN 모델 적용
# Run DBSCAN on the 2-D PCA projection and keep one label per sample.
dbscan = DBSCAN(eps=0.55, min_samples=5)
clusters = dbscan.fit_predict(X_pca)

# Collect the distinct labels together with how many samples carry each one
# (label -1 is DBSCAN's marker for noise points).
unique_labels, label_counts = np.unique(clusters, return_counts=True)
print(f"생성된 클러스터 라벨: {unique_labels}")

# Report the size of every group, wording the noise group differently.
for label, count in zip(unique_labels, label_counts):
    if label != -1:
        print(f"클러스터 {label}: {count}개")
    else:
        print(f"노이즈(Outlier): {count}개")
# 결과 시각화
# Scatter plot of the DBSCAN result in the 2-D PCA plane, one series per label.
plt.figure(figsize=(6, 4))

# One viridis colour per entry of unique_labels; the colour assigned to the
# noise label is replaced by black inside the loop.
colors = plt.cm.viridis(np.linspace(0, 1, len(unique_labels)))

for label, color in zip(unique_labels, colors):
    if label == -1:
        # Noise (outliers): drawn as semi-transparent black 'x' marks.
        color = 'k'
        label_name = "Noise (-1)"
        marker = 'x'
        alpha = 0.5
        # 'x' is an unfilled marker: Matplotlib ignores edgecolors for it and
        # warns when one is passed, so leave the edge colour unset here.
        edgecolors = None
    else:
        # Regular cluster: filled circles with a black outline.
        label_name = f"Cluster {label}"
        marker = 'o'
        alpha = 0.7
        edgecolors = 'k'

    # Select the projected points that belong to the current label.
    class_member_mask = (clusters == label)
    xy = X_pca[class_member_mask]

    # c is wrapped in a list so a single RGBA row is treated as one colour
    # for the whole series rather than as per-point values.
    plt.scatter(
        xy[:, 0], xy[:, 1],
        c=[color],
        label=label_name,
        s=60, edgecolors=edgecolors, alpha=alpha, marker=marker,
    )

plt.title('DBSCAN Clustering Results (Wine Dataset + PCA)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid(True)
plt.show()
댓글
댓글 쓰기