# 머신 러닝 (10) 비지도 학습: PCA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
# Load the diabetes dataset and separate the features (X) from the target (y).
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
# Standardize the features before PCA: learn the scaling parameters from X,
# then apply them to the same data.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
print(f"스케일링된 데이터 형태: {X_scaled.shape}")
# Step 1: fit PCA with every component retained (no reduction yet) so the
# explained variance of each component can be inspected. The original script
# fitted this identical full model twice; it is fitted once here and reused.
pca_full = PCA(n_components=X_scaled.shape[1])
X_pca = pca_full.fit_transform(X_scaled)
print(f"PCA 변환 후 차원: {X_pca.shape}")
print(f"주성분별 설명된 분산 비율: {pca_full.explained_variance_ratio_}")
print(f"총 설명된 분산 비율: {np.sum(pca_full.explained_variance_ratio_):.4f}")
# Per-component explained-variance ratio and its running (cumulative) sum;
# these two arrays are the inputs of the scree plot.
explained_variance = pca_full.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)
# Step 2: reduce the data to the first 5 principal components.
# From here on `pca` / `X_pca` refer to this 5-component model and projection.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)
print(f"PCA 변환 후 차원: {X_pca.shape}")
print(f"주성분별 설명된 분산 비율: {pca.explained_variance_ratio_}")
print(f"총 설명된 분산 비율: {np.sum(pca.explained_variance_ratio_):.4f}")
# Scree plot: the bars show each component's own explained-variance ratio,
# the step line shows the cumulative ratio.
component_index = range(1, len(explained_variance) + 1)
plt.figure(figsize=(8, 5))
plt.bar(
    component_index,
    explained_variance,
    alpha=0.5,
    align='center',
    label='Individual explained variance',
)
plt.step(
    component_index,
    cumulative_variance,
    where='mid',
    label='Cumulative explained variance',
)
plt.title('Scree Plot')
plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.legend(loc='best')
plt.grid(True)
plt.show()
# Scatter the samples on the first two principal components of X_pca.
# Points are colored by y, the target returned by load_diabetes(). That target
# is a continuous value (hence the colorbar), not a binary 0/1 outcome label
# as the previous comment claimed.
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(6, 4))
scatter = plt.scatter(pc1, pc2, c=y, cmap='viridis', alpha=0.7, edgecolors='k')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Diabetes Dataset')
plt.colorbar(scatter, label='Diabetes Target Value')
plt.grid(True)
plt.show()
# 댓글
# 댓글 쓰기