# 머신 러닝 (21) SVC — Machine Learning (21): SVC
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
# Fetch the Pima Indians Diabetes data set from OpenML as a pandas DataFrame.
print("Loading Pima Indians Diabetes Dataset...")
dataset = fetch_openml(name='diabetes', version=1, as_frame=True)
X = dataset.data
# Encode the string labels as integers: 'tested_negative' -> 0, 'tested_positive' -> 1.
label_map = {'tested_negative': 0, 'tested_positive': 1}
y = dataset.target.map(label_map).astype(int)  # dataset.target is a pandas Series
print(f"데이터 크기: {X.shape}")
print(f"피처 목록: {list(X.columns)}")
print(f"\n클래스 분포:\n{y.value_counts()}")
# Hold out 20% for testing; stratify preserves the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Linear-kernel SVC whose learned coefficients drive recursive feature elimination.
selector_estimator = SVC(kernel="linear", random_state=42)
# RFECV: drop one feature per round, scoring each subset with 5-fold stratified CV accuracy.
rfecv = RFECV(
    estimator=selector_estimator,
    step=1,
    cv=StratifiedKFold(5),
    scoring='accuracy',
    min_features_to_select=1,
)
# Full pipeline: standardize -> select features -> classify with an RBF-kernel SVC.
pipe_svc = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', rfecv),
    ('classifier', SVC(kernel='rbf', random_state=42)),
])
print("Pipeline 구성 완료")
print("\nPipeline 구조:")
print(pipe_svc)
# Fit the pipeline; RFECV runs cross-validation internally, so this step is slow.
print("Training SVC Pipeline with RFECV...")
print("(교차검증 수행 중... 시간이 소요됩니다)")
pipe_svc.fit(X_train, y_train)
print("\n학습 완료!")

# Pull the fitted feature-selection step out of the pipeline for inspection.
fs_step = pipe_svc.named_steps['feature_selection']
n_features = fs_step.n_features_
selected_features = X.columns[fs_step.support_].tolist()
print("[결과 진단]")
print(f"알고리즘이 선택한 최적의 피처 개수: {n_features}개")
print(f"\n선택된 피처 목록:")
for position, feature_name in enumerate(selected_features, start=1):
    print(f" {position}. {feature_name}")

# Per-feature elimination ranking; rank 1 means the feature was retained.
feature_ranking = pd.DataFrame({
    'Feature': X.columns,
    'Rank': fs_step.ranking_,
    'Selected': fs_step.support_,
}).sort_values('Rank')
print("\n피처별 순위 (1이 선택됨):")
print(feature_ranking)

# Final evaluation on the held-out test split.
y_pred = pipe_svc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))
print("선형 SVC 생성 완료 (피처 선택용)")
# NOTE(review): the print above reads like a leftover from an earlier cell ordering
# (it announces creating the selector SVC after training finished) — confirm intent.
# Baseline pipeline: identical scaling and RFECV selection, but a linear-kernel classifier.
pipe_linear_only = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', RFECV(
        estimator=SVC(kernel="linear", random_state=42),
        step=1,
        cv=StratifiedKFold(5),
        scoring='accuracy',
        min_features_to_select=1,
    )),
    ('classifier', SVC(kernel='linear', random_state=42)),
])
print("선형 커널 전용 Pipeline 학습 중...")
pipe_linear_only.fit(X_train, y_train)
y_pred_linear = pipe_linear_only.predict(X_test)
accuracy_linear = accuracy_score(y_test, y_pred_linear)

# Head-to-head accuracy comparison of the two kernel strategies.
print("\n커널 전략 비교:")
print(f"Linear 전용: Accuracy = {accuracy_linear:.4f}")
print(f"Linear(선택) + RBF(예측): Accuracy = {accuracy:.4f}")
print(f"\n정확도 향상: {accuracy - accuracy_linear:+.4f}")
# Plot cross-validation accuracy as a function of how many features RFECV kept.
plt.figure(figsize=(10, 6))
plt.xlabel("Number of Features Selected")
plt.ylabel("Cross Validation Score (Accuracy)")
cv_scores = fs_step.cv_results_['mean_test_score']
feature_counts = range(1, len(cv_scores) + 1)
plt.plot(feature_counts, cv_scores, marker='o', linewidth=2, markersize=8)
# Index i of cv_scores holds the score for i+1 features, because
# min_features_to_select=1 and step=1 — so the optimum sits at n_features - 1.
optimal_score = cv_scores[n_features - 1]
plt.axvline(x=n_features, color='r', linestyle='--',
            label=f'Optimal: {n_features} features')
plt.scatter([n_features], [optimal_score],
            color='red', s=200, zorder=5, marker='*')
plt.title("RFECV (Linear SVC): Accuracy vs Number of Features")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Tabulate mean/std CV score for every feature-subset size.
cv_results_df = pd.DataFrame({
    'Num_Features': feature_counts,
    'Mean_Score': cv_scores,
    'Std_Score': fs_step.cv_results_['std_test_score'],
})
print("\n피처 개수별 교차검증 점수:")
print(cv_results_df)
print(f"\n최고 점수: {cv_results_df['Mean_Score'].max():.4f}")
print(f"최적 선택: {n_features}개 피처 (Score: {optimal_score:.4f})")
# 댓글 (Comments)
# 댓글 쓰기 (Write a comment)