# 머신 러닝 (24) XGBoost Classification  (Machine Learning (24): XGBoost Classification)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import accuracy_score, classification_report
# Load the Pima Indians Diabetes dataset from OpenML as pandas objects.
print("Loading Pima Indians Diabetes Dataset...")
dataset = fetch_openml(name='diabetes', version=1, as_frame=True)
X = dataset.data
# Encode the string target as binary integers (0 = negative, 1 = positive).
label_map = {'tested_negative': 0, 'tested_positive': 1}
y = dataset.target.map(label_map).astype(int)
print(f"데이터 크기: {X.shape}")
print(f"피처 목록: {list(X.columns)}")
print(f"\n클래스 분포:\n{y.value_counts()}")
# Quick look at the first rows and the summary statistics.
print("\n데이터 샘플:")
print(X.head())
print("\n기술 통계:")
print(X.describe())
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"\nTrain 클래스 분포:\n{y_train.value_counts()}")
# XGBoost Classifier
# NOTE: `use_label_encoder=False` was removed here — the parameter was
# deprecated in xgboost 1.3 and dropped entirely in 2.0, and the target is
# already integer-encoded above, so it did nothing but emit a warning.
xgb_clf = XGBClassifier(
    random_state=42,
    eval_metric='logloss'
)
print("XGBoost Classifier 생성")
print(f"기본 설정: n_estimators=100, learning_rate=0.3, max_depth=6")
# Hyperparameter search space for GridSearchCV.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
print("파라미터 그리드:")
for name, candidates in param_grid.items():
    print(f" {name}: {candidates}")
# The grid size is the product of the candidate counts per parameter.
total_combinations = 1
for candidates in param_grid.values():
    total_combinations *= len(candidates)
print(f"\n총 조합 수: {total_combinations}개")
# Exhaustive grid search with stratified 5-fold cross-validation.
print("\nTuning XGBoost Classifier...")
print("(교차검증 수행 중...)\n")
cv_strategy = StratifiedKFold(5)
grid_clf = GridSearchCV(
    estimator=xgb_clf,      # the XGBoost model (an estimator or a Pipeline)
    param_grid=param_grid,
    cv=cv_strategy,
    scoring='accuracy',
    n_jobs=-1,              # use every available core
    verbose=1,
)
grid_clf.fit(X_train, y_train)
print("\nGridSearchCV 완료!")
# Pull out the best-scoring estimator found by the search.
best_clf = grid_clf.best_estimator_
print("[최적화 결과]")
print(f"Best Accuracy (Train CV): {grid_clf.best_score_:.4f}")
print(f"\nBest Parameters:")
for name, value in grid_clf.best_params_.items():
    print(f" {name}: {value}")
# Show the five best parameter combinations from the full CV results table.
cv_results = pd.DataFrame(grid_clf.cv_results_)
display_cols = [
    'param_n_estimators',
    'param_learning_rate',
    'param_max_depth',
    'param_subsample',
    'param_colsample_bytree',
    'mean_test_score',
    'std_test_score',
]
top_results = cv_results.nlargest(5, 'mean_test_score')[display_cols]
print("\n상위 5개 파라미터 조합:")
print(top_results.to_string(index=False))
# Evaluate the tuned model on the held-out test split.
y_pred = best_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\n--- Classification Report ---")
# Per-class precision / recall / F1 breakdown.
print(classification_report(y_test, y_pred))
from sklearn.metrics import confusion_matrix
# Confusion-matrix heatmap for the test-set predictions.
cm = confusion_matrix(y_test, y_pred)
class_names = ['No Diabetes', 'Diabetes']
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,          # write the counts inside each cell
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.show()
# XGBoost plot_importance
# BUG FIX: plot_importance() creates its *own* figure when no Axes is passed,
# so the original plt.figure(figsize=(10, 8)) produced a blank extra figure
# and the figsize was ignored. Create the Axes explicitly and hand it over.
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(best_clf, ax=ax, max_num_features=10,
                importance_type='gain', height=0.5)
ax.set_title("XGBoost Feature Importance (Gain)")
plt.tight_layout()
plt.show()
# Extract feature importances from the tuned model and rank them.
importances = best_clf.feature_importances_
feature_names = X.columns
indices = np.argsort(importances)[::-1]  # descending by importance
# Ranked table of features and their importance scores.
importance_df = pd.DataFrame({
    'Feature': feature_names[indices],
    'Importance': importances[indices]
})
print("\n피처 중요도 순위:")
print(importance_df.to_string(index=False))
# Horizontal bar chart of the ranked importances.
# FIX: seaborn deprecated passing `palette` without `hue` (FutureWarning);
# assign the y variable to `hue` and suppress the redundant legend.
plt.figure(figsize=(6, 4))
sns.barplot(
    x=importances[indices],
    y=feature_names[indices],
    hue=feature_names[indices],
    palette='viridis',
    legend=False,
)
plt.title("Feature Importances (XGBoost)")
plt.xlabel("Importance (Gain)")
plt.ylabel("Features")
plt.tight_layout()
plt.show()
# Early Stopping: retrain with the best-found parameters, monitoring a
# validation set and halting once logloss fails to improve for 10 rounds.
# NOTE: `use_label_encoder=False` removed — deprecated in xgboost 1.3,
# dropped in 2.0, and the target is already integer-encoded.
xgb_early = XGBClassifier(
    **grid_clf.best_params_,
    early_stopping_rounds=10,
    random_state=42,
    eval_metric='logloss'
)
# Carve a validation split out of the training data (class ratio preserved).
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)
# Fit with eval_set so validation logloss drives the early stopping.
xgb_early.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=False
)
print(f"\nEarly Stopping:")
# best_iteration is 0-based, hence the +1 to report a tree count.
print(f"최적 트리 개수: {xgb_early.best_iteration + 1}")
print(f"설정 트리 개수: {grid_clf.best_params_['n_estimators']}")
print(f"Test Accuracy: {xgb_early.score(X_test, y_test):.4f}")
# 댓글 (blog page footer: "comments" — extraction residue, not code)
# 댓글 쓰기 (blog page footer: "write a comment" — extraction residue, not code)