# Machine Learning (26): Optuna Hyperparameter Optimization
# Environment setup: install dependencies (notebook magic) and configure libraries.
!pip install optuna catboost koreanize-matplotlib -q
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
import warnings
import time
warnings.filterwarnings('ignore')
# Optuna
import optuna
from optuna import Trial
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_slice,
    plot_contour
)
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
# Lower Optuna's logging level so trial-by-trial logs are suppressed
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Fix the random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
print(f"Optuna 버전: {optuna.__version__}")
print("Loading Pima Indians Diabetes Dataset...")
pima = fetch_openml(name='diabetes', version=1, as_frame=True)
X = pima.data
y = pima.target.map({'tested_negative': 0, 'tested_positive': 1}).astype(int)
print(f"데이터 크기: {X.shape}")
print(f"피처 목록: {list(X.columns)}")
print(f"\n클래스 분포:\n{y.value_counts()}")
print(f"\n클래스 비율:\n{y.value_counts(normalize=True).round(3)}")
print("\n데이터 샘플:")
print(X.head())
print("\n기술 통계:")
print(X.describe())
# 데이터 분할 (stratify로 클래스 비율 유지)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"\nTrain 클래스 분포:\n{y_train.value_counts()}")
print("\n" + "="*60)
print("Optuna 기본 예제: XGBoost 간단 최적화")
print("="*60)
def objective_simple(trial):
    """Minimal Optuna objective for XGBoost.

    Suggests three hyperparameters via the ``trial`` object and returns the
    mean 3-fold cross-validated accuracy on the training set, which the
    study is configured to maximize.
    """
    # Suggest calls register the search space with Optuna as they run.
    clf = XGBClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        random_state=RANDOM_STATE,
        eval_metric='logloss',
        use_label_encoder=False,
    )
    # Score the candidate configuration with cross-validation.
    scores = cross_val_score(clf, X_train, y_train,
                             cv=3, scoring='accuracy', n_jobs=-1)
    return scores.mean()
# Create an Optuna study (direction='maximize' because the objective returns accuracy)
study_simple = optuna.create_study(
    direction='maximize',  # maximize the score
    study_name='xgboost_simple'
)
# Run the optimization (20 trials)
print("\n최적화 시작 (20회 시도)...")
study_simple.optimize(objective_simple, n_trials=20, show_progress_bar=True)
# Report best score and parameters
print(f"최고 정확도: {study_simple.best_value:.4f}")
print(f"\n최적 하이퍼파라미터:")
for param, value in study_simple.best_params.items():
    print(f" {param}: {value}")
print("\n" + "="*60)
print("XGBoost Optuna 최적화")
print("="*60)
def objective_xgboost(trial):
    """Full Optuna objective for XGBoost.

    Samples nine hyperparameters (learning_rate on a log scale) and returns
    the mean 5-fold cross-validated accuracy on the training set.
    """
    # Searched hyperparameters — each suggest call defines the search space.
    space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
    }
    # Fixed (non-searched) settings go alongside the sampled ones.
    estimator = XGBClassifier(
        random_state=RANDOM_STATE,
        eval_metric='logloss',
        use_label_encoder=False,
        **space,
    )
    fold_scores = cross_val_score(estimator, X_train, y_train,
                                  cv=5, scoring='accuracy', n_jobs=-1)
    return fold_scores.mean()
# Create the study and run the 50-trial XGBoost search
study_xgb = optuna.create_study(
    direction='maximize',
    study_name='xgboost_full'
)
print("\n최적화 시작 (50회 시도)...")
start_time = time.time()
study_xgb.optimize(objective_xgboost, n_trials=50, show_progress_bar=True)
xgb_time = time.time() - start_time  # wall-clock time, reused in the comparison table
print(f"\n XGBoost 최적화 완료! (소요 시간: {xgb_time:.2f}초)")
print(f"최고 CV 점수: {study_xgb.best_value:.4f}")
print(f"\n최적 하이퍼파라미터:")
for param, value in study_xgb.best_params.items():
    print(f" {param}: {value}")
print("\n" + "="*60)
print("LightGBM Optuna 최적화")
print("="*60)
def objective_lgbm(trial):
    """Optuna objective for LightGBM: mean 5-fold CV accuracy on the train set."""
    # Sample each searched hyperparameter into its own local.
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    num_leaves = trial.suggest_int('num_leaves', 20, 100)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 50)
    subsample = trial.suggest_float('subsample', 0.6, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 1)
    reg_lambda = trial.suggest_float('reg_lambda', 0, 1)
    booster = LGBMClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        max_depth=max_depth,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        random_state=RANDOM_STATE,
        verbose=-1,  # silence LightGBM's per-fit logging
    )
    return cross_val_score(booster, X_train, y_train,
                           cv=5, scoring='accuracy', n_jobs=-1).mean()
# Create the study and run the 50-trial LightGBM search
study_lgbm = optuna.create_study(
    direction='maximize',
    study_name='lightgbm_full'
)
print("\n최적화 시작 (50회 시도)...")
start_time = time.time()
study_lgbm.optimize(objective_lgbm, n_trials=50, show_progress_bar=True)
lgbm_time = time.time() - start_time  # wall-clock time, reused in the comparison table
print(f"\n LightGBM 최적화 완료! (소요 시간: {lgbm_time:.2f}초)")
print(f"최고 CV 점수: {study_lgbm.best_value:.4f}")
print(f"\n최적 하이퍼파라미터:")
for param, value in study_lgbm.best_params.items():
    print(f" {param}: {value}")
print("\n" + "="*60)
print("CatBoost Optuna 최적화")
print("="*60)
def objective_catboost(trial):
    """Optuna objective for CatBoost: mean 5-fold CV accuracy (maximized)."""
    # Searched hyperparameters, built with dict() so they can be splatted below.
    searched = dict(
        iterations=trial.suggest_int('iterations', 50, 300),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        depth=trial.suggest_int('depth', 3, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
        border_count=trial.suggest_int('border_count', 32, 255),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0, 1),
        random_strength=trial.suggest_float('random_strength', 0, 10),
    )
    model = CatBoostClassifier(random_state=RANDOM_STATE, verbose=0, **searched)
    accuracy = cross_val_score(model, X_train, y_train,
                               cv=5, scoring='accuracy', n_jobs=-1).mean()
    return accuracy
# Create the study and run the 50-trial CatBoost search
study_cat = optuna.create_study(
    direction='maximize',
    study_name='catboost_full'
)
print("\n최적화 시작 (50회 시도)...")
start_time = time.time()
study_cat.optimize(objective_catboost, n_trials=50, show_progress_bar=True)
cat_time = time.time() - start_time  # wall-clock time, reused in the comparison table
print(f"\n CatBoost 최적화 완료! (소요 시간: {cat_time:.2f}초)")
print(f"최고 CV 점수: {study_cat.best_value:.4f}")
print(f"\n최적 하이퍼파라미터:")
for param, value in study_cat.best_params.items():
    print(f" {param}: {value}")
# Optimization-history panels for the three studies.
# The three subplots were copy-pasted and identical apart from the study,
# line color and title, so draw them in one loop instead (DRY).
fig, axes = plt.subplots(1, 3, figsize=(12, 5))
history_panels = [
    (study_xgb, 'XGBoost', 'C0'),       # 'C0' = matplotlib's default blue
    (study_lgbm, 'LightGBM', 'green'),
    (study_cat, 'CatBoost', 'orange'),
]
for ax, (study, label, color) in zip(axes, history_panels):
    trials = study.trials_dataframe()
    # One point per trial; the dashed red line marks the best score found.
    ax.plot(trials['number'], trials['value'], marker='o', markersize=3, color=color)
    ax.axhline(y=study.best_value, color='r', linestyle='--',
               label=f'Best: {study.best_value:.4f}')
    ax.set_xlabel('Trial')
    ax.set_ylabel('Accuracy')
    ax.set_title(f'{label} Optimization History')
    ax.legend()
    ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()
# Hyperparameter-importance plots (plotly figures) for each study.
# Same three-line call pattern for every model -> one loop.
for label, study in [('XGBoost', study_xgb),
                     ('LightGBM', study_lgbm),
                     ('CatBoost', study_cat)]:
    fig = plot_param_importances(study)
    fig.update_layout(title=f"{label} - Parameter Importances", height=500)
    fig.show()
print("\n" + "="*80)
print("Optuna 최적 모델 테스트 세트 평가")
print("="*80)

def _evaluate_best(model, label, cv_score, elapsed):
    """Fit `model` on the training set and return one comparison-table row
    with its test-set accuracy and weighted F1 score."""
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return {
        'Model': label,
        'Method': 'Optuna',
        'CV Score': cv_score,
        'Test Accuracy': accuracy_score(y_test, pred),
        'Test F1': f1_score(y_test, pred, average='weighted'),
        'Time': elapsed
    }

optuna_results = []
# Rebuild each model from its study's best parameters; the evaluation code
# was triplicated, so the per-model logic now lives in _evaluate_best.
best_models = [
    (XGBClassifier(**study_xgb.best_params, random_state=RANDOM_STATE,
                   eval_metric='logloss', use_label_encoder=False),
     'XGBoost', study_xgb, xgb_time),
    (LGBMClassifier(**study_lgbm.best_params, random_state=RANDOM_STATE, verbose=-1),
     'LightGBM', study_lgbm, lgbm_time),
    (CatBoostClassifier(**study_cat.best_params, random_state=RANDOM_STATE, verbose=0),
     'CatBoost', study_cat, cat_time),
]
for i, (model, label, study, elapsed) in enumerate(best_models):
    row = _evaluate_best(model, label, study.best_value, elapsed)
    optuna_results.append(row)
    # The original printed a leading newline only before the first model.
    prefix = "\n" if i == 0 else ""
    print(f"{prefix}{label} - Test Accuracy: {row['Test Accuracy']:.4f}, F1: {row['Test F1']:.4f}")
optuna_df = pd.DataFrame(optuna_results)
print("\n" + "="*80)
print(optuna_df.to_string(index=False))
print("\n" + "="*80)
print("GridSearchCV 비교 (제한적 범위)")
print("="*80)

def _run_grid_search(estimator, param_grid, label):
    """Run an exhaustive GridSearchCV for `estimator`, time it, evaluate the
    best model on the held-out test set, and return one comparison-table row."""
    t0 = time.time()
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy',
                          n_jobs=-1, verbose=0)
    search.fit(X_train, y_train)
    elapsed = time.time() - t0
    pred = search.predict(X_test)  # refit=True default: predicts with the best model
    return {
        'Model': label,
        'Method': 'GridSearch',
        'CV Score': search.best_score_,
        'Test Accuracy': accuracy_score(y_test, pred),
        'Test F1': f1_score(y_test, pred, average='weighted'),
        'Time': elapsed
    }

grid_results = []
# Grids are deliberately small: GridSearch cost grows multiplicatively per parameter.
print("\nXGBoost GridSearch...")
grid_results.append(_run_grid_search(
    XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss', use_label_encoder=False),
    {'n_estimators': [100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1]},
    'XGBoost'))
print("LightGBM GridSearch...")
grid_results.append(_run_grid_search(
    LGBMClassifier(random_state=RANDOM_STATE, verbose=-1),
    {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'num_leaves': [31, 50]},
    'LightGBM'))
print("CatBoost GridSearch...")
grid_results.append(_run_grid_search(
    CatBoostClassifier(random_state=RANDOM_STATE, verbose=0),
    {'iterations': [100, 200], 'learning_rate': [0.01, 0.1], 'depth': [4, 6]},
    'CatBoost'))
grid_df = pd.DataFrame(grid_results)
print("\n" + "="*80)
print(grid_df.to_string(index=False))
print("\n" + "="*80)
print("RandomizedSearchCV 비교 (50회 시도)")
print("="*80)

def _run_random_search(estimator, param_dist, label):
    """Run a 50-iteration RandomizedSearchCV for `estimator`, time it, evaluate
    the best model on the held-out test set, and return one comparison row."""
    t0 = time.time()
    search = RandomizedSearchCV(estimator, param_dist, n_iter=50, cv=5,
                                scoring='accuracy', n_jobs=-1,
                                random_state=RANDOM_STATE, verbose=0)
    search.fit(X_train, y_train)
    elapsed = time.time() - t0
    pred = search.predict(X_test)  # refit=True default: predicts with the best model
    return {
        'Model': label,
        'Method': 'RandomSearch',
        'CV Score': search.best_score_,
        'Test Accuracy': accuracy_score(y_test, pred),
        'Test F1': f1_score(y_test, pred, average='weighted'),
        'Time': elapsed
    }

random_results = []
# The three searches shared the same triplicated code; only estimator and
# candidate distributions differ, so each is one helper call now.
print("\nXGBoost RandomSearch...")
random_results.append(_run_random_search(
    XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss', use_label_encoder=False),
    {'n_estimators': [50, 100, 200, 300],
     'max_depth': [3, 5, 7, 10, 15],
     'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
     'subsample': [0.6, 0.8, 1.0],
     'colsample_bytree': [0.6, 0.8, 1.0]},
    'XGBoost'))
print("LightGBM RandomSearch...")
random_results.append(_run_random_search(
    LGBMClassifier(random_state=RANDOM_STATE, verbose=-1),
    {'n_estimators': [50, 100, 200, 300],
     'learning_rate': [0.01, 0.05, 0.1, 0.2],
     'num_leaves': [20, 31, 50, 70],
     'max_depth': [3, 5, 10, 15],
     'min_child_samples': [5, 10, 20, 30]},
    'LightGBM'))
print("CatBoost RandomSearch...")
random_results.append(_run_random_search(
    CatBoostClassifier(random_state=RANDOM_STATE, verbose=0),
    {'iterations': [50, 100, 200, 300],
     'learning_rate': [0.01, 0.05, 0.1, 0.2],
     'depth': [3, 4, 6, 8, 10],
     'l2_leaf_reg': [1, 3, 5, 7, 10]},
    'CatBoost'))
random_df = pd.DataFrame(random_results)
print("\n" + "="*80)
print(random_df.to_string(index=False))
# Combine the three comparison tables into a single DataFrame
all_results = pd.concat([optuna_df, grid_df, random_df], ignore_index=True)
print("\n" + "="*80)
print("전체 결과 비교: Optuna vs GridSearch vs RandomSearch")
print("="*80)
print(all_results.to_string(index=False))
# Average performance per search method
print("\n" + "="*80)
print("방법별 평균 성능")
print("="*80)
method_avg = all_results.groupby('Method').agg({
    'CV Score': 'mean',
    'Test Accuracy': 'mean',
    'Test F1': 'mean',
    'Time': 'mean'
}).round(4)
print(method_avg)
# Best performance per model, regardless of search method
print("\n" + "="*80)
print("모델별 최고 성능 (방법 무관)")
print("="*80)
for model in ['XGBoost', 'LightGBM', 'CatBoost']:
    model_data = all_results[all_results['Model'] == model]
    # idxmax returns the row label within all_results, so .loc resolves it directly
    best_idx = model_data['Test Accuracy'].idxmax()
    best = all_results.loc[best_idx]
    print(f"\n{model}:")
    print(f" 최고 방법: {best['Method']}")
    print(f" Test Accuracy: {best['Test Accuracy']:.4f}")
    print(f" 소요 시간: {best['Time']:.2f}초")
# Summary figure: three grouped-bar comparisons plus an average-performance bar.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
method_order = ['Optuna', 'GridSearch', 'RandomSearch']
bar_colors = ['#FF6B6B', '#4ECDC4', '#95E1D3']
# Panels 1-3 share the same pivot -> grouped-bar structure; only the value
# column, title, y-label and legend placement differ, so draw them in a loop.
panel_specs = [
    ('Test Accuracy', 'Test Accuracy 비교', 'Accuracy', axes[0, 0],
     {'title': 'Method', 'loc': 'lower right'}),
    ('CV Score', 'CV Score 비교', 'CV Score', axes[0, 1],
     {'title': 'Method', 'loc': 'lower right'}),
    ('Time', '탐색 시간 비교', 'Time (seconds)', axes[1, 0],
     {'title': 'Method'}),
]
for value_col, title, ylabel, ax, legend_kwargs in panel_specs:
    pivot = all_results.pivot_table(values=value_col, index='Model', columns='Method')
    pivot[method_order].plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel(ylabel)
    ax.set_xlabel('')
    ax.legend(**legend_kwargs)
    ax.grid(axis='y', alpha=0.3)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
# Panel 4: average test accuracy per method, sorted best-first.
ax4 = axes[1, 1]
method_avg_acc = all_results.groupby('Method')['Test Accuracy'].mean().sort_values(ascending=False)
# NOTE(review): these colors are positional after sorting, so they do not map
# to a fixed method — kept as in the original; verify this is intended.
colors_avg = ['#FF6B6B', '#95E1D3', '#4ECDC4']
method_avg_acc.plot(kind='barh', ax=ax4, color=colors_avg)
ax4.set_xlabel('Average Test Accuracy')
ax4.set_title('방법별 평균 성능', fontweight='bold')
ax4.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
# (blog comment-section footer removed from the export)