머신 러닝(28) GridSearch RandomSearch

 !pip install catboost koreanize-matplotlib -q


import math
import time
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns

warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Fetch the Pima Indians Diabetes dataset from OpenML (binary classification).
# Requires network access on the first run; fetch_openml caches locally afterwards.
print("Loading Pima Indians Diabetes Dataset...")
pima = fetch_openml(name='diabetes', version=1, as_frame=True)
X = pima.data
# Map the string targets 'tested_negative'/'tested_positive' to integer labels 0/1
y = pima.target.map({'tested_negative': 0, 'tested_positive': 1}).astype(int)

print(f"데이터 크기: {X.shape}")
print(f"피처 목록: {list(X.columns)}")
print(f"\n클래스 분포:\n{y.value_counts()}")
print(f"\n클래스 비율:\n{y.value_counts(normalize=True).round(3)}")

print("\n데이터 샘플:")
print(X.head())

print("\n기술 통계:")
print(X.describe())

# Train/test split; stratify=y keeps the class ratio identical in both splits,
# which matters here because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"\nTrain 클래스 분포:\n{y_train.value_counts()}")

# Baseline models with default hyperparameters.
# FIX: `use_label_encoder` was deprecated in XGBoost 1.3 and removed in 2.0;
# passing it triggers a warning (or an error on recent releases), so it is
# dropped. `eval_metric='logloss'` silences the default-metric warning.
models = {
    'XGBoost': XGBClassifier(
        random_state=42,
        eval_metric='logloss'
    ),
    'LightGBM': LGBMClassifier(
        random_state=42,
        verbose=-1   # suppress per-iteration training logs
    ),
    'CatBoost': CatBoostClassifier(
        random_state=42,
        verbose=0    # suppress per-iteration training logs
    )
}

print("기본 모델 성능 평가 중...\n")
print(f"{'Model':<15} | {'Train Acc':<10} | {'Test Acc':<10} | {'Test F1':<10} | {'Time(s)':<10}")
print("-" * 70)

baseline_results = []

for name, model in models.items():
    # Measure wall-clock training time for the fit only.
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    # Evaluate on both splits; weighted F1 accounts for the class imbalance.
    train_acc = model.score(X_train, y_train)
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred, average='weighted')

    baseline_results.append({
        'Model': name,
        'Train Accuracy': train_acc,
        'Test Accuracy': test_acc,
        'Test F1-Score': test_f1,
        'Time': train_time
    })

    print(f"{name:<15} | {train_acc:<10.4f} | {test_acc:<10.4f} | {test_f1:<10.4f} | {train_time:<10.2f}")

baseline_df = pd.DataFrame(baseline_results)
print("\nBaseline 평가 완료!")

# Parameter grid per model for exhaustive GridSearchCV enumeration.
param_grids = {
    'XGBoost': {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [31, 50],
        'max_depth': [-1, 10, 20],
        'min_child_samples': [20, 30]
    },
    'CatBoost': {
        'iterations': [100, 200],
        'depth': [4, 6, 8],
        'learning_rate': [0.01, 0.05, 0.1],
        'l2_leaf_reg': [1, 3, 5]
    }
}

# Report how many combinations each grid produces: the product of the number
# of candidate values per parameter (math.prod replaces the manual loop).
print("GridSearchCV 파라미터 조합 수:")
for model_name, grid in param_grids.items():
    total_combinations = math.prod(len(values) for values in grid.values())
    print(f"{model_name}: 총 {total_combinations}개 조합")

# Re-instantiate fresh (unfitted) estimators so the search starts from default
# state rather than reusing the already-fitted baseline models.
# FIX: `use_label_encoder` was deprecated in XGBoost 1.3 and removed in 2.0,
# so it is no longer passed.
models = {
    'XGBoost': XGBClassifier(
        random_state=42,
        eval_metric='logloss'
    ),
    'LightGBM': LGBMClassifier(
        random_state=42,
        verbose=-1
    ),
    'CatBoost': CatBoostClassifier(
        random_state=42,
        verbose=0
    )
}
print("\nGridSearchCV 시작...\n")
print(f"{'Model':<15} | {'Best Score':<12} | {'Test Acc':<10} | {'Test F1':<10} | {'Time(s)':<10}")
print("-" * 75)

grid_results = []

for name, model in models.items():
    # Exhaustive search over the full grid: 5-fold CV, accuracy objective,
    # all CPU cores (n_jobs=-1). best_estimator_ is refit on the full train set.
    grid_search = GridSearchCV(
        model,
        param_grids[name],
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0
    )

    # Time the entire search (all CV fits plus the final refit).
    start_time = time.time()
    grid_search.fit(X_train, y_train)
    search_time = time.time() - start_time

    # Evaluate the refitted best estimator on the held-out test set.
    best_score = grid_search.best_score_
    y_pred = grid_search.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred, average='weighted')

    grid_results.append({
        'Model': name,
        'Method': 'GridSearch',
        'Best CV Score': best_score,
        'Test Accuracy': test_acc,
        'Test F1-Score': test_f1,
        'Search Time': search_time,
        'Best Params': grid_search.best_params_
    })

    print(f"{name:<15} | {best_score:<12.4f} | {test_acc:<10.4f} | {test_f1:<10.4f} | {search_time:<10.2f}")

grid_df = pd.DataFrame(grid_results)
print("\nGridSearchCV 완료!")

# Candidate-value pools for RandomizedSearchCV — deliberately wider than the
# GridSearch grids, since random search samples combinations instead of
# enumerating them all.
param_distributions = {
    'XGBoost': dict(
        n_estimators=[100, 200, 300],
        max_depth=[3, 5, 7, 10],
        learning_rate=[0.01, 0.05, 0.1, 0.3],
        subsample=[0.6, 0.8, 1.0],
        colsample_bytree=[0.6, 0.8, 1.0],
    ),
    'LightGBM': dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.05, 0.1, 0.2],
        num_leaves=[31, 50, 70],
        max_depth=[-1, 10, 20, 30],
        min_child_samples=[10, 20, 30],
    ),
    'CatBoost': dict(
        iterations=[100, 200, 300],
        depth=[4, 6, 8, 10],
        learning_rate=[0.01, 0.05, 0.1, 0.2],
        l2_leaf_reg=[1, 3, 5, 7],
    ),
}

print("RandomizedSearchCV 설정: 각 모델당 50회 무작위 탐색")

# Fresh unfitted estimators for the randomized-search run.
# FIX: `use_label_encoder` was deprecated in XGBoost 1.3 and removed in 2.0,
# so it is no longer passed.
models = {
    'XGBoost': XGBClassifier(
        random_state=42,
        eval_metric='logloss'
    ),
    'LightGBM': LGBMClassifier(
        random_state=42,
        verbose=-1
    ),
    'CatBoost': CatBoostClassifier(
        random_state=42,
        verbose=0
    )
}
print("\nRandomizedSearchCV 시작...\n")
print(f"{'Model':<15} | {'Best Score':<12} | {'Test Acc':<10} | {'Test F1':<10} | {'Time(s)':<10}")
print("-" * 75)

random_results = []

for name, model in models.items():
    # Sample 50 parameter combinations at random from the distributions,
    # 5-fold CV each; random_state makes the sampling reproducible.
    random_search = RandomizedSearchCV(
        model,
        param_distributions[name],
        n_iter=50,  # 50 random draws
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        random_state=42,
        verbose=0
    )

    # Time the entire search (all CV fits plus the final refit).
    start_time = time.time()
    random_search.fit(X_train, y_train)
    search_time = time.time() - start_time

    # Evaluate the refitted best estimator on the held-out test set.
    best_score = random_search.best_score_
    y_pred = random_search.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred, average='weighted')

    random_results.append({
        'Model': name,
        'Method': 'RandomSearch',
        'Best CV Score': best_score,
        'Test Accuracy': test_acc,
        'Test F1-Score': test_f1,
        'Search Time': search_time,
        'Best Params': random_search.best_params_
    })

    print(f"{name:<15} | {best_score:<12.4f} | {test_acc:<10.4f} | {test_f1:<10.4f} | {search_time:<10.2f}")

random_df = pd.DataFrame(random_results)
print("\nRandomizedSearchCV 완료!")

# Fold the baseline results into the same schema as the search results so all
# three methods can be stacked into one comparison table.
baseline_df = (
    baseline_df
    .assign(
        Method='Baseline',
        **{
            # No cross-validation was run for the baseline; reuse test accuracy.
            'Best CV Score': baseline_df['Test Accuracy'],
            'Search Time': baseline_df['Time'],
            'Best Params': None,
        },
    )
    .drop(columns='Time')
)

# Stack baseline, grid-search and random-search rows into one frame.
summary_df = pd.concat([baseline_df, grid_df, random_df], ignore_index=True)

print("\n" + "=" * 80)
print("전체 결과 요약")
print("=" * 80)
print(summary_df[['Model', 'Method', 'Test Accuracy', 'Test F1-Score', 'Search Time']].to_string(index=False))

# Relative improvement of each tuned result over the untuned baseline, in %.
improvement_rows = []

for model_name in ('XGBoost', 'LightGBM', 'CatBoost'):
    base_acc = baseline_df.loc[baseline_df['Model'] == model_name, 'Test Accuracy'].iloc[0]
    gs_acc = grid_df.loc[grid_df['Model'] == model_name, 'Test Accuracy'].iloc[0]
    rs_acc = random_df.loc[random_df['Model'] == model_name, 'Test Accuracy'].iloc[0]

    improvement_rows.append({
        'Model': model_name,
        'Baseline Accuracy': base_acc,
        'GridSearch Accuracy': gs_acc,
        'RandomSearch Accuracy': rs_acc,
        'GridSearch Improvement': (gs_acc - base_acc) / base_acc * 100,
        'RandomSearch Improvement': (rs_acc - base_acc) / base_acc * 100,
    })

improvement_df = pd.DataFrame(improvement_rows)

print("\n" + "=" * 80)
print("성능 개선률 (Baseline 대비 %)")
print("=" * 80)
print(improvement_df.to_string(index=False))

# 2x2 figure comparing the three methods across accuracy, F1, search time,
# and relative improvement.
fig, axes = plt.subplots(2, 2, figsize=(8, 6))

# 1. Test accuracy per model/method. Explicit column selection after the pivot
# fixes the bar order so the color list maps Baseline/Grid/Random correctly.
ax1 = axes[0, 0]
pivot_acc = summary_df.pivot_table(values='Test Accuracy', index='Model', columns='Method')
pivot_acc[['Baseline', 'GridSearch', 'RandomSearch']].plot(kind='bar', ax=ax1, color=['lightgray', 'orange', 'lightgreen'])
ax1.set_title('Test Accuracy 비교', fontsize=12, fontweight='bold')
ax1.set_ylabel('Accuracy')
ax1.set_xlabel('')
ax1.legend(title='Method')
ax1.grid(axis='y', alpha=0.3)
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=0)

# 2. Weighted F1-score per model/method (same layout as panel 1).
ax2 = axes[0, 1]
pivot_f1 = summary_df.pivot_table(values='Test F1-Score', index='Model', columns='Method')
pivot_f1[['Baseline', 'GridSearch', 'RandomSearch']].plot(kind='bar', ax=ax2, color=['lightgray', 'orange', 'lightgreen'])
ax2.set_title('F1-Score 비교', fontsize=12, fontweight='bold')
ax2.set_ylabel('F1-Score')
ax2.set_xlabel('')
ax2.legend(title='Method')
ax2.grid(axis='y', alpha=0.3)
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)

# 3. Search time, GridSearch vs RandomSearch only (baseline has no search).
ax3 = axes[1, 0]
pivot_time = summary_df[summary_df['Method'] != 'Baseline'].pivot_table(values='Search Time', index='Model', columns='Method')
pivot_time.plot(kind='bar', ax=ax3, color=['orange', 'lightgreen'])
ax3.set_title('탐색 시간 비교 (GridSearch vs RandomSearch)', fontsize=12, fontweight='bold')
ax3.set_ylabel('Time (seconds)')
ax3.set_xlabel('')
ax3.legend(title='Method')
ax3.grid(axis='y', alpha=0.3)
ax3.set_xticklabels(ax3.get_xticklabels(), rotation=0)

# 4. Percent improvement over baseline; dashed zero line marks "no change".
ax4 = axes[1, 1]
improvement_df[['GridSearch Improvement', 'RandomSearch Improvement']].plot(
    kind='bar', ax=ax4, color=['orange', 'lightgreen']
)
ax4.set_xticklabels(improvement_df['Model'], rotation=0)
ax4.set_title('성능 개선률 (Baseline 대비 %)', fontsize=12, fontweight='bold')
ax4.set_ylabel('Improvement (%)')
ax4.set_xlabel('')
ax4.legend(['GridSearch', 'RandomSearch'])
ax4.grid(axis='y', alpha=0.3)
ax4.axhline(y=0, color='black', linestyle='--', linewidth=0.8)

plt.tight_layout()
plt.show()

print("\n" + "=" * 80)
print("GridSearch vs RandomSearch 최적 하이퍼파라미터 비교")
print("=" * 80)

# For each model, show the winning parameter set from both search strategies,
# then compare their accuracy and total search time.
for model_name in ('XGBoost', 'LightGBM', 'CatBoost'):
    grid_row = grid_df[grid_df['Model'] == model_name].iloc[0]
    random_row = random_df[random_df['Model'] == model_name].iloc[0]

    print(f"\n{'='*60}")
    print(f"[{model_name}]")
    print(f"{'='*60}")

    print("\n GridSearch 최적 파라미터:")
    for param, value in grid_row['Best Params'].items():
        print(f"   {param}: {value}")

    print("\n RandomSearch 최적 파라미터:")
    for param, value in random_row['Best Params'].items():
        print(f"   {param}: {value}")

    # Side-by-side accuracy/time comparison for this model.
    grid_acc, grid_time = grid_row['Test Accuracy'], grid_row['Search Time']
    random_acc, random_time = random_row['Test Accuracy'], random_row['Search Time']

    print(f"\n 성능 비교:")
    print(f"   GridSearch   - Accuracy: {grid_acc:.4f}, Time: {grid_time:.2f}s")
    print(f"   RandomSearch - Accuracy: {random_acc:.4f}, Time: {random_time:.2f}s")
    print(f"   정확도 차이: {abs(grid_acc - random_acc):.4f}")
    print(f"   시간 절감: {((grid_time - random_time) / grid_time * 100):.1f}%")

# Final summary figure: 3x3 grid layout (row 1 spans all columns).
fig = plt.figure(figsize=(16, 10))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# 1. Heatmap of test accuracy for every model x method combination.
ax1 = fig.add_subplot(gs[0, :])
heatmap_data = summary_df.pivot_table(values='Test Accuracy', index='Model', columns='Method')
sns.heatmap(heatmap_data[['Baseline', 'GridSearch', 'RandomSearch']],
            annot=True, fmt='.4f', cmap='YlOrRd', ax=ax1,
            cbar_kws={'label': 'Test Accuracy'})
ax1.set_title('Test Accuracy Heatmap: All Models × Methods', fontsize=14, fontweight='bold')

# 2. Mean test accuracy per method.
# NOTE(review): the colors list is applied positionally to the *sorted* bars,
# not matched to method names — if the tuned methods outscore Baseline, gray
# will not land on Baseline. Verify the intended color-to-method mapping.
ax2 = fig.add_subplot(gs[1, 0])
method_avg = summary_df.groupby('Method')['Test Accuracy'].mean().sort_values(ascending=False)
colors = ['lightgray', 'lightgreen', 'orange']
method_avg.plot(kind='barh', ax=ax2, color=colors)
ax2.set_xlabel('Average Test Accuracy')
ax2.set_title('Average Performance by Method', fontweight='bold')
ax2.grid(axis='x', alpha=0.3)

# 3. Mean test accuracy per model (averaged over the three methods).
ax3 = fig.add_subplot(gs[1, 1])
model_avg = summary_df.groupby('Model')['Test Accuracy'].mean().sort_values(ascending=False)
model_avg.plot(kind='barh', ax=ax3, color='skyblue')
ax3.set_xlabel('Average Test Accuracy')
ax3.set_title('Average Performance by Model', fontweight='bold')
ax3.grid(axis='x', alpha=0.3)

# 4. Mean search time per method, Baseline excluded (it has no search).
# NOTE(review): same positional-color caveat as panel 2 — the green/orange
# assignment assumes RandomSearch sorts first (fastest); confirm.
ax4 = fig.add_subplot(gs[1, 2])
time_data = summary_df[summary_df['Method'] != 'Baseline']
time_avg = time_data.groupby('Method')['Search Time'].mean().sort_values(ascending=True)
time_avg.plot(kind='barh', ax=ax4, color=['lightgreen', 'orange'])
ax4.set_xlabel('Average Time (seconds)')
ax4.set_title('Average Time by Method', fontweight='bold')
ax4.grid(axis='x', alpha=0.3)

# 5. GridSearch accuracy per model (y-axis zoomed to 0.70-0.85 for contrast).
ax5 = fig.add_subplot(gs[2, 0])
grid_data = summary_df[summary_df['Method'] == 'GridSearch']
ax5.bar(grid_data['Model'], grid_data['Test Accuracy'], color='orange', alpha=0.7)
ax5.set_ylabel('Test Accuracy')
ax5.set_title('GridSearch Results', fontweight='bold')
ax5.set_ylim([0.70, 0.85])
ax5.grid(axis='y', alpha=0.3)

# 6. RandomSearch accuracy per model (same zoomed y-axis as panel 5).
ax6 = fig.add_subplot(gs[2, 1])
random_data = summary_df[summary_df['Method'] == 'RandomSearch']
ax6.bar(random_data['Model'], random_data['Test Accuracy'], color='lightgreen', alpha=0.7)
ax6.set_ylabel('Test Accuracy')
ax6.set_title('RandomSearch Results', fontweight='bold')
ax6.set_ylim([0.70, 0.85])
ax6.grid(axis='y', alpha=0.3)

# 7. Percent improvement over baseline; dashed zero line marks "no change".
ax7 = fig.add_subplot(gs[2, 2])
improvement_df[['GridSearch Improvement', 'RandomSearch Improvement']].plot(
    kind='bar', ax=ax7, color=['orange', 'lightgreen']
)
ax7.set_xticklabels(improvement_df['Model'], rotation=0)
ax7.set_ylabel('Improvement (%)')
ax7.set_title('Performance Improvement vs Baseline', fontweight='bold')
ax7.legend(['GridSearch', 'RandomSearch'])
ax7.grid(axis='y', alpha=0.3)
ax7.axhline(y=0, color='black', linestyle='--', linewidth=0.8)

plt.suptitle('Comprehensive Analysis: GridSearch vs RandomSearch (Pima Indians Diabetes)',
             fontsize=16, fontweight='bold', y=0.995)
plt.show()

댓글

이 블로그의 인기 게시물

베이스 캠프에서 (1)

베이스 캠프에서 (2)

Database 분석 (4)