머신 러닝 (22) Random Forest Regression

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

# Load the scikit-learn diabetes dataset: 10 standardized clinical features
# and a continuous disease-progression target.
diabetes = load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = diabetes.target  # numpy array of progression scores

print(f"데이터 크기: {X.shape}")
print(f"피처 목록: {list(diabetes.feature_names)}")
print(f"\n타겟 통계:")
print(f"  평균: {y.mean():.2f}")
print(f"  범위: {y.min():.0f} ~ {y.max():.0f}")

# Peek at the first few rows of the feature matrix
print("\n데이터 샘플:")
print(X.head())

# Per-feature summary statistics (mean, std, quartiles, ...)
print("\n기술 통계:")
print(X.describe())

# Hold out 20% of the rows as a test set; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"\nTrain 타겟 통계:")
print(f"  평균: {y_train.mean():.2f}")
print(f"  표준편차: {y_train.std():.2f}")

# Base Random Forest regressor; its hyperparameters are tuned by the
# GridSearchCV below (fixed seed so results are reproducible).
rf_reg = RandomForestRegressor(random_state=42)

# Hyperparameter search space for the forest.
param_grid = {
    'n_estimators': [100, 200],             # number of trees
    'max_depth': [5, 10, None],             # None = grow until leaves are pure
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2', None]  # features considered per split
}

print("Random Forest Regressor 생성")
print(f"\n파라미터 그리드:")
for param, values in param_grid.items():
    print(f"  {param}: {values}")

# Size of the Cartesian product over all candidate value lists.
total_combinations = 1
for candidates in param_grid.values():
    total_combinations *= len(candidates)
print(f"\n총 조합 수: {total_combinations}개")

# Exhaustive grid search with 5-fold shuffled cross-validation, scored by R^2.
print("Tuning RandomForest Regressor...")
print("(교차검증 수행 중...)\n")

cv_splitter = KFold(5, shuffle=True, random_state=42)
grid_reg = GridSearchCV(
    estimator=rf_reg,
    param_grid=param_grid,
    cv=cv_splitter,
    scoring='r2',
    n_jobs=-1,   # use every available CPU core
    verbose=1
)

grid_reg.fit(X_train, y_train)
print("\nGridSearchCV 완료!")

# Pull out the refit best estimator and summarize the tuning results.
best_reg = grid_reg.best_estimator_

print("[최적화 결과]")
print(f"Best R2 Score (Train CV): {grid_reg.best_score_:.4f}")
print(f"\nBest Parameters:")
for param, value in grid_reg.best_params_.items():
    print(f"  {param}: {value}")

# Columns of the CV results table worth reporting.
report_columns = [
    'param_n_estimators',
    'param_max_depth',
    'param_min_samples_split',
    'param_max_features',
    'mean_test_score',
    'std_test_score',
]
search_results = pd.DataFrame(grid_reg.cv_results_)
leaderboard = search_results.nlargest(5, 'mean_test_score')[report_columns]

print("\n상위 5개 파라미터 조합:")
print(leaderboard.to_string(index=False))

# Evaluate the tuned model on the held-out test set.
y_pred = best_reg.predict(X_test)

# RMSE / R^2 / MAE. MAE now uses sklearn's mean_absolute_error instead of
# the hand-rolled np.mean(np.abs(...)) — same value, standard idiom.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\n[테스트 성능]")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R2 Score: {r2:.4f}")
print(f"Test MAE: {mae:.4f}")

# Scatter actual vs. predicted values with the ideal y = x reference line.
plt.figure(figsize=(6, 4))
plt.scatter(y_test, y_pred, alpha=0.6, edgecolors='k')
axis_lo = y_test.min()
axis_hi = y_test.max()
plt.plot([axis_lo, axis_hi], [axis_lo, axis_hi],
         'r--', lw=2, label='Perfect Prediction')
plt.title(f'Actual vs Predicted (R2={r2:.4f}, RMSE={rmse:.4f})')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()

# Rank features by the forest's impurity-based importance scores.
importances = best_reg.feature_importances_
feature_names = diabetes.feature_names
indices = np.argsort(importances)[::-1]  # descending importance order

# Tabulate the ranking for printing.
importance_df = pd.DataFrame({
    'Feature': np.take(feature_names, indices),
    'Importance': np.take(importances, indices),
})

print("\n피처 중요도 순위:")
print(importance_df.to_string(index=False))

# Horizontal bar chart of the ranked feature importances.
plt.figure(figsize=(6, 4))
ranked_features = np.array(feature_names)[indices]
# seaborn >= 0.13 deprecates passing `palette` without `hue`; bind hue to
# the y variable and hide the redundant legend to keep the same rendering.
sns.barplot(
    x=importances[indices],
    y=ranked_features,
    hue=ranked_features,
    palette='viridis',
    legend=False,
)
plt.title("Feature Importances (Random Forest Regressor)")
plt.xlabel("Importance (MSE Reduction)")
plt.ylabel("Features")
plt.tight_layout()
plt.show()

# Keep only the five most important features and retrain a forest with the
# previously found best hyperparameters.
top_5_features = np.array(feature_names)[indices[:5]]
print(f"상위 5개 피처: {list(top_5_features)}")

X_train_top5 = X_train.loc[:, top_5_features]
X_test_top5 = X_test.loc[:, top_5_features]

reduced_model = RandomForestRegressor(**grid_reg.best_params_, random_state=42)
reduced_model.fit(X_train_top5, y_train)
reduced_pred = reduced_model.predict(X_test_top5)

rmse_top5 = np.sqrt(mean_squared_error(y_test, reduced_pred))
r2_top5 = r2_score(y_test, reduced_pred)

# Compare the reduced model against the full-feature model.
print("\n성능 비교:")
print(f"모든 피처 (10개):")
print(f"  RMSE = {rmse:.4f}, R2 = {r2:.4f}")
print(f"\n상위 5개 피처:")
print(f"  RMSE = {rmse_top5:.4f}, R2 = {r2_top5:.4f}")
print(f"\nRMSE 차이: {rmse_top5 - rmse:+.4f}")
print(f"R2 차이: {r2_top5 - r2:+.4f}")

# Train an untuned forest as a baseline to quantify the GridSearch gain.
baseline = RandomForestRegressor(random_state=42)
baseline.fit(X_train, y_train)
baseline_pred = baseline.predict(X_test)

rmse_default = np.sqrt(mean_squared_error(y_test, baseline_pred))
r2_default = r2_score(y_test, baseline_pred)

# Report tuned vs. default performance side by side.
print("\nGridSearch 효과:")
print(f"기본 파라미터:")
print(f"  RMSE = {rmse_default:.4f}, R2 = {r2_default:.4f}")
print(f"\n최적 파라미터:")
print(f"  RMSE = {rmse:.4f}, R2 = {r2:.4f}")
print(f"\nRMSE 개선: {rmse_default - rmse:+.4f}")
print(f"R2 개선: {r2 - r2_default:+.4f}")
plt.show()

댓글

이 블로그의 인기 게시물

베이스 캠프에서 (1)

베이스 캠프에서 (2)

Database 분석 (4)