# Machine Learning (22): Random Forest Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
# Load the diabetes dataset from scikit-learn; features go into a DataFrame
# labelled with the dataset's own feature names, the target stays an array.
diabetes = load_diabetes()
y = diabetes.target
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

# Overview: data shape, feature list and a short summary of the target.
print(f"데이터 크기: {X.shape}")
print(f"피처 목록: {list(diabetes.feature_names)}")
print(f"\n타겟 통계:")
print(f" 평균: {y.mean():.2f}")
print(f" 범위: {y.min():.0f} ~ {y.max():.0f}")

# Preview the first rows, then the per-column descriptive statistics.
for heading, make_view in (("\n데이터 샘플:", X.head), ("\n기술 통계:", X.describe)):
    print(heading)
    print(make_view())
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible between runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"\nTrain 타겟 통계:")
print(f" 평균: {y_train.mean():.2f}")
print(f" 표준편차: {y_train.std():.2f}")

# Untuned Random Forest regressor with a fixed seed; it serves as the base
# estimator for the hyperparameter search.
rf_reg = RandomForestRegressor(random_state=42)
# Hyperparameter grid: each key is a RandomForestRegressor parameter and each
# list holds the candidate values to try for it.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2', None],
}

print("Random Forest Regressor 생성")
print(f"\n파라미터 그리드:")
for param, values in param_grid.items():
    print(f" {param}: {values}")

# The number of candidate settings is the product of the option counts,
# because every value of one parameter is paired with every value of the others.
total_combinations = 1
for values in param_grid.values():
    total_combinations *= len(values)
print(f"\n총 조합 수: {total_combinations}개")
# Grid search over param_grid, scored by R2 on shuffled 5-fold cross-validation.
print("Tuning RandomForest Regressor...")
print("(교차검증 수행 중...)\n")

# Seeded, shuffled folds so every candidate is evaluated on the same splits.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
grid_reg = GridSearchCV(
    estimator=rf_reg,
    param_grid=param_grid,
    scoring='r2',
    cv=cv_splitter,
    n_jobs=-1,   # -1 asks joblib for all available cores
    verbose=1,
)
grid_reg.fit(X_train, y_train)

print("\nGridSearchCV 완료!")
# Keep the estimator selected by the grid search; it is used for all
# test-set evaluation that follows.
best_reg = grid_reg.best_estimator_

print("[최적화 결과]")
print(f"Best R2 Score (Train CV): {grid_reg.best_score_:.4f}")
print(f"\nBest Parameters:")
for param, value in grid_reg.best_params_.items():
    print(f" {param}: {value}")

# Five best candidates by mean cross-validated score, reduced to the tuned
# parameters plus the mean and standard deviation of the fold scores.
cv_results = pd.DataFrame(grid_reg.cv_results_)
top_results = cv_results.nlargest(5, 'mean_test_score')[[
    'param_n_estimators',
    'param_max_depth',
    'param_min_samples_split',
    'param_max_features',
    'mean_test_score',
    'std_test_score',
]]
print("\n상위 5개 파라미터 조합:")
print(top_results.to_string(index=False))
# Score the selected model on the held-out test set.
y_pred = best_reg.predict(X_test)

# R2, RMSE (square root of the MSE) and MAE (mean absolute residual).
r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
abs_errors = np.abs(y_test - y_pred)
mae = np.mean(abs_errors)

print("\n[테스트 성능]")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R2 Score: {r2:.4f}")
print(f"Test MAE: {mae:.4f}")
# Scatter of actual vs. predicted test targets. Nothing is displayed here;
# the figure only appears once plt.show() is called.
plt.figure(figsize=(6, 4))
plt.scatter(y_test, y_pred, alpha=0.6, edgecolors='k')

# Dashed y = x reference line across the range of the actual values;
# points on this line are exact predictions.
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')

plt.title(f'Actual vs Predicted (R2={r2:.4f}, RMSE={rmse:.4f})')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
# Per-feature importance scores from the selected forest, with the feature
# names they correspond to.
feature_names = diabetes.feature_names
importances = best_reg.feature_importances_

# argsort is ascending, so reverse it to rank from most to least important.
indices = np.argsort(importances)[::-1]

# Ranking table: feature names and scores in descending order of importance.
ranked_names = np.array(feature_names)[indices]
ranked_scores = importances[indices]
importance_df = pd.DataFrame({'Feature': ranked_names, 'Importance': ranked_scores})

print("\n피처 중요도 순위:")
print(importance_df.to_string(index=False))
# Horizontal bar chart of the importances, most important feature first.
plt.figure(figsize=(6, 4))
bar_lengths = importances[indices]
bar_labels = np.array(feature_names)[indices]
sns.barplot(x=bar_lengths, y=bar_labels, palette='viridis')

plt.xlabel("Importance (MSE Reduction)")
plt.ylabel("Features")
plt.title("Feature Importances (Random Forest Regressor)")
plt.tight_layout()

# show() renders every figure that is currently open, not only this one.
plt.show()
# Names of the five highest-ranked features (indices is sorted by descending
# importance).
top_5_features = np.array(feature_names)[indices][:5]
print(f"상위 5개 피처: {list(top_5_features)}")

# Restrict both splits to those five columns.
X_train_top5 = X_train[top_5_features]
X_test_top5 = X_test[top_5_features]

# Retrain with the best hyperparameters found by the grid search, same seed,
# on the reduced feature set.
rf_top5 = RandomForestRegressor(random_state=42, **grid_reg.best_params_)
rf_top5.fit(X_train_top5, y_train)
y_pred_top5 = rf_top5.predict(X_test_top5)

r2_top5 = r2_score(y_test, y_pred_top5)
rmse_top5 = np.sqrt(mean_squared_error(y_test, y_pred_top5))

# Compare the full-feature model with the top-5 model.
# Differences are reported as (top-5 minus all features).
print("\n성능 비교:")
print(f"모든 피처 (10개):")
print(f" RMSE = {rmse:.4f}, R2 = {r2:.4f}")
print(f"\n상위 5개 피처:")
print(f" RMSE = {rmse_top5:.4f}, R2 = {r2_top5:.4f}")
print(f"\nRMSE 차이: {rmse_top5 - rmse:+.4f}")
print(f"R2 차이: {r2_top5 - r2:+.4f}")
# Baseline: a forest with library-default hyperparameters (seed fixed only),
# trained and scored on the same split as the tuned model.
rf_default = RandomForestRegressor(random_state=42)
rf_default.fit(X_train, y_train)
y_pred_default = rf_default.predict(X_test)

r2_default = r2_score(y_test, y_pred_default)
mse_default = mean_squared_error(y_test, y_pred_default)
rmse_default = np.sqrt(mse_default)

# Compare the baseline with the tuned model. Both gains are oriented so that
# a positive number means the tuned model is better (lower RMSE, higher R2).
print("\nGridSearch 효과:")
print(f"기본 파라미터:")
print(f" RMSE = {rmse_default:.4f}, R2 = {r2_default:.4f}")
print(f"\n최적 파라미터:")
print(f" RMSE = {rmse:.4f}, R2 = {r2:.4f}")
print(f"\nRMSE 개선: {rmse_default - rmse:+.4f}")
print(f"R2 개선: {r2 - r2_default:+.4f}")

# Final show() call, kept from the original script.
plt.show()
댓글
댓글 쓰기