# 머신 러닝 (14) Feature Engineering regression
# (Machine Learning (14): feature engineering for regression)
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV, Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
# Load the diabetes regression dataset: 442 samples, 10 standardized
# numeric features, continuous disease-progression target.
# NOTE(review): the original notebook loaded the dataset a second time with
# load_diabetes(return_X_y=False) — the default — and evaluated the bare
# expressions `y`, `y.dtype`, `X`, which are no-ops in a script. Those
# duplicate/inert notebook cells are removed here; `diabetes`, `X`, `y`
# keep the same values as before.
diabetes = load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = diabetes.target
print(f"데이터 크기: {X.shape}")
print(f"피처 목록: {list(diabetes.feature_names)}")
# Hold out 20% of the rows for evaluation; fixed seed so the split is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Modeling pipeline: standardize, keep only the features whose LassoCV
# coefficient survives SelectFromModel's threshold, then fit Ridge on them.
steps = [
    ('scaler', StandardScaler()),
    ('feature_selection', SelectFromModel(LassoCV(cv=5, random_state=42))),
    ('regressor', Ridge(random_state=42)),
]
pipe_lasso = Pipeline(steps)
print("Pipeline 구성 완료")
print(pipe_lasso)
# Fit the whole pipeline end to end: scaling, LassoCV selection, Ridge fit.
print("Training Pipeline with LassoCV Selection...")
pipe_lasso.fit(X_train, y_train)
print("학습 완료!")

# Which features survived the SelectFromModel step?
selector = pipe_lasso.named_steps['feature_selection']
support_mask = selector.get_support()
selected_features = np.array(diabetes.feature_names)[support_mask]
print(f"선택된 피처 개수: {len(selected_features)}개 / {len(diabetes.feature_names)}개")
print(f"선택된 피처 목록: {list(selected_features)}")

# The fitted LassoCV lives on the selector as `estimator_`; report the
# cross-validated alpha it settled on and the per-feature coefficients.
lasso_model = selector.estimator_
print(f"최적 Alpha: {lasso_model.alpha_:.6f}")
print(f"\n각 피처별 계수:")
coef_df = pd.DataFrame({
    'Feature': diabetes.feature_names,
    'Coefficient': lasso_model.coef_,
    'Abs_Coef': np.abs(lasso_model.coef_),
    'Selected': support_mask,
})
coef_df = coef_df.sort_values('Abs_Coef', ascending=False)
print(coef_df)
# Score the selection pipeline on the held-out split.
y_pred = pipe_lasso.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Pipeline R2 Score: {r2:.4f}")
# Baseline: identical scaler + Ridge but with NO feature selection, so the
# comparison below isolates the effect of the SelectFromModel step.
pipe_all = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', Ridge(random_state=42))
])
pipe_all.fit(X_train, y_train)
y_pred_all = pipe_all.predict(X_test)
r2_all = r2_score(y_test, y_pred_all)

# Compare R2 with and without selection.
# FIX: the total feature count was hard-coded as 10 in two places; derive it
# from the data so the report stays correct for any feature set.
n_features = X.shape[1]
print("성능 비교:")
print(f"모든 피처 사용 ({n_features}개): R2 = {r2_all:.4f}")
print(f"선택된 피처 ({len(selected_features)}개): R2 = {r2:.4f}")
print(f"성능 차이: {r2 - r2_all:+.4f}")
print(f"\n피처 감소율: {(n_features - len(selected_features)) / n_features * 100:.1f}%")
import matplotlib.pyplot as plt

# Scatter predictions against ground truth; the dashed diagonal marks
# perfect prediction, so vertical distance from it is the residual.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.6, edgecolors='k')
lo, hi = y_test.min(), y_test.max()
plt.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title(f'Actual vs Predicted (R2={r2:.4f})')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# 댓글 / 댓글 쓰기 — blog comment-section residue from the page scrape; kept
# as a comment so the file remains valid Python.