머신 러닝 (7) Regularization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso, ElasticNet
# Load the diabetes dataset as a (features, target) pair.
X, y = load_diabetes(return_X_y=True)

# Hold out 20% of the samples for testing (80/20 split); the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply those
# same statistics to both splits so no test information reaches the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Ridge regression (L2 penalty) with regularization strength alpha=1.0,
# fitted on the scaled training data and scored on the scaled test data.
ridge = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
ridge_pred = ridge.predict(X_test_scaled)
ridge_mse = mean_squared_error(y_test, ridge_pred)

# Report the test MSE and the coefficients rounded to two decimals; the
# closing message describes the shrinkage effect the tutorial demonstrates.
print(
    f"Ridge MSE: {ridge_mse:.2f}",
    f"Ridge 회귀 계수:\n{np.round(ridge.coef_, 2)}",
    "-> 모든 변수의 계수가 0이 되지 않고 유지됨 (Shrinkage)",
    sep="\n",
)
# Lasso regression (L1 penalty) with regularization strength alpha=1.0,
# fitted on the scaled training data and scored on the scaled test data.
lasso = Lasso(alpha=1.0).fit(X_train_scaled, y_train)
lasso_pred = lasso.predict(X_test_scaled)
lasso_mse = mean_squared_error(y_test, lasso_pred)

# Count the coefficients that are not exactly zero, i.e. the features the
# fitted model still uses.
lasso_selected_features = (lasso.coef_ != 0).sum()

# Report the test MSE, the rounded coefficients, and how many of the
# X.shape[1] input features kept a non-zero coefficient.
print(
    f"Lasso MSE: {lasso_mse:.2f}",
    f"Lasso 회귀 계수:\n{np.round(lasso.coef_, 2)}",
    f"Lasso가 선택한 변수 개수: {lasso_selected_features} / {X.shape[1]}",
    "-> 일부 변수의 계수가 정확히 0이 됨 (Feature Selection)",
    sep="\n",
)
# ElasticNet mixes the L1 and L2 penalties.
#   alpha    -- overall regularization strength
#   l1_ratio -- share of the penalty assigned to L1 (between 0 and 1)
elastic_net = ElasticNet(
    alpha=0.1,
    l1_ratio=0.5,
    random_state=42,
).fit(X_train_scaled, y_train)
el_pred = elastic_net.predict(X_test_scaled)
el_mse = mean_squared_error(y_test, el_pred)

# Report the test MSE and the coefficients rounded to two decimals.
print(
    f"ElasticNet(Basic) MSE: {el_mse:.2f}",
    f"ElasticNet 회귀 계수:\n{np.round(elastic_net.coef_, 2)}",
    sep="\n",
)
# Feature names are read from the dataset bunch and used as x-axis categories.
feature_names = load_diabetes().feature_names

plt.figure(figsize=(6, 4))

# One line per fitted model, drawn in Ridge, Lasso, ElasticNet order.
# Each entry is (coefficients, matplotlib format string, legend label).
for coefs, fmt, label in (
    (ridge.coef_, 's-', 'Ridge (L2)'),
    (lasso.coef_, 'x-', 'Lasso (L1)'),
    (elastic_net.coef_, 'o-', 'ElasticNet'),
):
    plt.plot(feature_names, coefs, fmt, label=label)

# Dashed reference line at zero makes coefficients that were driven to zero
# easy to spot.
plt.axhline(0, color='black', linestyle='--', linewidth=0.8)

plt.title("Regularization 회귀 계수 비교")
plt.xlabel("Features")
plt.ylabel("Coefficient Value")
plt.legend()
plt.show()
댓글
댓글 쓰기