# 머신 러닝 (6) 선형 회귀 (Machine Learning (6): Linear Regression)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Load the scikit-learn diabetes dataset and pull out the pieces used below.
diabetes = load_diabetes()
X = diabetes.data  # feature matrix
y = diabetes.target  # target vector
feature_names = diabetes.feature_names

# Quick look at the raw feature array: its type, contents and dimensions.
print(type(X))
print(X)
print(X.shape)

# Combine the features and the target into one DataFrame for inspection.
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y
# NOTE(review): `display` is not imported in this file; presumably it is
# supplied by the notebook (IPython) environment -- confirm before running
# this as a plain script.
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() would also render a stray "None".
df.info()
# (For reference) log1p-transformed copy of the target, used to see how the
# distribution changes under a log transform.
y_log = np.log1p(y)

# One entry per subplot: (values, bar colour, title, x-axis label).
# Left panel shows the raw target, right panel the log-transformed one.
distribution_panels = [
    (y, 'skyblue', "원본 Target Distribution", "당뇨병 진행도"),
    (y_log, 'salmon', "Log Transformed Target Distribution", "Log(Disease Progression)"),
]

plt.figure(figsize=(8, 5))
for panel_no, (panel_values, panel_color, panel_title, panel_xlabel) in enumerate(
    distribution_panels, start=1
):
    # Select the panel_no-th slot of a 1x2 grid, then draw histogram + KDE.
    plt.subplot(1, 2, panel_no)
    sns.histplot(panel_values, kde=True, color=panel_color)
    plt.title(panel_title)
    plt.xlabel(panel_xlabel)
plt.tight_layout()
plt.show()
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Print the split sizes explicitly: a bare tuple expression is only echoed
# when it is the last statement of a notebook cell and is otherwise discarded.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Fit a linear regression on the training split and predict the test split.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_preds = lr.predict(X_test)

# Score the test predictions: MSE, and RMSE as its square root.
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)
print(f"MSE (Mean Squared Error): {mse:.2f}")
print(f"RMSE (Root Mean Squared Error): {rmse:.2f}")
# (Visualization) Compare actual test targets against the model's predictions.
# The dashed red diagonal spans the full target range and marks where a
# perfect prediction (predicted == actual) would fall.
diagonal_ends = [y.min(), y.max()]

plt.figure(figsize=(6, 4))
plt.scatter(y_test, y_preds, color='green', alpha=0.7)
plt.plot(diagonal_ends, diagonal_ends, 'r--', linewidth=2)
plt.title("Actual vs Predicted (Linear Regression)")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.show()
# Report the fitted intercept, rounded to two decimals.
rounded_intercept = np.round(lr.intercept_, 2)
print(f"절편 (Intercept): {rounded_intercept}")

# Build a Series of the rounded regression coefficients keyed by feature name
# and sort it from the largest (most positive) value to the smallest.
coef_series = pd.Series(
    data=np.round(lr.coef_, 2), index=feature_names
).sort_values(ascending=False)
print("\n[Feature별 회귀 계수 (영향력 순)]")
print(coef_series)

# Interpretation example: after the descending sort, the first entry is the
# feature with the largest positive coefficient.
top_feature, top_coef = coef_series.index[0], coef_series.iloc[0]
print("\n[해석 예시]")
print(f"- 가장 양의 영향력이 큰 변수: {top_feature} (계수: {top_coef})")
print(f" -> {top_feature} 수치가 높을수록 당뇨병 진행도가 증가하는 경향이 있음.")
# 댓글 (Comments)
# 댓글 쓰기 (Write a comment)