머신 러닝 (5) EDA
차트 작성 시 한글 깨짐 방지를 위한 koreanize-matplotlib 설치
!pip install koreanize-matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
from sklearn.datasets import load_diabetes
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
# Column names are supplied explicitly, so every CSV row is read as data.
pima_columns = [
    'pregnancies',
    'glucose',
    'blood_pressure',
    'skin_thickness',
    'insulin',
    'bmi',
    'diabetes_pedigree_function',
    'age',
    'outcome',
]
pima_data_url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
# header=None is what pandas already assumes when `names` is given; it is
# spelled out here so the "no header row" handling is visible to the reader.
df = pd.read_csv(pima_data_url, header=None, names=pima_columns)
# In a notebook cell the bare expression below echoes (rows, columns).
df.shape
# Print dtypes and non-null counts for every column.
df.info()
# 1. Count explicit missing values (NaN) per column.
missing_values = df.isnull().sum()
display(missing_values)
# 2. Look for hidden missing values: count the zeros in each of the
#    columns listed below and report them as a share of all rows.
zero_cols = ['glucose', 'blood_pressure', 'skin_thickness', 'insulin', 'bmi']
# The row count is constant across iterations, so compute it once.
total_rows = len(df)
for col in zero_cols:
    # Comparing to 0 yields a boolean Series; summing it counts the True entries.
    zero_count = (df[col] == 0).sum()
    print(f" - {col}: {zero_count}개 ({zero_count/total_rows*100:.2f}%)")
# Summary statistics of the numeric columns (echoed when run as a notebook cell).
df.describe()
# One histogram per column, 20 bins each, with bar outlines drawn in black.
df.hist(bins=20, edgecolor='black', figsize=(12, 10))
plt.suptitle("전체 변수 히스토그램", fontsize=16)
# Re-pack the subplots after adding the figure-level title.
plt.tight_layout()
plt.show()
# Frequency and proportion of each class of the target column.
outcome_counts = df['outcome'].value_counts()
outcome_ratios = df['outcome'].value_counts(normalize=True)
# value_counts() is indexed by the class labels, so look them up by label.
print(f"0 (정상): {outcome_counts.loc[0]}명 ({outcome_ratios.loc[0]*100:.1f}%)")
print(f"1 (당뇨): {outcome_counts.loc[1]}명 ({outcome_ratios.loc[1]*100:.1f}%)")
# Visualization (count plot) of the target variable.
plt.figure(figsize=(2, 4))
# seaborn 0.13 deprecated passing `palette` without `hue` (removal planned
# for 0.14). Mapping the x variable to `hue` and hiding the redundant legend
# is the documented replacement and colours the bars the same way.
# Requires seaborn >= 0.13 for the `legend` parameter.
sns.countplot(x='outcome', hue='outcome', data=df, palette='pastel', legend=False)
plt.title('Distribution of Outcome (Target Variable)')
plt.xlabel('Outcome (0: Normal, 1: Diabetes)')
plt.ylabel('Count')
plt.show()
# Box plot of the blood-pressure column.
blood_pressure = df['blood_pressure']
plt.figure(figsize=(6, 3))
sns.boxplot(x=blood_pressure, color='lightskyblue')
plt.title('Box Plot of Blood Pressure')
plt.show()
# IQR rule: values beyond 1.5 * IQR from the quartiles are flagged as outliers.
Q1 = blood_pressure.quantile(0.25)
Q3 = blood_pressure.quantile(0.75)
IQR = Q3 - Q1
whisker_span = 1.5 * IQR
lower_bound = Q1 - whisker_span
upper_bound = Q3 + whisker_span
print(f"IQR: {IQR}")
print(f"이상치 하한 경계: {lower_bound}")
print(f"이상치 상한 경계: {upper_bound}")
# Pairwise correlation matrix of all columns (Pearson is the pandas default).
corr_matrix = df.corr(method='pearson')
# Heatmap with the colour scale pinned to the full [-1, 1] correlation range
# and each cell annotated to two decimals.
plt.figure(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
plt.title('Correlation Heatmap of Variables')
plt.show()
# Overlaid glucose density curves, one per outcome class.
plt.figure(figsize=(6, 4))
glucose_non_diabetic = df.loc[df['outcome'] == 0, 'glucose']
glucose_diabetic = df.loc[df['outcome'] == 1, 'glucose']
# Semi-transparent fills keep both curves visible where they overlap.
sns.kdeplot(data=glucose_non_diabetic, label='Non-Diabetic (0)', fill=True, color='blue', alpha=0.3)
sns.kdeplot(data=glucose_diabetic, label='Diabetic (1)', fill=True, color='orange', alpha=0.3)
plt.title('Glucose Distribution by Outcome')
plt.xlabel('Glucose')
plt.ylabel('Density')
plt.legend()
plt.show()
# Side-by-side comparison of the age distribution per outcome class.
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
# seaborn 0.13 deprecated passing `palette` without `hue` (removal planned
# for 0.14). Mapping the x variable to `hue` and hiding the redundant legend
# is the documented replacement and keeps the same per-class colours.
# Requires seaborn >= 0.13 for the `legend` parameter.
# Box plot
sns.boxplot(x='outcome', y='age', hue='outcome', data=df, ax=axes[0], palette='Set2', legend=False)
axes[0].set_title('Box Plot: Age vs Outcome')
# Violin plot
sns.violinplot(x='outcome', y='age', hue='outcome', data=df, ax=axes[1], palette='Set2', legend=False)
axes[1].set_title('Violin Plot: Age vs Outcome')
plt.tight_layout()
plt.show()
댓글
댓글 쓰기