머신 러닝 (8) 분류
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')
# Load the Pima Indians Diabetes CSV straight from its URL. The column names
# are supplied explicitly through `names=`.
pima_data_url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
pima_columns = [
    'pregnancies',
    'glucose',
    'blood_pressure',
    'skin_thickness',
    'insulin',
    'bmi',
    'diabetes_pedigree_function',
    'age',
    'outcome',
]
df = pd.read_csv(pima_data_url, names=pima_columns)

# Inspect the target variable (outcome: 0 = normal, 1 = diabetes):
# absolute class counts first, then the same counts as proportions.
for as_ratio in (False, True):
    print(df['outcome'].value_counts(normalize=as_ratio))
# Separate the target column (y) from the remaining feature columns (X).
y = df['outcome']
X = df.drop(columns='outcome')

# Hold out 20% of the rows for testing (80/20 split).
# stratify=y keeps the class ratio of the target the same in both subsets.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the resulting shapes of the two feature sets.
print(f"학습 데이터: {X_train.shape}")
print(f"테스트 데이터: {X_test.shape}")
# Fit a logistic-regression classifier on the training split.
# max_iter=1000: raised so the solver has enough iterations to converge.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predict class labels for the held-out test split. This is done once and the
# result is reused below; calling predict again on the same fitted model and
# the same data would only recompute the identical array.
y_pred = model.predict(X_test)

# Compute and print the accuracy of the predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (정확도): {accuracy:.4f}")

# Show the predicted labels. As a bare expression this only renders in an
# interactive/notebook cell; it has no visible effect in a plain script.
y_pred
댓글
댓글 쓰기