AI 헬스케어 첫번째 미니 프로젝트 “흡연 여부 데이터 분석을 통한 건강 인사이트 도출”(1)

개인 과제와 팀 과제로 이루어졌다.

아래는 개인 과제의 내용이다.

흡연 여부 데이터를 분석하고 시각화하여 통계적 검정을 통해 변수 간의 관계를 규명해야 했다.

불러온 라이브러리는 다음과 같다.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

실습 코드를 불러온 뒤

# Show the dimensions of the dataset (assumes health_data is a pandas DataFrame).
health_data.shape

shape 확인

# Summary statistics of health_data.
health_data.describe()

# Per-column overview of health_data (assumes a pandas DataFrame).
health_data.info()

내용 및 정보 확인 등

# Example: assumes health_data has a column named 'label'.
# Tally the rows per label value once and reuse the result for both lookups.
label_counts = health_data['label'].value_counts()

# .get(key, 0) falls back to 0 when that label value never occurs.
label_0_count = label_counts.get(0, 0)  # number of rows with label 0
print(f'라벨이 0인 데이터 수: {label_0_count}개')

label_1_count = label_counts.get(1, 0)  # number of rows with label 1
print(f'라벨이 1인 데이터 수: {label_1_count}개')

# Preview the first rows of the dataset.
health_data.head()

# Preview the last rows of the dataset.
health_data.tail()

if 문 사용

def BMI_status(BMI_value):
    """Classify a BMI value into a weight-status label.

    Args:
        BMI_value: A single numeric BMI value, or a missing value (NaN/None).

    Returns:
        '저체중' (underweight) below 18.5, '정상' (normal) below 25,
        '과체중' (overweight) below 30, otherwise '비만' (obese).
        A missing input yields NaN so it is not counted in any category.
    """
    # NaN compares False against every threshold and would otherwise fall
    # through to the final branch and be reported as obese.
    if pd.isna(BMI_value):
        return np.nan
    if BMI_value < 18.5:
        return '저체중'
    if BMI_value < 25:
        return '정상'
    if BMI_value < 30:
        return '과체중'
    return '비만'
# Label every row with its BMI category, then tally rows per category.
bmi_categories = health_data['BMI'].apply(BMI_status)
health_data['BMI 상태 분류'] = bmi_categories
status_counts = health_data['BMI 상태 분류'].value_counts()

print(status_counts)

import matplotlib.pyplot as plt

# One fixed colour per BMI category. value_counts() orders the bars by
# frequency, so a positional colour list would colour whichever category
# happens to rank first; look the colour up by category name instead.
bmi_colors = {'저체중': 'blue', '정상': 'green', '과체중': 'orange', '비만': 'red'}
bar_colors = [bmi_colors.get(status, 'gray') for status in status_counts.index]

plt.figure(figsize=(8, 6))
status_counts.plot(kind='bar', color=bar_colors)

plt.title('BMI 상태별 인원 분포')
plt.xlabel('BMI 상태')
plt.ylabel('인원 수')
# Keep the category names horizontal.
plt.xticks(rotation=0)

plt.show()

BMI 상태 별 인원 확인

다시 if 문 사용

def age_group(age):
    """Assign an age to one of four age-band labels.

    Args:
        age: A single numeric age, or a missing value (NaN/None).

    Returns:
        "30대 이하" for age <= 30, "30~50대" for age <= 50,
        "50~70대" for age <= 70, otherwise "70대 이상".
        A missing input yields NaN so it is not counted in any band.
    """
    # NOTE(review): the labels read as decades ("30대" = thirties) while the
    # cut-offs are ages in years - confirm the intended wording.
    # NaN compares False against every threshold and would otherwise be
    # reported in the oldest band.
    if pd.isna(age):
        return np.nan
    if age <= 30:
        return "30대 이하"
    if age <= 50:
        return "30~50대"
    if age <= 70:
        return "50~70대"
    return "70대 이상"
# Label every row with its age band, then tally rows per band.
age_bands = health_data['나이'].apply(age_group)
health_data['나이대'] = age_bands
age_group_counts = health_data['나이대'].value_counts()

print(age_group_counts)

나이대 별 인원

# Bar chart of the head-count in each age band.
plt.figure(figsize=(10, 6))
age_axes = age_group_counts.plot(kind='bar', color='skyblue')

# Title and axis labels, set on the axes the bars were drawn on.
age_axes.set_title('나이대별 인원 분포')
age_axes.set_xlabel('나이대')
age_axes.set_ylabel('인원 수')

# Keep the x tick labels horizontal for readability.
plt.xticks(rotation=0)

plt.show()

단변량 분석에서 파악한 내용

### 단변량 분석에서 파악한 내용을 정리해보세요.
#공복 혈당       140
#혈압          140
#중성 지방       140
#충치          5408
#공복 혈당, 혈압, 중성 지방의 경우 측정치가 누락된 것으로 보인다.
#충치의 경우 없을수도 있지만 누락된 것일수도 있다.
#2022년 기준, 충치 치료를 받은 환자 비율은 전체 인구의 10.7%이다. (건강보험심사평가원)
#Nearly 90% of adults ages 20 to 64 years have had decay in their teeth, a percentage that has not changed significantly between the 1999–2004 and 2011–2016 NHANES cycles.
#시력, 공복 혈당이나 혈압, 중성 지방이 누락되었다고 평균값이나 중앙값으로 채우면
#흡연이 해당 값과 연관이 없어 보일 수 있는 오류가 발생할 수 있다.

계산

# Figures taken from the univariate analysis notes: total number of rows,
# rows with a missing measurement, and rows recorded without cavities.
total_rows = 7000
missing_rows = 140
no_cavity_rows = 5408

# The three measurements share the same missing count, hence the same rate.
missing_pct = missing_rows / total_rows * 100
print(f'공복 혈당 누락: {missing_pct}%')
print(f'혈압 누락: {missing_pct}%')
print(f'중성 지방 누락: {missing_pct}%')
print(f'전체 데이터 중 충치가 없는 데이터 퍼센트: {no_cavity_rows / total_rows * 100:.2f}%')
# Expected number of cavity rows under the two external prevalence figures.
print(f'건강보험심사평가원 1년 통계(2022년) 기준 있어야 할 충치 환자 데이터 수: {total_rows * 0.107}')
print(f'NHANES 기준 있어야 할 충치 환자 데이터 수: {total_rows * 0.9}')
print(f'충치 데이터 수: {total_rows - no_cavity_rows}')

가설

# 가설 1. 흡연자는 저밀도지단백이 정상 범위에서 벗어나 있을 것이다. (저밀도지단백은 100 mg/dL 미만이어야 한다. 130mg/dL부터 경계선이다.)
# 가설 2. 흡연자는 혈청 크레아티닌 수치가 정상 범위에서 벗어나 있을 것이다. (남성의 경우 0.7–1.2 mg/dL, 여성의 경우 0.5–1.0 mg/dL이 정상이다.)
# 가설 3. 흡연자는 간 효소 수치가 정상 범위에서 벗어나 있을 것이다. (주어진 간 효소는 빌리루빈(Bilirubin)으로 가정하고 0.1에서 1.2mg/dL을 정상으로 잡는다.)

https://www.merckmanuals.com/home/hormonal-and-metabolic-disorders/cholesterol-disorders/dyslipidemia?aiquery=ldl%20normal%20range

https://www.mayoclinic.org/tests-procedures/creatinine-test/about/pac-20384646

https://redcliffelabs.com/myhealth/lab-test/understanding-the-bilirubin-blood-test-a-complete-guide/

https://www.hira.or.kr/bbsDummy.do?pgmid=HIRAA020041000100&brdScnBltNo=4&brdBltNo=10925

https://www.nidcr.nih.gov/research/data-statistics/dental-caries/adults#:~:text=Nearly%2090%25%20of%20adults%20ages%2020%20to%2064,prevalence%20of%20decay%20%2896%25%29%20in%20both%20NHANES%20cycles

저밀도지단백에 대한 if 문 작성

# Handle on the LDL column; no code visible in this section reads it.
ldl = health_data['저밀도지단백']

def LDL_norm(ldl):
    """Classify an LDL cholesterol reading into a risk label.

    Args:
        ldl: A single numeric LDL value (presumably mg/dL - confirm against
            the dataset), or a missing value (NaN/None).

    Returns:
        '정상' (normal) below 130, '경계선' (borderline) from 130 up to but
        not including 160, otherwise '위험군' (at risk).
        A missing input yields NaN so it is not counted in any class.
    """
    # NaN compares False against every threshold and would otherwise be
    # reported as at-risk.
    if pd.isna(ldl):
        return np.nan
    if ldl < 130:
        return '정상'
    # Reaching this point already implies ldl >= 130.
    if ldl < 160:
        return '경계선'
    return '위험군'

# Label every row with its LDL class, then tally rows per class.
ldl_classes = health_data['저밀도지단백'].apply(LDL_norm)
health_data['저밀도지단백 상태 분류'] = ldl_classes
ldl_status_counts = health_data['저밀도지단백 상태 분류'].value_counts()

print(ldl_status_counts)

주어진 자료가 흡연자와 비흡연자 그룹으로 나뉘어 있어, 그룹별 분포를 막대 그래프로 시각화했다.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 자료가 이미 로드되어 있다고 가정합니다 (health_data DataFrame 사용 가능)
# 예: health_data = pd.read_csv('your_health_data.csv')

# ----------------------------------------------------
# 1단계: 제공된 LDL 분류 함수 및 적용
# ----------------------------------------------------
# Handle on the LDL column; no code visible in this section reads it.
ldl = health_data['저밀도지단백']

def LDL_norm(ldl_value):
    """Classify an LDL cholesterol reading into a risk label.

    Args:
        ldl_value: A single numeric LDL value (presumably mg/dL - confirm
            against the dataset), or a missing value (NaN/None).

    Returns:
        '정상' (normal) below 130, '경계선' (borderline) from 130 up to but
        not including 160, otherwise '위험군' (at risk).
        A missing input yields NaN so it is not counted in any class.
    """
    # NaN compares False against every threshold and would otherwise be
    # reported as at-risk.
    if pd.isna(ldl_value):
        return np.nan
    if ldl_value < 130:
        return '정상'
    # Reaching this point already implies ldl_value >= 130.
    if ldl_value < 160:
        return '경계선'
    return '위험군'

# Label every row with its LDL class, then tally rows per class.
ldl_classes = health_data['저밀도지단백'].apply(LDL_norm)
health_data['저밀도지단백 상태 분류'] = ldl_classes
ldl_status_counts = health_data['저밀도지단백 상태 분류'].value_counts()

# ----------------------------------------------------
# 2단계: 3가지 분류 내 0/1 라벨 분포 확인 및 시각화
# ----------------------------------------------------

# Frequency table: LDL class (rows) against the 0/1 label (columns).
print("--- LDL 상태별 Label (0 또는 1) 빈도수 교차표 ---")
ldl_class_column = health_data['저밀도지단백 상태 분류']
label_column = health_data['label']
cross_tabulation = pd.crosstab(ldl_class_column, label_column)
print(cross_tabulation)
# Visual separator before the next section of output.
separator = "=" * 50
print("\n" + separator + "\n")


plt.figure(figsize=(8, 5))

# Grouped bars: one group per LDL class, one bar per label value (0 or 1).
ldl_class_order = ['정상', '경계선', '위험군']
sns.countplot(
    data=health_data,
    x='저밀도지단백 상태 분류',
    hue='label',
    order=ldl_class_order,
)

plt.title('LDL 상태 분류별 Label (0 또는 1) 분포')
plt.xlabel('저밀도지단백 상태 분류')
plt.ylabel('데이터 수 (빈도)')
plt.legend(title='Label', loc='upper right')
plt.show()

자료가 흡연자와 비흡연자로 나뉘어 있어 그에 따라 분류한 뒤, 각 그룹 내 비율(%)로 계산해 다시 시각화했다.

# Share of each LDL class within the non-smoker (label 0) and smoker (label 1)
# groups; the divisors are the two group sizes.
non_smoker_total = 4429
smoker_total = 2571
report_lines = [
    f'경계선: 비흡연(라벨 0){962 / non_smoker_total * 100:.2f}% 흡연(라벨 1){545 / smoker_total * 100:.2f}%',
    f'위험군: 비흡연(라벨 0){426 / non_smoker_total * 100:.2f}% 흡연(라벨 1){246 / smoker_total * 100:.2f}%',
    f'정상: 비흡연(라벨0){3041 / non_smoker_total * 100:.2f}% 흡연(라벨 1){1780 / smoker_total * 100:.2f}%',
]
print('\n'.join(report_lines))

import matplotlib.pyplot as plt
import numpy as np

# Data preparation.
# Category order; the count and percentage lists below follow the same order.
labels = ['경계선', '위험군', '정상']

# Head-counts per LDL class for each group (the same figures used in the
# percentage printout earlier in this file). Deriving the percentages from
# the counts keeps them consistent: the previously hard-coded non-smoker
# '정상' share (68.86) did not match 3041 / 4429 = 68.66.
_non_smoker_counts = [962, 426, 3041]
_smoker_counts = [545, 246, 1780]

_non_smoker_total = sum(_non_smoker_counts)
_smoker_total = sum(_smoker_counts)

# Within-group percentages, rounded to two decimals.
non_smoker_percents = [round(count / _non_smoker_total * 100, 2) for count in _non_smoker_counts]
smoker_percents = [round(count / _smoker_total * 100, 2) for count in _smoker_counts]

x = np.arange(len(labels))  # tick positions, one per category
width = 0.35  # bar width

fig, ax = plt.subplots(figsize=(10, 6))

# Side-by-side bars per category: non-smokers sit half a bar-width to the
# left of each tick, smokers half a bar-width to the right.
rects1 = ax.bar(
    x - width/2, non_smoker_percents, width,
    label='비흡연 (라벨 0)', color='#4c72b0',
)
rects2 = ax.bar(
    x + width/2, smoker_percents, width,
    label='흡연 (라벨 1)', color='#dd8452',
)

# Axis labels, title, category tick labels and legend.
ax.set_xlabel('LDL', fontsize=12)
ax.set_ylabel('비율 (%)', fontsize=12)
ax.set_title('비흡연자, 흡연자 LDL 비교', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=10)
ax.legend()


def autolabel(rects):
    """Annotate every bar in *rects* with its height formatted as a percentage.

    The text is anchored at the top centre of the bar and shifted 5 points
    upwards. Draws on the module-level `ax`.
    """
    for bar in rects:
        bar_height = bar.get_height()
        bar_center_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{bar_height:.2f}%',
            xy=(bar_center_x, bar_height),
            xytext=(0, 5),
            textcoords="offset points",
            ha='center', va='bottom', fontsize=9,
        )


autolabel(rects1)
autolabel(rects2)

# Tighten the layout and render the figure.
fig.tight_layout()
plt.show()

비슷한 과정을 반복한 결과, 3가지 가설 중 혈청 크레아티닌이 유의미한 것으로 나타나 이를 막대 그래프로 시각화했다.

# 1. Assemble the pre-computed rates into a DataFrame.
# Percentages are rounded to two decimals:
#   high   : non-smokers 28.20%, smokers 48.39%
#   normal : non-smokers 71.80%, smokers 51.61%
_rate_rows = [
    ('높음', '비흡연 (0)', 28.20),
    ('높음', '흡연 (1)', 48.39),
    ('정상', '비흡연 (0)', 71.80),
    ('정상', '흡연 (1)', 51.61),
]

# Column-oriented view of the rows above: one list per column.
data = {
    '크레아티닌 상태': [status for status, _, _ in _rate_rows],
    '흡연 여부 (Label)': [smoking for _, smoking, _ in _rate_rows],
    '비율 (%)': [rate for _, _, rate in _rate_rows],
}
df_rates = pd.DataFrame(data)

print("--- 비율 데이터셋 ---")
print(df_rates)
# Visual separator before the next section of output.
print("\n" + "=" * 50 + "\n")

# 2. Figure setup.
plt.figure(figsize=(10, 6))

# Grouped bar chart: one group per creatinine status, one bar per smoking
# label. Keep the returned Axes so the drawn bars can be annotated below.
rate_axes = sns.barplot(
    x='크레아티닌 상태',
    y='비율 (%)',
    hue='흡연 여부 (Label)',
    data=df_rates,
    order=['정상', '높음'],  # left-to-right category order
    palette='Paired'  # colour palette
)

# 3. Title, axis labels, legend and horizontal grid lines.
plt.title('혈청 크레아티닌 상태별 흡연 여부 비율 비교', fontsize=16)
plt.xlabel('혈청 크레아티닌 상태', fontsize=12)
plt.ylabel('비율 (%)', fontsize=12)
plt.legend(title='흡연 여부', loc='upper left')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# 4. Write the percentage above each bar.
# The hue bars are drawn side by side within a category, so the label position
# is taken from each bar's own geometry; using the category index (0 or 1)
# would stack both labels on the group centre instead of over the bars.
for bar_group in rate_axes.containers:
    for bar in bar_group:
        bar_height = bar.get_height()
        rate_axes.text(
            bar.get_x() + bar.get_width() / 2,  # horizontal centre of this bar
            bar_height + 1,  # bar top plus a small gap
            f'{bar_height:.1f}%',
            color='black',
            ha='center',
            va='bottom'
        )

plt.show()