10. lightGBM | Credit

1. Credit 데이터셋

작업파일

credit.csv

3.04MB

import

import numpy as np

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

데이터 불러오기

credit_df = pd.read_csv('/content/drive/MyDrive/1. KDT/6. 머신러닝 딥러닝/데이터/credit.csv')

credit_df

정보보기

credit_df.info()

필요없는 데이터 지우기

credit_df.drop(['ID', 'Customer_ID', 'Name', 'SSN'], axis=1, inplace=True)

credit_df.info()

신용등급 값보기

credit_df['Credit_Score'].value_counts()

신용등급 라벨인코딩
'Poor':0, 'Standard':1, 'Good':2

# Encode the target labels as ordered integers: Poor=0, Standard=1, Good=2.
score_codes = {'Poor': 0, 'Standard': 1, 'Good': 2}
credit_df['Credit_Score'] = credit_df['Credit_Score'].replace(score_codes)

credit_df.head()

통계정보 보기

credit_df.describe() # 이상치 데이터 확인

신용등급과 리볼빙의 상관관계

sns.barplot(x='Payment_of_Min_Amount', y='Credit_Score', data=credit_df)

신용등급과 직업의 상관관계

plt.figure(figsize=(20, 5))

sns.barplot(x='Occupation', y='Credit_Score', data=credit_df)

각 열간의 상관 계수

# corr(): 각 열 간의 상관 계수를 반환

# 피어슨, 켄달-타우, 스피어먼

plt.figure(figsize=(12, 12))

sns.heatmap(credit_df.corr(numeric_only=True), cmap='coolwarm', vmin=-1, vmax=1, annot=True)

문자열 열 확인

# Print the name of every column stored with the object dtype ('O'),
# i.e. the columns that still hold strings and need preprocessing.
for col in credit_df.columns:
    if credit_df[col].dtype == 'O':
        print(col)

전처리 필요

credit_df

문자형 데이터 중 _(오타)를 포함하고 있어서 숫자형이 되지 못한 데이터
_(오타)없애주기

# These columns were loaded as strings because some values contain stray
# underscores; remove the underscores and parse the result as numbers.
underscore_columns = [
    'Age',
    'Annual_Income',
    'Num_of_Loan',
    'Num_of_Delayed_Payment',
    'Outstanding_Debt',
    'Amount_invested_monthly',
]
for col in underscore_columns:
    # regex=False: '_' is a literal character here, not a pattern.
    credit_df[col] = pd.to_numeric(credit_df[col].str.replace('_', '', regex=False))

credit_df.info()

변경전 ----> 후

['Credit_History_Age'] 데이터 확인 (*숫자형으로 바꿔야 할 필요성)

credit_df['Credit_History_Age']

['Credit_History_Age'] 개월수를 숫자형으로 변환해서 저장

# Convert Credit_History_Age from text to a total number of months,
# e.g. "22 Years and 1 Months" -> 22 * 12 + 1.
# The column is split once and both parts are reused, instead of running the
# same split twice. Missing values stay NaN through to_numeric and the sum.
history_parts = (
    credit_df['Credit_History_Age']
    .str.replace(' Months', '', regex=False)
    .str.split(' Years and ', expand=True)
)
credit_df['Credit_History_Age'] = (
    pd.to_numeric(history_parts[0]) * 12 + pd.to_numeric(history_parts[1])
)

credit_df.head()

통계치 보기

credit_df.describe()

나이가 음수인 수 확인 (데이터가 꽤 많음)

credit_df[credit_df['Age'] < 0]

나이가 음수인 값 삭제

credit_df = credit_df[credit_df['Age'] >= 0]

credit_df.describe()

[Age] 나이를 기준으로 오름차순으로 정렬한 후, 마지막 30개의 행을 반환

credit_df.sort_values('Age').tail(30)

'Age' 열에 대한 박스 플롯(Box Plot) 그리기

sns.boxplot(y=credit_df['Age'])

나이가 100살 이상인 데이터 보기(이상치 확인)

credit_df[credit_df['Age'] > 100].sort_values('Age')

나이가 120살 이상인 데이터 삭제 (나이 전처리 완료)

credit_df = credit_df[credit_df['Age'] < 120]

credit_df.describe()

[ Num_Bank_Accounts] 통장개수 값이 비정상적으로 큰 행들만 선택해서 비율보기

# Share of rows whose bank-account count exceeds each candidate cutoff.
# (The first expression was missing the closing bracket of the row filter.)
len(credit_df[credit_df['Num_Bank_Accounts'] > 10]) / len(credit_df)

len(credit_df[credit_df['Num_Bank_Accounts'] > 20]) / len(credit_df)

len(credit_df[credit_df['Num_Bank_Accounts'] > 50]) / len(credit_df)

0.013080339119903108

0.013029853207982847

0.012535048655780966

[ Num_Bank_Accounts] 통장이 10개 이하인 데이터만 남기기 (통장개수 전처리 완료)

credit_df = credit_df[credit_df['Num_Bank_Accounts'] <= 10]

credit_df.describe()

[ Num_Credit_Card ] 카드 개수 이상치 데이터 확인

credit_df.describe()

[ Num_Credit_Card ] 카드 개수 비율보기
10개 초과와 20개 초과의 비율이 비슷함

len(credit_df[credit_df['Num_Credit_Card'] > 10]) / len(credit_df)

len(credit_df[credit_df['Num_Credit_Card'] > 20]) / len(credit_df)

0.022142379679144383

0.021975267379679145

[ Num_Credit_Card ] 카드 개수 20개 이하인 데이터만 남기기 (카드개수 전처리 완료)

credit_df = credit_df[credit_df['Num_Credit_Card'] <= 20]

credit_df.describe()

[ Interest_Rate ] 대출 이자율이 너무 높음

credit_df.describe()

[ Interest_Rate ] 대출 이자율 40 이하만 남기기

credit_df = credit_df[credit_df['Interest_Rate'] <= 40]

credit_df.describe()

[ Num_of_Loan ] 대출건수 20개 이하 데이터만 사용해도 되는 것을 확인

len(credit_df[credit_df['Num_of_Loan'] > 10])

len(credit_df[credit_df['Num_of_Loan'] > 20])

len(credit_df[credit_df['Num_of_Loan'] > 30])

len(credit_df[credit_df['Num_of_Loan'] > 40])

61
60
60
60

[ Num_of_Loan ] 대출수 이상치 데이터 확인

credit_df.describe()

[ Num_of_Loan ] 대출건수 0개 이상 20개 이하 데이터만 남기기

credit_df = credit_df[(credit_df['Num_of_Loan'] <= 20) & (credit_df['Num_of_Loan'] >= 0)]

credit_df.describe()

[ Delay_from_due_date ] 연체일이 음수인 것 이상치 데이터

credit_df.describe()

[ Delay_from_due_date ] 연체일이 음수인 데이터 삭제

credit_df = credit_df[credit_df['Delay_from_due_date'] >= 0]

credit_df.describe()

[ Num_of_Delayed_Payment ] 연체 횟수가 30회·40회를 초과하는 데이터 개수 확인 (이상치가 많음)

len(credit_df[credit_df['Num_of_Delayed_Payment'] > 30])

len(credit_df[credit_df['Num_of_Delayed_Payment'] > 40])

80
1275

[ Num_of_Delayed_Payment ] 연체 횟수가 0회 이상 30회 이하인 데이터만 저장

credit_df = credit_df[(credit_df['Num_of_Delayed_Payment'] <= 30) & (credit_df['Num_of_Delayed_Payment'] >= 0)]

credit_df.describe()

[ Num_Credit_Inquiries ] 신용조회에 null 값이 너무 많음

credit_df.info()

[ Num_Credit_Inquiries ] 열(column)의 결측치를 0으로 채우기

credit_df['Num_Credit_Inquiries'] = credit_df['Num_Credit_Inquiries'].fillna(0)

credit_df.info()

[ Credit_History_Age ] 카드사용기간
[ Amount_invested_monthly ] 매월 투자 금액
[ Monthly_Balance ] 월별 잔고
결측값 확인

credit_df.isna().sum()

[ Credit_History_Age ] 카드사용기간 : 데이터가 200~250에 몰려있음

sns.displot(credit_df['Credit_History_Age'])

[ Amount_invested_monthly ] 매월 투자 금액 : 데이터가 0~2000에 몰려있음

sns.displot(credit_df['Amount_invested_monthly'])

[ Monthly_Balance ] 월별 잔고 : 데이터가 200~400에 쏠려있음

sns.displot(credit_df['Monthly_Balance'])

[ Credit_History_Age ] 카드사용기간
[ Amount_invested_monthly ] 매월 투자 금액
[ Monthly_Balance ] 월별 잔고
결측값 중앙값(median)으로 대체
: 데이터가 쏠려있을 경우 평균값이 아닌 중앙값(50% 값)으로 채우는 게 적절

credit_df = credit_df.fillna(credit_df.median(numeric_only=True))

credit_df.isna().sum()

중복된 데이터 삭제 필요 / null 값도 처리 필요

credit_df.head()

[ Type_of_Loan ] 'and'를 제거, 결측값을 'No Loan'으로 대체

# Raw values look like "Auto Loan, Auto Loan, and Not Specified".
# Remove "and " together with its trailing space so every entry is separated
# by exactly ", ". Removing only "and" leaves ",  Not Specified" (two spaces),
# which produces space-prefixed duplicate loan types once the text is split.
credit_df['Type_of_Loan'] = credit_df['Type_of_Loan'].str.replace('and ', '', regex=False)

# Missing values are labelled 'No Loan' so they form their own category.
credit_df['Type_of_Loan'] = credit_df['Type_of_Loan'].fillna('No Loan')

credit_df.isna().mean()

Type_of_Loan 값을 콤마와 공백(', ')을 기준으로 분리하고
하나의 리스트로 합친 후 중복값 제거

# Collect the distinct loan types: split each row on ", " and flatten the
# per-row lists with explode(). Summing the lists instead concatenates them
# one row at a time, re-copying the growing list on every step.
type_list = set(credit_df['Type_of_Loan'].str.split(', ').explode().dropna())

type_list

[ Type_of_Loan ] 열에서 각각의 type_list 요소들이 포함되어 있는지 여부를 확인하여 새로운 열을 생성

# Add one 0/1 indicator column per loan type: 1 when the type's name occurs
# in the row's Type_of_Loan text, otherwise 0.
# Iterating in sorted order keeps the new columns in the same order on every
# run; plain set iteration order can differ between interpreter sessions.
for loan_type in sorted(type_list):
    credit_df[loan_type] = (
        credit_df['Type_of_Loan'].str.contains(loan_type, regex=False).astype(int)
    )

credit_df.head()

[Type_of_Loan] 열 없앤 후 확인

credit_df.drop('Type_of_Loan', axis=1, inplace=True)

credit_df.info()

문자형 데이터 원핫인코딩 필요성

credit_df.info()

[Occupation] 열에서 '_______' 값을 'Unknown'으로 바꿔주기

# Occupation

# Replace the placeholder value '_______' with 'Unknown'.

credit_df['Occupation'].value_counts()

credit_df['Occupation'] = credit_df['Occupation'].replace('_______', 'Unknown')

credit_df['Occupation'].value_counts()

[Payment_of_Min_Amount] 이상데이터 없음, 값도 3개, 원핫인코딩 바로 가능

# Payment_of_Min_Amount

credit_df['Payment_of_Min_Amount'].value_counts()

[Payment_Behaviour] 열의 '!@9#%8'를 'Unknown'으로 바꿔주기

# Payment_Behaviour

credit_df['Payment_Behaviour'].value_counts()

credit_df['Payment_Behaviour'] = credit_df['Payment_Behaviour'].replace('!@9#%8', 'Unknown')

credit_df['Payment_Behaviour'].value_counts()

[Occupation] , [Payment_of_Min_Amount] , [Payment_Behaviour] 원핫인코딩

credit_df.info()

세개의 열에 대하여 원핫인코딩하기

# 위 object 원핫인코딩

credit_df = pd.get_dummies(credit_df, columns=['Occupation', 'Payment_of_Min_Amount', 'Payment_Behaviour'])

credit_df.head()

원핫인코딩 확인

credit_df.info()

* 원핫인코딩 전 (29개열)

* 원핫인코딩 후 (51개열)

데이터 나누기

from sklearn.model_selection import train_test_split

len(credit_df)

10005

학습/테스트 데이터 분리하기

# Separate the features from the Credit_Score target, then hold out 20% of the
# rows as the test set; the fixed random_state keeps the split reproducible.
features = credit_df.drop('Credit_Score', axis=1)
target = credit_df['Credit_Score']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=2024
)

X_train.shape, y_train.shape

X_test.shape, y_test.shape

2. lightGBM(LGBM)

* Microsoft에서 개발한 트리기반 학습 알고리즘 Gradient Boosting 프레임워크
* 리프 중심 히스토그램 기반 알고리즘
* 작은 데이터셋에도 높은 성능을 보이며, 특히 대용량 데이터셋에서 다른 알고리즘보다 빠르게 학습
* 메모리 사용량이 상대적으로 적은편
* 적은 데이터셋을 사용할 경우 과적합 가능성이 매우 큼(일반적으로 데이터가 10000개 이상은 사용해야 함)
* 초기중단(early stopping)을 지원

1. 리프 중심 히스토그램 기반 알고리즘

* 트리를 균형적으로(level-wise) 분할하는 것이 아니라, 손실을 가장 크게 줄이는 리프를 우선 분할(leaf-wise)하여 트리가 불균형하게 성장
* 특성들의 분포를 히스토그램으로 나타내고, 해당 히스토그램을 이용하여 빠르게 후보 분할 기준을 선택
* 후보 분할 기준 중에서 최적의 분할 기준을 선택하기 위해,
데이터 포인트들을 히스토그램에 올바르게 배치하고 이를 이용하여 최적의 분할 기준을 선택

2. GBM(Gradient Boosting Model)

* 순차적으로 모델을 학습시킴
* 첫 번째 모델을 학습시키고,
두번째 모델은 첫번째 모델의 오류를 학습하는 식으로 진행(이런 방식으로 각 모델이 이전 모델의 오류를 보완)
* 부스팅에서는 각 데이터 포인트에 가중치를 부여, 초기에는 모든 데이터 포인트에 동일한 가중치를 부여하지만,
이후 모델이 학습되면서 잘못 예측된 데이터 포인트의 가중치를 증가시켜 다음 모델이 이 데이터 포인트에 더 주의를 기울이도록 함
(가중치를 조정하는 방식은 AdaBoost에 해당하며, Gradient Boosting은 이전 모델의 잔차(손실 함수의 기울기)를 다음 모델이 학습)
* 트리가 모두 학습된 후 각 트리의 예측값을 학습률을 곱해 모두 더하여 최종 예측을 만듦
(분류에서의 다수결 투표, 회귀에서의 예측값 평균은 랜덤 포레스트 같은 배깅 방식에서 사용)

3. 부스팅 모델의 주요 개념

* 약한 학습기(Weak Learner): 단독으로는 성능이 좋지 않은 간단한 모델
(주로 깊이가 얕은 결정 트리, 깊이가 1인 매우 간단한 학습기)을 사용
* 약한 학습기를 순차적으로 학습시키고 그 다음에는 첫번째 학습기의 오류를 보완하는 두 번째 학습기를 학습시킴

학습시키기

from lightgbm import LGBMClassifier

base_model = LGBMClassifier(random_state=2024)

base_model.fit(X_train, y_train)

예측값 생성하고 정확도 측정

pred1 = base_model.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

accuracy_score(y_test, pred1)

confusion_matrix(y_test, pred1)

print(classification_report(y_test, pred1))

proba 변수는 base_model이 예측한 각 클래스에 속할 확률을 포함하는 배열 또는 데이터
이 배열은 각 클래스에 대한 예측 확률을 나타냅니다.
각 확률 값은 0과 1 사이의 값으로, 모든 클래스에 대해 합이 1이 되어야 합니다 (다중 클래스 분류의 경우)

proba = base_model.predict_proba(X_test)

proba

5.31741083e-02, 5.47962223e-01, 3.98863668e-01

ROC-AUC(Area Under the Receiver Operating Characteristic Curve) 점수를 계산

roc_auc_score(y_test, proba, multi_class='ovr')

0.8943814823196526

y_test: 실제 클래스 레이블 (타겟 변수)입니다. 모델이 예측한 확률과 비교하여 ROC 곡선을 계산합니다.
proba: 각 클래스에 속할 확률을 포함하는 배열이나 데이터프레임입니다.
이 확률은 모델이 각 클래스에 속할 것으로 예측한 확률 값입니다.
multi_class='ovr': 다중 클래스 문제에서 사용할 전략을 나타내는 매개변수입니다.
'ovr'은 One-vs-Rest 전략을 의미하며,
이 경우 각 클래스를 다른 모든 클래스와 구분하여 이진 분류 문제로 취급합니다.

'AI > 머신러닝' 카테고리의 다른 글

12. K-평균 군집화 (KMeans) \| Marketing (0)	2024.06.17
11. 다양한 모델 성능비교 \| Air Quality UCI (0)	2024.06.17
09. 랜덤 포레스트 (Random Forest) \| Hotel (0)	2024.06.12
08. SVM, Scaling \| 손글씨 (0)	2024.06.12
07. 로지스틱 회귀(Logistic Regression) \| 인사자료 (0)	2024.06.12

leesarr-study

10. lightGBM | Credit

1. Credit 데이터셋

2. lightGBM(LGBM)

'AI > 머신러닝' 카테고리의 다른 글

티스토리툴바

10. lightGBM | Credit

1. Credit 데이터셋

2. lightGBM(LGBM)

'AI > 머신러닝' 카테고리의 다른 글

관련글

티스토리툴바