LightGBM
2023. 7. 2. 10:02
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')
# Load the Boston housing dataset
train = pd.read_csv('train.csv')
# Collect the numeric columns
train_data = train[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']]
# Split the data into input features (X) and the output/target variable (y)
X = np.array(train_data.iloc[:, 0:13])
y = np.array(train_data.iloc[:, 13])
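# Equivalent alternative: select the features and target by column name
# instead of hard-coded positions (a sketch, kept commented out):
# X = train_data.drop(columns=['MEDV']).values
# y = train_data['MEDV'].values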
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=70)
# Evaluation set (would be passed to fit() via eval_set= for early stopping; unused below)
evals = [(X_test, y_test)]
# Create the Random Forest, XGBoost, and LightGBM models
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
lgb = LGBMRegressor(random_state=42)
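# All three models use library defaults here; LightGBM's main tuning knobs
# (n_estimators, learning_rate, num_leaves) could be set explicitly, e.g.
# with illustrative, untuned values:
# lgb = LGBMRegressor(n_estimators=500, learning_rate=0.05, num_leaves=31, random_state=42)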
# Train the models
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
lgb.fit(X_train, y_train)
# Predict the target variable on the test data with each fitted model
y_pred_rf = rf.predict(X_test)
y_pred_xgb = xgb.predict(X_test)
y_pred_lgb = lgb.predict(X_test)
# Compute the R-squared values
r2_rf = r2_score(y_test, y_pred_rf)
r2_xgb = r2_score(y_test, y_pred_xgb)
r2_lgb = r2_score(y_test, y_pred_lgb)
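# Quick textual comparison of the scores computed above
print("R-squared  RF: {:.3f}  XGB: {:.3f}  LGBM: {:.3f}".format(r2_rf, r2_xgb, r2_lgb))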
# Scatter plots of actual vs. predicted values, one panel per model,
# each with a linear trend line and its R-squared value
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
results = [("RF", y_pred_rf, r2_rf), ("XGB", y_pred_xgb, r2_xgb), ("LGBM", y_pred_lgb, r2_lgb)]
for ax, (name, y_pred, r2) in zip(axes, results):
    ax.scatter(y_test, y_pred)
    ax.set_title("Predicted MEDV values by {}".format(name))
    ax.set_xlabel("Actual MEDV values")
    ax.set_ylabel("Predicted MEDV values")
    # First-degree polynomial fit for the trend line
    z = np.polyfit(y_test, y_pred, 1)
    p = np.poly1d(z)
    ax.plot(y_test, p(y_test), "r--")
    ax.text(20, 48, "R-squared = {:.3f}".format(r2))
    ax.grid(True)
plt.show()
# # Evaluate model performance (shown for the random forest predictions)
# mse = mean_squared_error(y_test, y_pred_rf)
# mae = mean_absolute_error(y_test, y_pred_rf)
# r2 = r2_score(y_test, y_pred_rf)
# print("Mean Squared Error: {:.2f}".format(mse))
# print("Mean Absolute Error: {:.2f}".format(mae))
# print("R-squared Score: {:.2f}".format(r2))
# # Compute the importance of each feature column (random forest)
# feature_importances = pd.DataFrame({'feature': train_data.iloc[:, 0:13].columns,
#                                     'importance': rf.feature_importances_})
# feature_importances = feature_importances.sort_values('importance', ascending=False)
# print(feature_importances)
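# A working version of the commented-out evaluation above, run for all three
# fitted models (uses only the metrics already imported at the top):
for name, y_pred in [("RF", y_pred_rf), ("XGB", y_pred_xgb), ("LGBM", y_pred_lgb)]:
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print("{}: MSE = {:.2f}, MAE = {:.2f}, R2 = {:.2f}".format(
        name, mse, mae, r2_score(y_test, y_pred)))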