LightGBM

2023. 7. 2. 10:02
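A quick comparison of three tree-ensemble regressors, Random Forest, XGBoost, and LightGBM, on the Boston housing data: train each model, predict MEDV on a hold-out set, and compare the R-squared scores with actual-vs-predicted scatter plots.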
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Load the Boston housing dataset
train = pd.read_csv('train.csv')

# Select the numeric feature columns and the target (MEDV)
train_data = train[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                    'DIS', 'RAD','TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']]

# Split the data into input features (X) and the target to estimate (y)
X = np.array(train_data.iloc[:, 0:13])
y = np.array(train_data.iloc[:, 13])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=70)

# Hold-out evaluation set (only takes effect when passed to fit(); see the sketch below)
evals = [(X_test, y_test)]
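As an aside, `evals` does nothing on its own; it has to be handed to `fit`. A minimal sketch of how it could drive early stopping with LightGBM, assuming a recent lightgbm version with the callback API (older versions took an `early_stopping_rounds` argument on `fit` instead):

import lightgbm

lgb_es = LGBMRegressor(n_estimators=1000, random_state=42)
lgb_es.fit(
    X_train, y_train,
    eval_set=evals,       # validation data monitored during boosting
    eval_metric="rmse",   # metric used to decide when to stop
    callbacks=[lightgbm.early_stopping(stopping_rounds=50)],  # stop after 50 rounds with no improvement
)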

# Create the Random Forest, XGBoost, and LightGBM models
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
lgb = LGBMRegressor(random_state=42)

# Train each model
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
lgb.fit(X_train, y_train)


# Predict the target variable on the test set with each fitted model
y_pred_rf = rf.predict(X_test)
y_pred_xgb = xgb.predict(X_test)
y_pred_lgb = lgb.predict(X_test)

# Compute the R-squared score for each model
r2_rf = r2_score(y_test, y_pred_rf)
r2_xgb = r2_score(y_test, y_pred_xgb)
r2_lgb = r2_score(y_test, y_pred_lgb)
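Printing the three scores gives a quick numeric comparison before looking at the plots:

print("R-squared  RF:   {:.3f}".format(r2_rf))
print("R-squared  XGB:  {:.3f}".format(r2_xgb))
print("R-squared  LGBM: {:.3f}".format(r2_lgb))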

# Scatter plots of actual vs. predicted values, one panel per model,
# with a linear trend line and the R-squared value annotated on each panel
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
results = [("RF", y_pred_rf, r2_rf), ("XGB", y_pred_xgb, r2_xgb), ("LGBM", y_pred_lgb, r2_lgb)]

for ax, (name, y_pred, r2) in zip(axes, results):
    ax.scatter(y_test, y_pred)
    ax.set_title("Predicted MEDV values by {}".format(name))
    ax.set_xlabel("Actual MEDV values")
    ax.set_ylabel("Predicted MEDV values")
    # Fit a first-degree polynomial to draw the trend line
    z = np.polyfit(y_test, y_pred, 1)
    p = np.poly1d(z)
    ax.plot(y_test, p(y_test), "r--")
    ax.text(20, 48, "R-squared = {:.3f}".format(r2))
    ax.grid(True)

plt.show()

# # Evaluate model performance (e.g., for the Random Forest predictions)
# mse = mean_squared_error(y_test, y_pred_rf)
# mae = mean_absolute_error(y_test, y_pred_rf)
# r2 = r2_score(y_test, y_pred_rf)

# print("Mean Squared Error: {:.2f}".format(mse))
# print("Mean Absolute Error: {:.2f}".format(mae))
# print("R-squared Score: {:.2f}".format(r2))

# # Compute the per-column feature importances
# feature_importances = pd.DataFrame({'feature': train_data.iloc[:, 0:13].columns,
#                                     'importance': rf.feature_importances_})
# feature_importances = feature_importances.sort_values('importance', ascending=False)
# print(feature_importances)
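The same idea extends to all three models, since RandomForestRegressor, XGBRegressor, and LGBMRegressor each expose a feature_importances_ attribute after fitting. A minimal sketch of a side-by-side table; note that the three libraries compute importance differently, so the columns are not on a directly comparable scale:

feature_names = train_data.columns[:13]
importances = pd.DataFrame({
    'feature': feature_names,
    'rf': rf.feature_importances_,    # impurity-based, normalized to sum to 1
    'xgb': xgb.feature_importances_,  # default importance type depends on the xgboost version
    'lgb': lgb.feature_importances_,  # split counts by default in lightgbm
})
print(importances.sort_values('rf', ascending=False))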
