Create a model using the features built in Kaggle House Prices ① ~ Feature Engineering ~.
```python
from typing import Tuple

import joblib  # sklearn.externals.joblib was removed; use the standalone joblib package
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
```
```python
def load_x_train() -> pd.DataFrame:
    """Load the training features created in advance.

    :return: features of the training data
    """
    return joblib.load('train_x.pkl')


def load_y_train() -> pd.Series:
    """Load the target variable of the training data created in advance.

    :return: target variable of the training data
    """
    # Load the target variable
    train_y = joblib.load('train_y.pkl')
    # Log-transform the target variable (the competition is scored on the log of the price)
    train_y = np.log1p(train_y)
    return train_y
```
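Because the model is trained on log1p-transformed targets, predictions have to be mapped back with expm1 before they can be submitted. A minimal sketch of the round trip:

```python
import numpy as np

prices = np.array([120000.0, 250000.0])
log_prices = np.log1p(prices)    # transform used for training
restored = np.expm1(log_prices)  # inverse transform for submission
assert np.allclose(prices, restored)
```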
```python
def load_index_fold(i_fold: int) -> Tuple[np.ndarray, np.ndarray]:
    """Return the record indices for the given fold in cross-validation.

    :param i_fold: fold number
    :return: tuple of (training indices, validation indices) for the fold
    """
    # Return the indices that split the data into training and validation sets.
    # Here the folds are recreated every time with a fixed random seed,
    # but saving the indices to a file is another option.
    train_y = load_y_train()
    kf = KFold(n_splits=4, random_state=6, shuffle=True)
    return list(kf.split(train_y))[i_fold]
```
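Because the random seed is fixed, every call reproduces the same split. A quick sanity check (assuming train_y.pkl already exists):

```python
# Two calls with the same fold number must return identical indices
tr_idx_a, va_idx_a = load_index_fold(0)
tr_idx_b, va_idx_b = load_index_fold(0)
assert np.array_equal(tr_idx_a, tr_idx_b)
assert np.array_equal(va_idx_a, va_idx_b)
print(len(tr_idx_a), len(va_idx_a))  # roughly a 3:1 split with n_splits=4
```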
```python
def train_fold(i_fold):
    """Train and evaluate the model on the specified cross-validation fold.

    Besides being called from the loop below, this is also useful on its own
    for quick checks and parameter tuning.

    :param i_fold: fold number
    :return: tuple of (model instance, validation record indices, predictions, evaluation score)
    """
    # Load the training data
    train_x = load_x_train()
    print(train_x.shape)
    train_y = load_y_train()

    # Split into training and validation data
    tr_idx, va_idx = load_index_fold(i_fold)
    print(tr_idx.shape)
    print(va_idx.shape)
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

    # Train the model
    params_lgbm = {
        "boosting_type": "gbdt",
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.05,
        "max_depth": 4,
        "colsample_bytree": 0.9,
        "subsample": 0.9,
        "reg_alpha": 0.1,
        "reg_lambda": 0.0,
        "min_child_weight": 1,
        "num_leaves": 31,
    }
    lgb_train = lgb.Dataset(tr_x, tr_y)
    lgb_eval = lgb.Dataset(va_x, va_y, reference=lgb_train)
    model = lgb.train(
        params_lgbm, lgb_train,
        # Pass the validation data for model evaluation
        valid_sets=lgb_eval,
        # Train for up to 1000 rounds
        num_boost_round=1000,
        # Stop training if the validation score does not improve for 10 rounds
        # (early_stopping_rounds was removed from lgb.train in LightGBM 4.x,
        # so the callback form is used here)
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

    # Predict and evaluate on the validation data
    va_pred = model.predict(va_x)
    score = np.sqrt(mean_squared_error(va_y, va_pred))

    # Return the model, validation indices, predictions, and score
    return model, va_idx, va_pred, score
```
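As the docstring notes, train_fold can also be run on its own for a quick check before committing to the full cross-validation loop:

```python
# Train only fold 0 to sanity-check the pipeline and parameters
model, va_idx, va_pred, score = train_fold(0)
print(f'fold 0 score: {score:.5f}')
```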
```python
# Run training and evaluation with cross-validation
scores = []
va_idxes = []
preds = []
n_fold = 4

# Train on each fold
for i_fold in range(n_fold):
    # Train the model
    print(f'fold {i_fold} - start training')
    model, va_idx, va_pred, score = train_fold(i_fold)
    print(f'fold {i_fold} - end training - score {score}')

    # Save the model (model.save_model(f'model-{i_fold}.txt') would also work)
    joblib.dump(model, f'model-{i_fold}.pkl')

    # Keep the results
    va_idxes.append(va_idx)
    scores.append(score)
    preds.append(va_pred)

# Combine the results of all folds, restoring the original record order
va_idxes = np.concatenate(va_idxes)
order = np.argsort(va_idxes)
preds = np.concatenate(preds, axis=0)
preds = preds[order]
print(f'end training cv - score {np.mean(scores)}')

# Save the out-of-fold predictions
joblib.dump(preds, 'pred-train.pkl')

# Evaluation results
print('result_scores', scores)
```
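With the four fold models saved, test-set predictions can be produced by averaging the models and inverting the log1p transform. A minimal sketch, assuming the feature-engineering step saved the test features as test_x.pkl (that file name is an assumption):

```python
# Minimal inference sketch (test_x.pkl is an assumed file name)
test_x = joblib.load('test_x.pkl')

# Average the predictions of the 4 fold models (still in log space)
fold_preds = [joblib.load(f'model-{i}.pkl').predict(test_x) for i in range(4)]
pred_log = np.mean(fold_preds, axis=0)

# Invert the log1p transform to get prices back
pred = np.expm1(pred_log)
print(pred[:5])
```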