This walkthrough uses Kaggle's Credit Card Fraud Detection dataset. First, load the libraries and the data.
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from sklearn import preprocessing as pp
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
%matplotlib inline
data = pd.read_csv('creditcard.csv')
print(data.shape)
print(data.columns)
print(data.dtypes)
(284807, 31)
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
'Class'],
dtype='object')
Time float64
V1 float64
V2 float64
V3 float64
V4 float64
V5 float64
V6 float64
V7 float64
V8 float64
V9 float64
V10 float64
V11 float64
V12 float64
V13 float64
V14 float64
V15 float64
V16 float64
V17 float64
V18 float64
V19 float64
V20 float64
V21 float64
V22 float64
V23 float64
V24 float64
V25 float64
V26 float64
V27 float64
V28 float64
Amount float64
Class int64
dtype: object
data.apply(lambda x: len(x.unique()))
Time 124592
V1 275663
V2 275663
V3 275663
V4 275663
V5 275663
V6 275663
V7 275663
V8 275663
V9 275663
V10 275663
V11 275663
V12 275663
V13 275663
V14 275663
V15 275663
V16 275663
V17 275663
V18 275663
V19 275663
V20 275663
V21 275663
V22 275663
V23 275663
V24 275663
V25 275663
V26 275663
V27 275663
V28 275663
Amount 32767
Class 2
dtype: int64
Of the 284,807 transactions in the dataset, 492 are fraudulent.
data['Class'].sum()
492
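For reference, a quick check (an addition, not in the original listing) quantifies the imbalance directly, assuming data is loaded as above:
fraud_rate = data['Class'].mean()
print('Fraud rate: {:.4%}'.format(fraud_rate))  # roughly 0.17%, so accuracy alone would be a misleading metric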
Outlier handling is omitted this time; we only confirm that there are no missing values.
data.isnull().sum()
Time 0
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
dtype: int64
Split the data into features and label, then standardize every column except Time:
data_X = data.copy().drop(['Class'], axis=1)  # feature matrix
data_y = data['Class'].copy()                 # target label
features_to_scale = data_X.drop(['Time'], axis=1).columns
scaler = pp.StandardScaler(copy=True)
data_X.loc[:, features_to_scale] = scaler.fit_transform(data_X[features_to_scale])
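As an optional sanity check (not in the original flow), the scaled columns should now have near-zero mean and unit variance:
print(data_X[features_to_scale].mean().abs().max())  # close to 0
print(data_X[features_to_scale].std().mean())        # close to 1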
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, stratify=data_y)
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
log_reg = LogisticRegression()
model = log_reg

training_scores = []
cv_scores = []
predictions_based_on_k_folds = pd.DataFrame(data=[], index=y_train.index, columns=[0, 1])

for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train.iloc[train_index, :], X_train.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    model.fit(X_train_fold, y_train_fold)
    log_loss_training = log_loss(y_train_fold, model.predict_proba(X_train_fold)[:, 1])
    training_scores.append(log_loss_training)

    predictions_based_on_k_folds.loc[X_cv_fold.index, :] = model.predict_proba(X_cv_fold)
    log_loss_cv = log_loss(y_cv_fold, predictions_based_on_k_folds.loc[X_cv_fold.index, 1])
    cv_scores.append(log_loss_cv)

    print('Training Log Loss: ', log_loss_training)
    print('CV Log Loss: ', log_loss_cv)

log_loss_logistic_regression = log_loss(y_train, predictions_based_on_k_folds.loc[:, 1])
print('Logistic Regression Log Loss: ', log_loss_logistic_regression)
Training Log Loss: 0.005995557191448456
CV Log Loss: 0.005125568292973096
Training Log Loss: 0.006253549879846522
CV Log Loss: 0.00484099351605527
Training Log Loss: 0.005099537613560319
CV Log Loss: 0.007849849024852518
Training Log Loss: 0.006164376210898366
CV Log Loss: 0.004896801432022977
Training Log Loss: 0.005689191528946416
CV Log Loss: 0.0072969772559491235
Logistic Regression Log Loss: 0.006002037904370599
preds = pd.concat([y_train, predictions_based_on_k_folds.loc[:,1]], axis=1)
preds.columns = ['true_label', 'prediction']
predictions_based_on_k_folds_logistic_regression = preds.copy()
precision, recall, thresholds = precision_recall_curve(preds['true_label'], preds['prediction'])
average_precision = average_precision_score(preds['true_label'], preds['prediction'])
plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Precision-Recall curve: Average Precision = {0:0.2f}'.format(average_precision))
fpr, tpr, thresholds = roc_curve(preds['true_label'], preds['prediction'])
area_under_ROC = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic: Area under the Curve = {0:0.2f}'.format(area_under_ROC))
plt.legend(loc="lower right")
plt.show()
RFC = RandomForestClassifier(n_estimators=10, class_weight='balanced')
model = RFC

training_scores = []
cv_scores = []
predictions_based_on_k_folds = pd.DataFrame(data=[], index=y_train.index, columns=[0, 1])

for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train.iloc[train_index, :], X_train.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    model.fit(X_train_fold, y_train_fold)
    log_loss_training = log_loss(y_train_fold, model.predict_proba(X_train_fold)[:, 1])
    training_scores.append(log_loss_training)

    predictions_based_on_k_folds.loc[X_cv_fold.index, :] = model.predict_proba(X_cv_fold)
    log_loss_cv = log_loss(y_cv_fold, predictions_based_on_k_folds.loc[X_cv_fold.index, 1])
    cv_scores.append(log_loss_cv)

    print('Training Log Loss: ', log_loss_training)
    print('CV Log Loss: ', log_loss_cv)

log_loss_random_forest = log_loss(y_train, predictions_based_on_k_folds.loc[:, 1])
print('Random Forest Log Loss: ', log_loss_random_forest)
preds = pd.concat([y_train, predictions_based_on_k_folds.loc[:,1]], axis=1)
preds.columns = ['true_label', 'prediction']
predictions_based_on_k_folds_random_forests = preds.copy()
precision, recall, thresholds = precision_recall_curve(preds['true_label'], preds['prediction'])
average_precision = average_precision_score(preds['true_label'], preds['prediction'])
plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Precision-Recall curve: Average Precision = {0:0.2f}'.format(average_precision))
Training Log Loss: 0.00036571581908744426
CV Log Loss: 0.013690949627129164
Training Log Loss: 0.0004235723689615818
CV Log Loss: 0.00570945955148682
Training Log Loss: 0.00037000075061198505
CV Log Loss: 0.012404725764776376
Training Log Loss: 0.00039448357820150154
CV Log Loss: 0.009696866082135918
Training Log Loss: 0.00039912406259827595
CV Log Loss: 0.008095821155055213
Random Forest Log Loss: 0.009919564436116697
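The training log loss is far smaller than the CV log loss, which suggests the random forest overfits. Averaging the stored fold scores makes the gap explicit (a convenience addition, not in the original):
print('Mean training log loss: ', np.mean(training_scores))
print('Mean CV log loss: ', np.mean(cv_scores))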
params_xGB = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

training_scores = []
cv_scores = []
predictions_based_on_k_folds = pd.DataFrame(data=[], index=y_train.index, columns=['prediction'])

for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train.iloc[train_index, :], X_train.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    dtrain = xgb.DMatrix(data=X_train_fold, label=y_train_fold)
    dCV = xgb.DMatrix(data=X_cv_fold)

    # Inner cross-validation picks the number of boosting rounds for this fold.
    bst = xgb.cv(params_xGB, dtrain, num_boost_round=2000, nfold=5,
                 early_stopping_rounds=200, verbose_eval=50)
    best_rounds = np.argmin(bst['test-logloss-mean'])
    bst = xgb.train(params_xGB, dtrain, best_rounds)

    log_loss_training = log_loss(y_train_fold, bst.predict(dtrain))
    training_scores.append(log_loss_training)

    predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'] = bst.predict(dCV)
    log_loss_cv = log_loss(y_cv_fold, predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'])
    cv_scores.append(log_loss_cv)

    print('Training Log Loss: ', log_loss_training)
    print('CV Log Loss: ', log_loss_cv)

log_loss_xgboost_gradient_boosting = log_loss(y_train, predictions_based_on_k_folds.loc[:, 'prediction'])
print('XGBoost Gradient Boosting Log Loss: ', log_loss_xgboost_gradient_boosting)
preds = pd.concat([y_train, predictions_based_on_k_folds.loc[:, 'prediction']], axis=1)
preds.columns = ['true_label', 'prediction']
predictions_based_on_k_folds_xgboost_gradient_boosting = preds.copy()

fpr, tpr, thresholds = roc_curve(preds['true_label'], preds['prediction'])
area_under_ROC = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic: Area under the Curve = {0:0.2f}'.format(area_under_ROC))
plt.legend(loc="lower right")
plt.show()
[0] train-logloss:0.43780+0.00001 test-logloss:0.43797+0.00002
[50] train-logloss:0.00012+0.00000 test-logloss:0.00260+0.00027
[100] train-logloss:0.00005+0.00000 test-logloss:0.00279+0.00026
[150] train-logloss:0.00004+0.00000 test-logloss:0.00288+0.00026
[200] train-logloss:0.00003+0.00000 test-logloss:0.00292+0.00027
Training Log Loss: 0.000792206342624254
CV Log Loss: 0.0029518170688098444
[0] train-logloss:0.43784+0.00002 test-logloss:0.43805+0.00007
[50] train-logloss:0.00013+0.00000 test-logloss:0.00292+0.00084
[100] train-logloss:0.00005+0.00000 test-logloss:0.00319+0.00095
[150] train-logloss:0.00004+0.00000 test-logloss:0.00328+0.00098
[200] train-logloss:0.00003+0.00000 test-logloss:0.00332+0.00099
Training Log Loss: 0.0005385763160533223
CV Log Loss: 0.001907025524733791
[0] train-logloss:0.43780+0.00002 test-logloss:0.43798+0.00009
[50] train-logloss:0.00011+0.00001 test-logloss:0.00278+0.00088
[100] train-logloss:0.00005+0.00000 test-logloss:0.00299+0.00097
[150] train-logloss:0.00004+0.00000 test-logloss:0.00308+0.00102
[200] train-logloss:0.00003+0.00000 test-logloss:0.00314+0.00104
Training Log Loss: 0.0008465584373083585
CV Log Loss: 0.003197665909513871
[0] train-logloss:0.43785+0.00004 test-logloss:0.43802+0.00006
[50] train-logloss:0.00012+0.00001 test-logloss:0.00293+0.00057
[100] train-logloss:0.00005+0.00000 test-logloss:0.00320+0.00063
[150] train-logloss:0.00004+0.00000 test-logloss:0.00329+0.00065
[200] train-logloss:0.00003+0.00000 test-logloss:0.00335+0.00067
Training Log Loss: 0.0005723772843934578
CV Log Loss: 0.001984392100999932
[0] train-logloss:0.43786+0.00003 test-logloss:0.43803+0.00006
[50] train-logloss:0.00013+0.00001 test-logloss:0.00290+0.00095
[100] train-logloss:0.00005+0.00000 test-logloss:0.00310+0.00104
[150] train-logloss:0.00004+0.00000 test-logloss:0.00316+0.00107
[200] train-logloss:0.00003+0.00000 test-logloss:0.00320+0.00108
Training Log Loss: 0.000810889205186624
CV Log Loss: 0.002501448645341887
XGBoost Gradient Boosting Log Loss: 0.002508469849879866
params_lightGBM = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 4,
    'learning_rate': 0.01
}

training_scores = []
cv_scores = []
predictions_based_on_k_folds = pd.DataFrame(data=[], index=y_train.index, columns=['prediction'])

for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train.iloc[train_index, :], X_train.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    lgb_train = lgb.Dataset(X_train_fold, y_train_fold)
    lgb_eval = lgb.Dataset(X_cv_fold, y_cv_fold, reference=lgb_train)
    # On LightGBM >= 4.0, replace early_stopping_rounds with
    # callbacks=[lgb.early_stopping(200)].
    gbm = lgb.train(params_lightGBM, lgb_train, num_boost_round=2000,
                    valid_sets=lgb_eval, early_stopping_rounds=200)

    log_loss_training = log_loss(y_train_fold, gbm.predict(X_train_fold, num_iteration=gbm.best_iteration))
    training_scores.append(log_loss_training)

    predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'] = gbm.predict(X_cv_fold, num_iteration=gbm.best_iteration)
    log_loss_cv = log_loss(y_cv_fold, predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'])
    cv_scores.append(log_loss_cv)

    print('Training Log Loss: ', log_loss_training)
    print('CV Log Loss: ', log_loss_cv)

log_loss_lightgbm_gradient_boosting = log_loss(y_train, predictions_based_on_k_folds.loc[:, 'prediction'])
print('LightGBM Gradient Boosting Log Loss: ', log_loss_lightgbm_gradient_boosting)
preds = pd.concat([y_train, predictions_based_on_k_folds.loc[:, 'prediction']], axis=1)
preds.columns = ['true_label', 'prediction']
predictions_based_on_k_folds_light_gbm_gradient_boosting = preds.copy()

fpr, tpr, thresholds = roc_curve(preds['true_label'], preds['prediction'])
area_under_ROC = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic: Area under the Curve = {0:0.2f}'.format(area_under_ROC))
plt.legend(loc="lower right")
plt.show()
predictions_test_set_logistic_regression = pd.DataFrame(data=[], index=y_test.index, columns=['prediction'])
predictions_test_set_logistic_regression.loc[:, 'prediction'] = log_reg.predict_proba(X_test)[:,1]
log_loss_test_set_logistic_regression = log_loss(y_test, predictions_test_set_logistic_regression)
predictions_test_set_random_forests = pd.DataFrame(data=[], index=y_test.index, columns=['prediction'])
predictions_test_set_random_forests.loc[:, 'prediction'] = RFC.predict_proba(X_test)[:,1]
log_loss_test_set_random_forests = log_loss(y_test, predictions_test_set_random_forests)
predictions_test_set_xgboost_gradient_boosting = pd.DataFrame(data=[], index=y_test.index, columns=['prediction'])
dtest=xgb.DMatrix(data=X_test)
predictions_test_set_xgboost_gradient_boosting.loc[:, 'prediction'] = bst.predict(dtest)
log_loss_test_set_xgboost_gradient_boosting = log_loss(y_test, predictions_test_set_xgboost_gradient_boosting)
predictions_test_set_light_gbm_gradient_boosting = pd.DataFrame(data=[], index=y_test.index, columns=['prediction'])
predictions_test_set_light_gbm_gradient_boosting.loc[:, 'prediction'] = gbm.predict(X_test, num_iteration=gbm.best_iteration)
log_loss_test_set_light_gbm_gradient_boosting = log_loss(y_test, predictions_test_set_light_gbm_gradient_boosting)
print('Log Loss of Logistic Regression on Test Set: ', log_loss_test_set_logistic_regression)
print('Log Loss of Random Forests on Test Set: ', log_loss_test_set_random_forests)
print('Log Loss of XGBoost Gradient Boosting on Test Set: ', log_loss_test_set_xgboost_gradient_boosting)
print('Log Loss of LightGBM Gradient Boosting on Test Set: ', log_loss_test_set_light_gbm_gradient_boosting)
Log Loss of Logistic Regression on Test Set: 0.006119615555779465
Log Loss of Random Forests on Test Set: 0.012553984393960552
Log Loss of XGBoost Gradient Boosting on Test Set: 0.003148207871388518
Log Loss of LightGBM Gradient Boosting on Test Set: 0.00333516364607809
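The log loss figures above compare predicted probabilities directly. To inspect hard classifications, the already-imported confusion_matrix and classification_report can be applied at a chosen cutoff; the 0.50 threshold below is an illustrative assumption, not a tuned operating point:
threshold = 0.50  # assumed cutoff; in practice, choose it from the precision-recall curve
y_pred = (predictions_test_set_xgboost_gradient_boosting['prediction'] > threshold).astype(int)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))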
prediction_based_on_kfolds_four_models = (
    pd.DataFrame(data=[], index=y_train.index)
    .join(predictions_based_on_k_folds_logistic_regression['prediction'].astype(float), how='left')
    .join(predictions_based_on_k_folds_random_forests['prediction'].astype(float), how='left', rsuffix='2')
    .join(predictions_based_on_k_folds_xgboost_gradient_boosting['prediction'].astype(float), how='left', rsuffix='3')
    .join(predictions_based_on_k_folds_light_gbm_gradient_boosting['prediction'].astype(float), how='left', rsuffix='4')
)
prediction_based_on_kfolds_four_models.columns = ['predsLR', 'predsRF', 'predsXGB', 'predsLightGBM']
X_train_with_predictions = X_train.merge(prediction_based_on_kfolds_four_models, left_index=True, right_index=True)
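A quick shape check (added for clarity) confirms that the stacked training matrix now carries the original features plus the four prediction columns:
print(X_train.shape, X_train_with_predictions.shape)  # e.g. (213605, 30) vs. (213605, 34)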
params_lightGBM = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 4,
    'learning_rate': 0.01
}

training_scores = []
cv_scores = []
predictions_based_on_k_folds = pd.DataFrame(data=[], index=y_train.index, columns=['prediction'])

for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train_with_predictions.iloc[train_index, :], X_train_with_predictions.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    lgb_train = lgb.Dataset(X_train_fold, y_train_fold)
    lgb_eval = lgb.Dataset(X_cv_fold, y_cv_fold, reference=lgb_train)
    gbm = lgb.train(params_lightGBM, lgb_train, num_boost_round=2000,
                    valid_sets=lgb_eval, early_stopping_rounds=200)

    log_loss_training = log_loss(y_train_fold, gbm.predict(X_train_fold, num_iteration=gbm.best_iteration))
    training_scores.append(log_loss_training)

    predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'] = gbm.predict(X_cv_fold, num_iteration=gbm.best_iteration)
    log_loss_cv = log_loss(y_cv_fold, predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'])
    cv_scores.append(log_loss_cv)

    print('Training Log Loss: ', log_loss_training)
    print('CV Log Loss: ', log_loss_cv)

log_loss_ensemble = log_loss(y_train, predictions_based_on_k_folds.loc[:, 'prediction'])
print('Ensemble Log Loss: ', log_loss_ensemble)
preds = pd.concat([y_train, predictions_based_on_k_folds.loc[:,'prediction']], axis=1)
preds.columns = ['true_label', 'prediction']
predictions_based_on_k_folds_ensemble = preds.copy()
precision, recall, thresholds = precision_recall_curve(preds['true_label'], preds['prediction'])
average_precision = average_precision_score(preds['true_label'], preds['prediction'])
plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Precision-Recall curve: Average Precision = {0:0.2f}'.format(average_precision))
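To compare the ensemble against the individual models on the training folds, average precision can be recomputed from the saved prediction frames (a convenience addition, not in the original):
for name, frame in [('LR', predictions_based_on_k_folds_logistic_regression),
                    ('RF', predictions_based_on_k_folds_random_forests),
                    ('XGB', predictions_based_on_k_folds_xgboost_gradient_boosting),
                    ('LightGBM', predictions_based_on_k_folds_light_gbm_gradient_boosting),
                    ('Ensemble', predictions_based_on_k_folds_ensemble)]:
    ap = average_precision_score(frame['true_label'], frame['prediction'])
    print('{}: Average Precision = {:.4f}'.format(name, ap))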
# Scoring new, unlabeled data with the trained pipeline.
new_data = pd.read_csv('')  # path intentionally left blank in the original
new_features_to_scale = new_data.drop(['Time'], axis=1).columns
# Reuse the scaler fitted on the training data: transform, not fit_transform.
new_data.loc[:, new_features_to_scale] = scaler.transform(new_data[new_features_to_scale])
gbm.predict(new_data, num_iteration=gbm.best_iteration)
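Note that gbm at this point is the second-stage ensemble model, trained on the stacked matrix, so raw new data lacks the four prediction columns it expects. A minimal sketch of the full scoring path, assuming the fitted base models are still in scope and that the first-stage LightGBM booster was saved under the hypothetical name gbm_base before the ensemble stage overwrote gbm:
new_data_aug = new_data.copy()
new_data_aug['predsLR'] = log_reg.predict_proba(new_data)[:, 1]
new_data_aug['predsRF'] = RFC.predict_proba(new_data)[:, 1]
new_data_aug['predsXGB'] = bst.predict(xgb.DMatrix(data=new_data))
new_data_aug['predsLightGBM'] = gbm_base.predict(new_data, num_iteration=gbm_base.best_iteration)  # gbm_base is hypothetical
fraud_probabilities = gbm.predict(new_data_aug, num_iteration=gbm.best_iteration)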