This walkthrough uses Kaggle's Credit Card Fraud Detection dataset. First, load the libraries and the data.
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from sklearn import preprocessing as pp
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
%matplotlib inline
data = pd.read_csv('creditcard.csv')
print(data.shape)
print(data.columns)
print(data.dtypes)
(284807, 31)
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
'Class'],
dtype='object')
Time float64
V1 float64
V2 float64
V3 float64
V4 float64
V5 float64
V6 float64
V7 float64
V8 float64
V9 float64
V10 float64
V11 float64
V12 float64
V13 float64
V14 float64
V15 float64
V16 float64
V17 float64
V18 float64
V19 float64
V20 float64
V21 float64
V22 float64
V23 float64
V24 float64
V25 float64
V26 float64
V27 float64
V28 float64
Amount float64
Class int64
dtype: object
data.apply(lambda x: len(x.unique()))
Time 124592
V1 275663
V2 275663
V3 275663
V4 275663
V5 275663
V6 275663
V7 275663
V8 275663
V9 275663
V10 275663
V11 275663
V12 275663
V13 275663
V14 275663
V15 275663
V16 275663
V17 275663
V18 275663
V19 275663
V20 275663
V21 275663
V22 275663
V23 275663
V24 275663
V25 275663
V26 275663
V27 275663
V28 275663
Amount 32767
Class 2
dtype: int64
Of the 284,807 transactions in the dataset, 492 are fraudulent.
data['Class'].sum()
492
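For reference, a quick check (an addition, not in the original listing) quantifies the imbalance directly, assuming data is loaded as above:
fraud_rate = data['Class'].mean()
print('Fraud rate: {:.4%}'.format(fraud_rate))  # roughly 0.17%, so accuracy alone would be a misleading metric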
Outlier handling is omitted this time; we only confirm that there are no missing values.
data.isnull().sum()
Time 0
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
dtype: int64
Split the data into features and label, then standardize every column except Time:
data_X = data.copy().drop(['Class'], axis=1)  # feature matrix
data_y = data['Class'].copy()                 # target label
features_to_scale = data_X.drop(['Time'], axis=1).columns
scaler = pp.StandardScaler(copy=True)
data_X.loc[:, features_to_scale] = scaler.fit_transform(data_X[features_to_scale])
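As an optional sanity check (not in the original flow), the scaled columns should now have near-zero mean and unit variance:
print(data_X[features_to_scale].mean().abs().max())  # close to 0
print(data_X[features_to_scale].std().mean())        # close to 1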
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, stratify=data_y)
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
log_reg = LogisticRegression()
model = log_reg

training_scores = []
cv_scores = []
predictions_based_on_k_folds = pd.DataFrame(data=[], index=y_train.index, columns=[0, 1])

for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train.iloc[train_index, :], X_train.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    model.fit(X_train_fold, y_train_fold)
    log_loss_training = log_loss(y_train_fold, model.predict_proba(X_train_fold)[:, 1])
    training_scores.append(log_loss_training)

    predictions_based_on_k_folds.loc[X_cv_fold.index, :] = model.predict_proba(X_cv_fold)
    log_loss_cv = log_loss(y_cv_fold, predictions_based_on_k_folds.loc[X_cv_fold.index, 1])
    cv_scores.append(log_loss_cv)

    print('Training Log Loss: ', log_loss_training)
    print('CV Log Loss: ', log_loss_cv)

log_loss_logistic_regression = log_loss(y_train, predictions_based_on_k_folds.loc[:, 1])
print('Logistic Regression Log Loss: ', log_loss_logistic_regression)
Training Log Loss: 0.005995557191448456
CV Log Loss: 0.005125568292973096
Training Log Loss: 0.006253549879846522
CV Log Loss: 0.00484099351605527
Training Log Loss: 0.005099537613560319
CV Log Loss: 0.007849849024852518
Training Log Loss: 0.006164376210898366
CV Log Loss: 0.004896801432022977
Training Log Loss: 0.005689191528946416
CV Log Loss: 0.0072969772559491235
Logistic Regression Log Loss: 0.006002037904370599
preds = pd.concat([y_train, predictions_based_on_k_folds.loc[:,1]], axis=1)
preds.columns = ['true_label', 'prediction']
predictions_based_on_k_folds_logistic_regression = preds.copy()
precision, recall, thresholds = precision_recall_curve(preds['true_label'], preds['prediction'])
average_precision = average_precision_score(preds['true_label'], preds['prediction'])
plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Precision-Recall curve: Average Precision = {0:0.2f}'.format(average_precision))
fpr, tpr, thresholds = roc_curve(preds['true_label'], preds['prediction'])
area_under_ROC = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic: Area under the Curve = {0:0.2f}'.format(area_under_ROC))
plt.legend(loc="lower right")
plt.show()
RFC = RandomForestClassifier(n_estimators=10, class_weight='balanced')
model = RFC

training_scores = []
cv_scores = []
predictions_based_on_k_folds = pd.DataFrame(data=[], index=y_train.index, columns=[0, 1])

for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train.iloc[train_index, :], X_train.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    model.fit(X_train_fold, y_train_fold)
    log_loss_training = log_loss(y_train_fold, model.predict_proba(X_train_fold)[:, 1])
    training_scores.append(log_loss_training)

    predictions_based_on_k_folds.loc[X_cv_fold.index, :] = model.predict_proba(X_cv_fold)
    log_loss_cv = log_loss(y_cv_fold, predictions_based_on_k_folds.loc[X_cv_fold.index, 1])
    cv_scores.append(log_loss_cv)

    print('Training Log Loss: ', log_loss_training)
    print('CV Log Loss: ', log_loss_cv)

log_loss_random_forest = log_loss(y_train, predictions_based_on_k_folds.loc[:, 1])
print('Random Forest Log Loss: ', log_loss_random_forest)
preds = pd.concat([y_train, predictions_based_on_k_folds.loc[:,1]], axis=1)
preds.columns = ['true_label', 'prediction']
predictions_based_on_k_folds_random_forests = preds.copy()
precision, recall, thresholds = precision_recall_curve(preds['true_label'], preds['prediction'])
average_precision = average_precision_score(preds['true_label'], preds['prediction'])
plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Precision-Recall curve: Average Precision = {0:0.2f}'.format(average_precision))
Training Log Loss: 0.00036571581908744426
CV Log Loss: 0.013690949627129164
Training Log Loss: 0.0004235723689615818
CV Log Loss: 0.00570945955148682
Training Log Loss: 0.00037000075061198505
CV Log Loss: 0.012404725764776376
Training Log Loss: 0.00039448357820150154
CV Log Loss: 0.009696866082135918
Training Log Loss: 0.00039912406259827595
CV Log Loss: 0.008095821155055213
Random Forest Log Loss: 0.009919564436116697
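The training log loss is far smaller than the CV log loss, which suggests the random forest overfits. Averaging the stored fold scores makes the gap explicit (a convenience addition, not in the original):
print('Mean training log loss: ', np.mean(training_scores))
print('Mean CV log loss: ', np.mean(cv_scores))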
params_xGB = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

training_scores = []
cv_scores = []
predictions_based_on_k_folds = pd.DataFrame(data=[], index=y_train.index, columns=['prediction'])

for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train.iloc[train_index, :], X_train.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    dtrain = xgb.DMatrix(data=X_train_fold, label=y_train_fold)
    dCV = xgb.DMatrix(data=X_cv_fold)

    # Inner cross-validation picks the number of boosting rounds for this fold.
    bst = xgb.cv(params_xGB, dtrain, num_boost_round=2000, nfold=5,
                 early_stopping_rounds=200, verbose_eval=50)
    best_rounds = np.argmin(bst['test-logloss-mean'])
    bst = xgb.train(params_xGB, dtrain, best_rounds)

    log_loss_training = log_loss(y_train_fold, bst.predict(dtrain))
    training_scores.append(log_loss_training)

    predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'] = bst.predict(dCV)
    log_loss_cv = log_loss(y_cv_fold, predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'])
    cv_scores.append(log_loss_cv)

    print('Training Log Loss: ', log_loss_training)
    print('CV Log Loss: ', log_loss_cv)

log_loss_xgboost_gradient_boosting = log_loss(y_train, predictions_based_on_k_folds.loc[:, 'prediction'])
print('XGBoost Gradient Boosting Log Loss: ', log_loss_xgboost_gradient_boosting)
preds = pd.concat([y_train, predictions_based_on_k_folds.loc[:, 'prediction']], axis=1)
preds.columns = ['true_label', 'prediction']
predictions_based_on_k_folds_xgboost_gradient_boosting = preds.copy()

fpr, tpr, thresholds = roc_curve(preds['true_label'], preds['prediction'])
area_under_ROC = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic: Area under the Curve = {0:0.2f}'.format(area_under_ROC))
plt.legend(loc="lower right")
plt.show()
[0] train-logloss:0.43780+0.00001 test-logloss:0.43797+0.00002
[50] train-logloss:0.00012+0.00000 test-logloss:0.00260+0.00027
[100] train-logloss:0.00005+0.00000 test-logloss:0.00279+0.00026
[150] train-logloss:0.00004+0.00000 test-logloss:0.00288+0.00026
[200] train-logloss:0.00003+0.00000 test-logloss:0.00292+0.00027
Training Log Loss: 0.000792206342624254
CV Log Loss: 0.0029518170688098444
[0] train-logloss:0.43784+0.00002 test-logloss:0.43805+0.00007
[50] train-logloss:0.00013+0.00000 test-logloss:0.00292+0.00084
[100] train-logloss:0.00005+0.00000 test-logloss:0.00319+0.00095
[150] train-logloss:0.00004+0.00000 test-logloss:0.00328+0.00098
[200] train-logloss:0.00003+0.00000 test-logloss:0.00332+0.00099
Training Log Loss: 0.0005385763160533223
CV Log Loss: 0.001907025524733791
[0] train-logloss:0.43780+0.00002 test-logloss:0.43798+0.00009
[50] train-logloss:0.00011+0.00001 test-logloss:0.00278+0.00088
[100] train-logloss:0.00005+0.00000 test-logloss:0.00299+0.00097
[150] train-logloss:0.00004+0.00000 test-logloss:0.00308+0.00102
[200] train-logloss:0.00003+0.00000 test-logloss:0.00314+0.00104
Training Log Loss: 0.0008465584373083585
CV Log Loss: 0.003197665909513871
[0] train-logloss:0.43785+0.00004 test-logloss:0.43802+0.00006
[50] train-logloss:0.00012+0.00001 test-logloss:0.00293+0.00057
[100] train-logloss:0.00005+0.00000 test-logloss:0.00320+0.00063
[150] train-logloss:0.00004+0.00000 test-logloss:0.00329+0.00065
[200] train-logloss:0.00003+0.00000 test-logloss:0.00335+0.00067
Training Log Loss: 0.0005723772843934578
CV Log Loss: 0.001984392100999932
[0] train-logloss:0.43786+0.00003 test-logloss:0.43803+0.00006
[50] train-logloss:0.00013+0.00001 test-logloss:0.00290+0.00095
[100] train-logloss:0.00005+0.00000 test-logloss:0.00310+0.00104
[150] train-logloss:0.00004+0.00000 test-logloss:0.00316+0.00107
[200] train-logloss:0.00003+0.00000 test-logloss:0.00320+0.00108
Training Log Loss: 0.000810889205186624
CV Log Loss: 0.002501448645341887
XGBoost Gradient Boosting Log Loss: 0.002508469849879866
params_lightGBM = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 4,
    'learning_rate': 0.01
}

training_scores = []
cv_scores = []
predictions_based_on_k_folds = pd.DataFrame(data=[], index=y_train.index, columns=['prediction'])

for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train.iloc[train_index, :], X_train.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    lgb_train = lgb.Dataset(X_train_fold, y_train_fold)
    lgb_eval = lgb.Dataset(X_cv_fold, y_cv_fold, reference=lgb_train)
    # On LightGBM >= 4.0, replace early_stopping_rounds with
    # callbacks=[lgb.early_stopping(200)].
    gbm = lgb.train(params_lightGBM, lgb_train, num_boost_round=2000,
                    valid_sets=lgb_eval, early_stopping_rounds=200)

    log_loss_training = log_loss(y_train_fold, gbm.predict(X_train_fold, num_iteration=gbm.best_iteration))
    training_scores.append(log_loss_training)

    predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'] = gbm.predict(X_cv_fold, num_iteration=gbm.best_iteration)
    log_loss_cv = log_loss(y_cv_fold, predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'])
    cv_scores.append(log_loss_cv)

    print('Training Log Loss: ', log_loss_training)
    print('CV Log Loss: ', log_loss_cv)

log_loss_lightgbm_gradient_boosting = log_loss(y_train, predictions_based_on_k_folds.loc[:, 'prediction'])
print('LightGBM Gradient Boosting Log Loss: ', log_loss_lightgbm_gradient_boosting)
preds = pd.concat([y_train, predictions_based_on_k_folds.loc[:, 'prediction']], axis=1)
preds.columns = ['true_label', 'prediction']
predictions_based_on_k_folds_light_gbm_gradient_boosting = preds.copy()

fpr, tpr, thresholds = roc_curve(preds['true_label'], preds['prediction'])
area_under_ROC = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic: Area under the Curve = {0:0.2f}'.format(area_under_ROC))
plt.legend(loc="lower right")
plt.show()
predictions_test_set_logistic_regression = pd.DataFrame(data=[], index=y_test.index, columns=['prediction'])
predictions_test_set_logistic_regression.loc[:, 'prediction'] = log_reg.predict_proba(X_test)[:,1]
log_loss_test_set_logistic_regression = log_loss(y_test, predictions_test_set_logistic_regression)
predictions_test_set_random_forests = pd.DataFrame(data=[], index=y_test.index, columns=['prediction'])
predictions_test_set_random_forests.loc[:, 'prediction'] = RFC.predict_proba(X_test)[:,1]
log_loss_test_set_random_forests = log_loss(y_test, predictions_test_set_random_forests)
predictions_test_set_xgboost_gradient_boosting = pd.DataFrame(data=[], index=y_test.index, columns=['prediction'])
dtest=xgb.DMatrix(data=X_test)
predictions_test_set_xgboost_gradient_boosting.loc[:, 'prediction'] = bst.predict(dtest)
log_loss_test_set_xgboost_gradient_boosting = log_loss(y_test, predictions_test_set_xgboost_gradient_boosting)
predictions_test_set_light_gbm_gradient_boosting = pd.DataFrame(data=[], index=y_test.index, columns=['prediction'])
predictions_test_set_light_gbm_gradient_boosting.loc[:, 'prediction'] = gbm.predict(X_test, num_iteration=gbm.best_iteration)
log_loss_test_set_light_gbm_gradient_boosting = log_loss(y_test, predictions_test_set_light_gbm_gradient_boosting)
print('Log Loss of Logistic Regression on Test Set: ', log_loss_test_set_logistic_regression)
print('Log Loss of Random Forests on Test Set: ', log_loss_test_set_random_forests)
print('Log Loss of XGBoost Gradient Boosting on Test Set: ', log_loss_test_set_xgboost_gradient_boosting)
print('Log Loss of LightGBM Gradient Boosting on Test Set: ', log_loss_test_set_light_gbm_gradient_boosting)
Log Loss of Logistic Regression on Test Set: 0.006119615555779465
Log Loss of Random Forests on Test Set: 0.012553984393960552
Log Loss of XGBoost Gradient Boosting on Test Set: 0.003148207871388518
Log Loss of LightGBM Gradient Boosting on Test Set: 0.00333516364607809
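The log loss figures above compare predicted probabilities directly. To inspect hard classifications, the already-imported confusion_matrix and classification_report can be applied at a chosen cutoff; the 0.50 threshold below is an illustrative assumption, not a tuned operating point:
threshold = 0.50  # assumed cutoff; in practice, choose it from the precision-recall curve
y_pred = (predictions_test_set_xgboost_gradient_boosting['prediction'] > threshold).astype(int)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))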
prediction_based_on_kfolds_four_models = (
    pd.DataFrame(data=[], index=y_train.index)
    .join(predictions_based_on_k_folds_logistic_regression['prediction'].astype(float), how='left')
    .join(predictions_based_on_k_folds_random_forests['prediction'].astype(float), how='left', rsuffix='2')
    .join(predictions_based_on_k_folds_xgboost_gradient_boosting['prediction'].astype(float), how='left', rsuffix='3')
    .join(predictions_based_on_k_folds_light_gbm_gradient_boosting['prediction'].astype(float), how='left', rsuffix='4')
)
prediction_based_on_kfolds_four_models.columns = ['predsLR', 'predsRF', 'predsXGB', 'predsLightGBM']
X_train_with_predictions = X_train.merge(prediction_based_on_kfolds_four_models, left_index=True, right_index=True)
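A quick shape check (added for clarity) confirms that the stacked training matrix now carries the original features plus the four prediction columns:
print(X_train.shape, X_train_with_predictions.shape)  # e.g. (213605, 30) vs. (213605, 34)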
params_lightGBM = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 4,
    'learning_rate': 0.01
}

training_scores = []
cv_scores = []
predictions_based_on_k_folds = pd.DataFrame(data=[], index=y_train.index, columns=['prediction'])

for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train_with_predictions.iloc[train_index, :], X_train_with_predictions.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    lgb_train = lgb.Dataset(X_train_fold, y_train_fold)
    lgb_eval = lgb.Dataset(X_cv_fold, y_cv_fold, reference=lgb_train)
    gbm = lgb.train(params_lightGBM, lgb_train, num_boost_round=2000,
                    valid_sets=lgb_eval, early_stopping_rounds=200)

    log_loss_training = log_loss(y_train_fold, gbm.predict(X_train_fold, num_iteration=gbm.best_iteration))
    training_scores.append(log_loss_training)

    predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'] = gbm.predict(X_cv_fold, num_iteration=gbm.best_iteration)
    log_loss_cv = log_loss(y_cv_fold, predictions_based_on_k_folds.loc[X_cv_fold.index, 'prediction'])
    cv_scores.append(log_loss_cv)

    print('Training Log Loss: ', log_loss_training)
    print('CV Log Loss: ', log_loss_cv)

log_loss_ensemble = log_loss(y_train, predictions_based_on_k_folds.loc[:, 'prediction'])
print('Ensemble Log Loss: ', log_loss_ensemble)
preds = pd.concat([y_train, predictions_based_on_k_folds.loc[:,'prediction']], axis=1)
preds.columns = ['true_label', 'prediction']
predictions_based_on_k_folds_ensemble = preds.copy()
precision, recall, thresholds = precision_recall_curve(preds['true_label'], preds['prediction'])
average_precision = average_precision_score(preds['true_label'], preds['prediction'])
plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Precision-Recall curve: Average Precision = {0:0.2f}'.format(average_precision))
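To compare the ensemble against the individual models on the training folds, average precision can be recomputed from the saved prediction frames (a convenience addition, not in the original):
for name, frame in [('LR', predictions_based_on_k_folds_logistic_regression),
                    ('RF', predictions_based_on_k_folds_random_forests),
                    ('XGB', predictions_based_on_k_folds_xgboost_gradient_boosting),
                    ('LightGBM', predictions_based_on_k_folds_light_gbm_gradient_boosting),
                    ('Ensemble', predictions_based_on_k_folds_ensemble)]:
    ap = average_precision_score(frame['true_label'], frame['prediction'])
    print('{}: Average Precision = {:.4f}'.format(name, ap))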
# Scoring new, unlabeled data with the trained pipeline.
new_data = pd.read_csv('')  # path intentionally left blank in the original
new_features_to_scale = new_data.drop(['Time'], axis=1).columns
# Reuse the scaler fitted on the training data: transform, not fit_transform.
new_data.loc[:, new_features_to_scale] = scaler.transform(new_data[new_features_to_scale])
gbm.predict(new_data, num_iteration=gbm.best_iteration)
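Note that gbm at this point is the second-stage ensemble model, trained on the stacked matrix, so raw new data lacks the four prediction columns it expects. A minimal sketch of the full scoring path, assuming the fitted base models are still in scope and that the first-stage LightGBM booster was saved under the hypothetical name gbm_base before the ensemble stage overwrote gbm:
new_data_aug = new_data.copy()
new_data_aug['predsLR'] = log_reg.predict_proba(new_data)[:, 1]
new_data_aug['predsRF'] = RFC.predict_proba(new_data)[:, 1]
new_data_aug['predsXGB'] = bst.predict(xgb.DMatrix(data=new_data))
new_data_aug['predsLightGBM'] = gbm_base.predict(new_data, num_iteration=gbm_base.best_iteration)  # gbm_base is hypothetical
fraud_probabilities = gbm.predict(new_data_aug, num_iteration=gbm.best_iteration)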