Continuing from last time: SIGNATE Quest was so easy to understand that I signed up for the paid version. The video course created by SIGNATE on Udemy was also easy to follow, but SIGNATE Quest is more systematically organized and even easier to understand.
# Libraries used in this step
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the CSV (the 'id' column becomes the index) and one-hot encode
# the categorical columns in a single expression.
df = pd.get_dummies(pd.read_csv('data.csv', index_col='id'))

# Explanatory variables: every column except the objective variable 'y'
data_X = df.drop(columns='y')
# Objective variable
data_y = df['y']

# Hold out 25% of the rows as evaluation data; random_state fixes the shuffle
train_X, test_X, train_y, test_y = train_test_split(
    data_X,
    data_y,
    test_size=0.25,
    random_state=0,
)

# Number of rows in the training explanatory variables
print(len(train_X))
# Number of rows in the evaluation explanatory variables
print(len(test_X))
# Import roc_auc_score
from sklearn.metrics import roc_auc_score

# How to call the evaluation function:
#     roc_auc_score(measured_values, predicted_values)
# The first argument is the variable holding the measured (true) values and
# the second is the variable holding the predicted values.
# (The usage template above used to be pasted as a bare statement, which is
# not valid Python; it is kept here as a comment instead.)

# Display the AUC for a tiny hand-made example
print(roc_auc_score([0, 0, 1], [0, 1, 1]))
# Libraries used in this step
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DT

# Read the data ('id' column as index) and convert categoricals to dummies
df = pd.read_csv('data.csv', index_col='id')
df = pd.get_dummies(df)

# Explanatory variables go to data_X, the objective variable 'y' to data_y
data_X = df.drop('y', axis=1)
data_y = df['y']

# Split into training data and evaluation data (25% held out)
train_X, test_X, train_y, test_y = train_test_split(
    data_X, data_y, test_size=0.25, random_state=0
)

# Prepare a shallow decision tree and train it; fit() returns the estimator
tree = DT(max_depth=2, random_state=0).fit(train_X, train_y)

# Raw feature importances, in column order
print(tree.feature_importances_)
# The same importances labelled with the column names
print(pd.Series(tree.feature_importances_, index=train_X.columns))

# Predicted probability for the evaluation data: second column of
# predict_proba (class 1, assuming the labels are 0/1 - TODO confirm)
pred_y1 = tree.predict_proba(test_X)[:, 1]

# AUC from the measured values test_y and the predicted values pred_y1.
# roc_auc_score is expected to have been imported earlier in the script.
auc1 = roc_auc_score(test_y, pred_y1)
# Display the evaluation result
print(auc1)
# Draw the ROC curve of the max_depth=2 model.
# Expects test_y (measured values) and pred_y1 (predicted probabilities)
# to have been created by the preceding steps.

# pyplot is imported here because this step is the first one that uses
# `plt`; without it the plotting calls below raise NameError.
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# AUC calculation
auc1 = roc_auc_score(test_y, pred_y1)

# False positive rate, true positive rate and thresholds computed from the
# measured values test_y and the predicted values pred_y1
fpr, tpr, thresholds = roc_curve(test_y, pred_y1)

# Legend label; '{:.2}' formats the AUC with two significant digits
roc_label = 'ROC(AUC={:.2}, max_depth=2)'.format(auc1)

# ROC curve
plt.plot(fpr, tpr, label=roc_label)
# Dashed diagonal for visual reference
plt.plot([0, 1], [0, 1], color='black', linestyle='dashed')

# Title and axis names
plt.title("ROC")
plt.xlabel('FPR')
plt.ylabel('TPR')

# Restrict both axes to the range [0, 1]
plt.xlim(0, 1)
plt.ylim(0, 1)

# Show the legend and the graph
plt.legend()
plt.show()
# Libraries used in this step
import io

import pydotplus
from matplotlib import pyplot as plt
from PIL import Image
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.tree import export_graphviz

# Build the decision tree model (depth 2) on the training data
tree = DT(max_depth=2, random_state=0)
tree.fit(train_X, train_y)

# Write the tree structure to a Graphviz DOT file
export_graphviz(
    tree,
    out_file="tree.dot",
    feature_names=train_X.columns,
    class_names=["0", "1"],
    filled=True,
    rounded=True,
)

# Load the DOT file back and render it to PNG bytes
g = pydotplus.graph_from_dot_file(path="tree.dot")
gg = g.create_png()

# Open the PNG bytes as an image via an in-memory buffer
img = io.BytesIO(gg)
img2 = Image.open(img)

# Size the figure so that, at 100 dpi, it matches the image's pixel size
figure_dpi = 100
plt.figure(figsize=(img2.width/100, img2.height/100), dpi=figure_dpi)
plt.imshow(img2)
# Hide the axes; only the picture is of interest
plt.axis("off")
plt.show()
# Libraries used in this step
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier as DT

# Decision tree whose depth is left for the grid search to choose
tree = DT(random_state=0)

# Candidate values of max_depth: 2 through 10
parameters = {'max_depth': list(range(2, 11))}

# 5-fold cross-validated grid search scored by AUC; training scores are
# kept as well so they can be compared with the validation scores
gcv = GridSearchCV(
    tree,
    parameters,
    cv=5,
    scoring='roc_auc',
    return_train_score=True,
)
# Run the grid search on the training data
gcv.fit(train_X, train_y)

# Mean score per candidate: on the folds used for learning (train) and on
# the held-out folds (test)
cv_results = gcv.cv_results_
train_score = cv_results['mean_train_score']
test_score = cv_results['mean_test_score']

# x-axis values: the same depths that were searched
depths = parameters['max_depth']

# Score evaluated on the data used for learning
plt.plot(depths, train_score, label="train_score")
# Score evaluated on the data not used for learning
plt.plot(depths, test_score, label="test_score")

# Title and axis names
plt.title('train_score vs test_score')
plt.xlabel('max_depth')
plt.ylabel('AUC')

# Show the legend and the graph
plt.legend()
plt.show()
The AUC keeps rising on the data used for training while it falls on the data not used for training. This is a typical sign of overfitting.
# Show the parameter setting that the grid search found to be best
print(gcv.best_params_)

# Model trained with the optimal parameters
best_model = gcv.best_estimator_

# Predicted probability for the evaluation data: second column of
# predict_proba (class 1, assuming the labels are 0/1 - TODO confirm)
pred_y3 = best_model.predict_proba(test_X)[:, 1]

# AUC of the best model on the evaluation data
auc3 = roc_auc_score(y_true=test_y, y_score=pred_y3)

# Display the AUC
print(auc3)
{'max_depth': 6} 0.8631100075115532
If you run print(best_model), the following result is displayed: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=0, splitter='best')
# Compare the ROC curves of the max_depth = 2, 10 and 6 models on one graph.
#
# Expected from the preceding steps:
#   train_X, train_y, test_X, test_y : the train/evaluation split
#   pred_y1, auc1 : prediction result and AUC for max_depth=2
#   pred_y3, auc3 : prediction result and AUC for max_depth=6 (grid-search best)
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.tree import DecisionTreeClassifier as DT

# The max_depth=10 prediction (pred_y2) and its AUC (auc2) are not produced
# by any earlier step of this script, so they are computed here in the same
# way pred_y1 / auc1 were: same split, same random_state, only the depth
# differs. Without this the lines below fail with NameError.
tree_depth10 = DT(max_depth=10, random_state=0)
tree_depth10.fit(train_X, train_y)
pred_y2 = tree_depth10.predict_proba(test_X)[:, 1]
auc2 = roc_auc_score(test_y, pred_y2)

# False positive rate, true positive rate and thresholds per model
#   fpr1, tpr1, thresholds1 : max_depth=2
#   fpr2, tpr2, thresholds2 : max_depth=10
#   fpr3, tpr3, thresholds3 : max_depth=6
fpr1, tpr1, thresholds1 = roc_curve(test_y, pred_y1)
fpr2, tpr2, thresholds2 = roc_curve(test_y, pred_y2)
fpr3, tpr3, thresholds3 = roc_curve(test_y, pred_y3)

# Legend labels per model; '{:.2}' formats the AUC with two significant digits
#   roc_label1 : max_depth=2
#   roc_label2 : max_depth=10
#   roc_label3 : max_depth=6
roc_label1 = 'ROC(AUC={:.2}, max_depth=2)'.format(auc1)
roc_label2 = 'ROC(AUC={:.2}, max_depth=10)'.format(auc2)
roc_label3 = 'ROC(AUC={:.2}, max_depth=6)'.format(auc3)

# ROC curves. The labels are already fully formatted above, so they are
# passed as-is (formatting them a second time was a no-op).
plt.plot(fpr1, tpr1, label=roc_label1)
plt.plot(fpr2, tpr2, label=roc_label2)
plt.plot(fpr3, tpr3, label=roc_label3)

# Dashed diagonal for visual reference
plt.plot([0, 1], [0, 1], color='black', linestyle='dashed')

# Title and axis names
plt.title("ROC")
plt.xlabel('FPR')
plt.ylabel('TPR')

# Restrict both axes to the range [0, 1]
plt.xlim(0, 1)
plt.ylim(0, 1)

# Show the legend and the graph
plt.legend()
plt.show()
The ROC curve for max_depth = 6 has the largest area under the curve (AUC).
Using the decision tree model trained with the optimal parameters, predict the application rate for the time deposit campaign for each customer in the evaluation data, and create an attack list (the list of customers to approach) based on those predictions.
# Customer list indexed like the evaluation data, holding the predicted
# application rate ("cvr") of each customer
customer_list = pd.DataFrame({"cvr": pred_y3}, index=test_X.index)

# Expected earnings per customer.
# NOTE(review): 2000 looks like the earnings from one successful
# application - confirm the unit and the source of this figure.
customer_list["return"] = customer_list["cvr"] * 2000

# Expected ROI in percent.
# NOTE(review): 300 looks like the cost of approaching one customer - confirm.
customer_list["ROI"] = customer_list["return"] / 300 * 100

# Highest expected ROI first
sorted_customer_list = customer_list.sort_values(by="ROI", ascending=False)

# Attack list: only the customers whose expected ROI is 100% or more
roi_at_least_100 = sorted_customer_list["ROI"] >= 100
attack_list = sorted_customer_list[roi_at_least_100]

# (number of rows, number of columns) of the attack list
print(attack_list.shape)
# First 5 rows of the attack list
print(attack_list.head())
In this quest, we learned how to approach a classification problem, from data preparation to building a decision tree model and tuning its parameters with grid search. This analysis flow is a very important process that is common to essentially all data analysis.
Recommended Posts