Overview and useful features of scikit-learn that can also be used for deep learning

scikit-learn is a machine learning library for Python. It does not build deep learning models itself, but it provides convenient APIs for evaluation metrics and hyperparameter search that are also useful in deep learning work.

Installation

$ pip install scikit-learn

1. Creating a learning model

Create an instance of the machine learning model
Train the model (fit) and determine the hyperparameters
Predict and evaluate

`lasso.py`


from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# 0. Load the data
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
# The iris samples are stored ordered by class, so slicing off the last 30
# rows would leave a test set that contains a single class. Use a shuffled,
# stratified 120/30 split so every class appears in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=30, stratify=iris.target, random_state=0)

# 1. Create an instance of the machine learning model
model = DecisionTreeClassifier(criterion="entropy")

# 2. Train (fit) and determine the hyperparameters
# GridSearchCV tries every max_depth candidate with cross-validation.
clf = GridSearchCV(model, {'max_depth': [2, 3, 4, 5, 6]}, verbose=1)
clf.fit(X_train, y_train)
# %-formatting keeps the output identical on Python 2 and Python 3.
print("%s %s" % (clf.best_params_, clf.best_score_))

# 3. Predict and evaluate
pred = clf.predict(X_test)
# Compare the true test labels with the predictions computed just above.
print(accuracy_score(y_test, pred))

2. Evaluation of learning results

Evaluation of precision, recall, f1-score

Useful when the class labels are imbalanced (some classes have far more samples than others).

from sklearn.metrics import classification_report

# Print per-class precision, recall, f1-score and support for the test set.
pred = clf.predict(X_test)
# A parenthesized single-argument print behaves the same on Python 2 and 3.
print(classification_report(y_test, pred))

#               precision    recall  f1-score   support
# 
#           0       0.94      0.97      0.96        79
#           1       0.90      0.79      0.84        80
#           2       0.99      0.88      0.93        77
#           3       0.89      0.82      0.86        79
#           4       0.94      0.90      0.92        83
#           5       0.92      0.95      0.93        82
#           6       0.95      0.97      0.96        80
#           7       0.96      0.96      0.96        80
#           8       0.82      0.91      0.86        76
#           9       0.79      0.90      0.84        81
# 
# avg / total       0.91      0.91      0.91       797

Confusion matrix output

Useful for evaluating classification tasks with three or more class labels.

from sklearn.metrics import confusion_matrix
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pred = clf.predict(X_test)
# Fix the label order explicitly (sorted labels seen in y_test or pred, which
# is also confusion_matrix's default) so the same list can name the axes.
class_labels = sorted(set(y_test) | set(pred))
conf_mat = confusion_matrix(y_test, pred, labels=class_labels)
# A parenthesized single-argument print behaves the same on Python 2 and 3.
print(conf_mat)

# [[77  0  0  0  0  0  0  0  2  0]
#  [ 0 63  0  2  3  0  0  0  1 11]
#  [ 1  0 68  6  0  0  0  0  0  2]
#  [ 0  2  0 65  0  1  0  2  9  0]
#  [ 2  0  0  0 75  0  2  0  0  4]
#  [ 0  1  0  0  0 78  2  0  0  1]
#  [ 0  1  1  0  0  0 78  0  0  0]
#  [ 0  0  0  0  1  1  0 77  1  0]
#  [ 0  3  0  0  1  2  0  0 69  1]
#  [ 2  0  0  0  0  3  0  1  2 73]]


# Plot using seaborn.heatmap
# Derive the tick labels from the data instead of hard-coding ten digits, so
# the plot also works when the number of classes is not 10.
index = [str(label) for label in class_labels]
columns = [str(label) for label in class_labels]
df = pd.DataFrame(conf_mat, index=index, columns=columns)

fig = plt.figure(figsize=(7, 7))
sns.heatmap(df, annot=True, square=True, fmt='.0f', cmap="Blues")
plt.title('hand_written digit classification')
# confusion_matrix puts the true labels on the rows and the predicted labels
# on the columns; the heatmap draws rows on the y axis and columns on the x
# axis, so x is the prediction and y is the ground truth.
plt.xlabel('prediction')
plt.ylabel('ground_truth')
# No trailing space in the file name: matplotlib infers the output format
# from the extension, and "png " is not a known format.
fig.savefig("conf_mat.png")

Decision tree plot

import pydotplus
from sklearn import tree

# export_graphviz needs the fitted decision tree itself. When clf is a
# GridSearchCV wrapper, the refitted tree is exposed as best_estimator_;
# if clf is already a plain tree, use it unchanged.
fitted_tree = getattr(clf, "best_estimator_", clf)

# out_file=None makes export_graphviz return the DOT source as a string.
dot_data = tree.export_graphviz(fitted_tree, out_file=None,
                         feature_names=iris.feature_names,
                         class_names=iris.target_names,
                         filled=True, rounded=True,
                         special_characters=True)

# NOTE: rendering to PNG presumably requires the Graphviz binaries to be
# installed on the system - confirm for your environment.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('iris_tree.png')

3. Other

Save and load learning model

import pickle

# Use context managers so the file handles are closed (and the data flushed)
# even if dumping or loading raises.
with open("model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("model.pkl", "rb") as model_file:
    clf = pickle.load(model_file)

# Alternative: using joblib (see y__sama's comment)
# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package and fall back to the bundled copy on old versions.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(clf, 'model.pkl')
clf = joblib.load('model.pkl')

Load sample dataset [sklearn.datasets]

from sklearn import datasets

# Iris data set, 3 varieties of iris (classification)
# 150 samples x 4 features
iris = datasets.load_iris()

# Handwritten digits data set (classification)
# 1797 samples x 64 features
digits = datasets.load_digits()

# Boston house prices by district (regression)
# 506 samples x 13 features (14 columns including the target)
# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, where
# accessing it raises ImportError; keep the example working on old versions
# and fall back to None on new ones.
try:
    boston = datasets.load_boston()
except (AttributeError, ImportError):
    boston = None

# Disease progression one year after baseline in diabetes patients (regression)
# 442 samples x 10 features
diabetes = datasets.load_diabetes()

# Sample photo "china.jpg". shape == (427, 640, 3)
im = datasets.load_sample_image('china.jpg')