I recently completed the DataCamp Data Scientist track. Since I had studied it in English, I wanted to review the material in Japanese, and the book "Machine Learning Encyclopedia" (Shoeisha) turned out to be very easy to follow. Chapter 4, "Evaluation Methods and Handling Various Data," includes hands-on material you can work through by typing in the Python code, so I used it for my review.
I modified the code slightly in places, but it is essentially the same as the book's. To track how the data changes at each step, the shape of the data and similar details are noted in comments where appropriate.
The original sample program is published at the following URL, so refer to that for the original. You will need to sign up on Shoeisha's website to download it.
The code below is broken up the way I actually worked through it, so a single block may mix several topics.
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer() # Wisconsin Breast Cancer Data
Xo = data.data # Xo.shape = (569, 30)
y = 1 - data.target # reverse target parameter
X = Xo[:, :10] # the mean_* features are in the first 10 columns
LogisticRegression
from sklearn.linear_model import LogisticRegression
# model_lor = LogisticRegression() # FutureWarning: Default solver will be changed to 'lbfgs' in 0.22
model_lor = LogisticRegression(solver='liblinear') # the warning goes away when the solver is specified explicitly
model_lor.fit(X, y)
y_pred = model_lor.predict(X)
ConfusionMatrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y, y_pred)
print("============= CONFUSION MATRIX ============= ")
print(cm)
'''
The counts come out as a matrix of actual vs. predicted labels:
             Predict
              0    1
Actual 0  [341   16]
       1  [ 36  176]
'''
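By the way, to avoid misreading which cell is which, the four counts can be unpacked by name (a small sketch relying on scikit-learn's documented ravel() ordering of tn, fp, fn, tp):
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
print("TN={} FP={} FN={} TP={}".format(tn, fp, fn, tp)) # TN=341 FP=16 FN=36 TP=176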
Accuracy Score
from sklearn.metrics import accuracy_score
accs = accuracy_score(y, y_pred)
print("============= ACCURACY SCORE ============= ")
print(accs) # Accuracy: the proportion of correct predictions over all cases. (341 + 176) / 569
Precision Score
from sklearn.metrics import precision_score
pres = precision_score(y, y_pred)
print("============= PRECISION SCORE ============= ")
print(pres) # Precision: of the cases predicted positive (1), the proportion that are actually positive. 176 / (176 + 16)
Recall Score
from sklearn.metrics import recall_score
recs = recall_score(y, y_pred)
print("============= RECALL SCORE ============= ")
print(recs) # Recall: of the cases that are actually positive (1), the proportion predicted correctly. 176 / (36 + 176)
# A low recall is dangerous here, because it means positive cases are being missed.
F Score
from sklearn.metrics import f1_score
f1s = f1_score(y, y_pred)
print("============= F SCORE ============= ")
print(f1s) # F1: the harmonic mean of precision and recall; it reflects both at once.
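As a quick sanity check (a sketch reusing the pres and recs values computed above), the F score is the harmonic mean of precision and recall:
f1_manual = 2 * pres * recs / (pres + recs)
print(f1_manual) # should match f1s above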
# Predicted probability
y_pred_proba = model_lor.predict_proba(X)
print(y_pred_proba)
'''
Each row is one case: [probability of 0, probability of 1]
[4.41813067e-03 9.95581869e-01]
[4.87318129e-04 9.99512682e-01]
[3.31064287e-04 9.99668936e-01]
...
'''
# The standard .predict() classifies at prob > 0.5 (50%). Lower the threshold if you would rather accept more false positives in order to reduce missed positives.
# Extract the cases whose probability of being positive is 10% or more
import numpy as np
y_pred2 = (model_lor.predict_proba(X)[:, 1] > 0.1).astype(int) # plain int rather than np.int, which was removed in NumPy 1.24
print("============= CONFUSION MATRIX ( 1 prob > 10% ) ============= ")
print(confusion_matrix(y, y_pred2))
'''
Predict
0 1
Actual 0 [259 98]
1 [ 2 210]
'''
print("============= ACCURACY SCORE ( 1 prob > 10% ) ============= ")
print(accuracy_score(y, y_pred2)) # 0.8242...
print("============= RECALL SCORE ( 1 prob > 10% ) ============= ")
print(recall_score(y, y_pred2)) # 0.9905...
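Lowering the threshold trades precision for recall; as a check using the confusion matrix above, the precision drops to 210 / (210 + 98):
print("============= PRECISION SCORE ( 1 prob > 10% ) ============= ")
print(precision_score(y, y_pred2)) # 0.6818...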
ROC Curve, AUC
ROC : Receiver Operating Characteristic
AUC : Area Under the Curve
from sklearn.metrics import roc_curve
y_pred_proba = model_lor.predict_proba(X)
fpr, tpr, thresholds = roc_curve(y, y_pred_proba[:, 1])
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots() # returns the figure obj and axes obj at once
fig.set_size_inches(4.8, 5)
ax.step(fpr, tpr, 'gray')
ax.fill_between(fpr, tpr, 0, color='skyblue', alpha=0.8)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_facecolor('xkcd:white')
plt.show() # The closer the AUC (area) is to 1, the higher the accuracy.
# Compute the AUC
from sklearn.metrics import roc_auc_score
print("============= AUC(Area Under the Curve) SCORE ( 1 prob > 10% ) ============= ")
print(roc_auc_score(y, y_pred_proba[:, 1])) # 0.9767...
# For imbalanced data (when positives and negatives are heavily skewed), AUC is a better metric than accuracy.
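As a cross-check (a minimal sketch), the same AUC value can be obtained by integrating the ROC curve points from roc_curve with sklearn.metrics.auc:
from sklearn.metrics import auc
print(auc(fpr, tpr)) # should match roc_auc_score above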
from sklearn.datasets import load_boston
data = load_boston() # Boston house price data
Xo = data.data # Xo.shape = (506, 13)
X = Xo[:, [5,]] # X.shape = (506, 1); Xo[:, 5][0] => numpy.float64, Xo[:, [5,]][0] => numpy.ndarray
y = data.target
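Note that load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. If your version no longer includes it, one possible workaround (a sketch assuming the OpenML copy of the dataset) is:
# from sklearn.datasets import fetch_openml
# boston = fetch_openml(name='boston', version=1, as_frame=False)
# Xo, y = boston.data, boston.target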
from sklearn.linear_model import LinearRegression
model_lir = LinearRegression()
model_lir.fit(X, y)
y_pred = model_lir.predict(X)
print("============= LINEAR REGRESSION ============= ")
print ("model_lir.coef_: {}". format (model_lir.coef_)) # slope y = ax + b a
print ("model_lir.intercept_: {}". format (model_lir.intercept_)) # b in intercept y = ax + b
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.scatter(X, y, color='pink', marker='s', label='data set')
ax.plot(X, y_pred, color='blue', label='regression curve')
ax.legend()
plt.show()
# Mean squared error
# A number that indicates how far the predicted values are from the actual values: the y-axis errors are squared and averaged.
from sklearn.metrics import mean_squared_error
print("============= MEAN SQUARED ERROR ============= ")
print(mean_squared_error(y, y_pred)) # 43.600...
# Coefficient of determination (R ** 2)
# A value (at most 1) indicating how well the trained model's predictions fit, computed from the mean squared error. 1 means no error; for a terrible fit it can even go negative.
from sklearn.metrics import r2_score
print("============= R2 SCORE ============= ")
print(r2_score(y, y_pred))
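To make the two definitions above concrete, both metrics can be reproduced in a few lines of NumPy (a minimal sketch):
import numpy as np
mse_manual = np.mean((y - y_pred) ** 2) # mean of the squared errors
r2_manual = 1 - np.sum((y - y_pred) ** 2) / np.sum((y - np.mean(y)) ** 2) # 1 - SSE / total variance
print(mse_manual, r2_manual) # should match the sklearn values above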
SVR (Linear Regression)
Support Vector Machine (Kernel method)
from sklearn.svm import SVR
model_svr_linear = SVR(C=0.01, kernel='linear')
model_svr_linear.fit(X, y)
y_svr_pred = model_svr_linear.predict(X)
fig, ax = plt.subplots()
ax.scatter(X, y, color='pink', marker='s', label='data set')
ax.plot(X, y_pred, color='blue', label='regression curve')
ax.plot(X, y_svr_pred, color='red', label='SVR')
ax.legend()
plt.show()
# Verification of SVR (Linear Regression)
print("============= SVR SCORE (LINEAR REGRESSION) ============= ")
print("mean_squared_error : {}".format(mean_squared_error(y, y_svr_pred)))
print("r2_score : {}".format(r2_score(y, y_svr_pred)))
print("model_lir.coef_ : {}".format(model_svr_linear.coef_))
print("model_lir.coef_ : {}".format(model_svr_linear.intercept_))
SVR(rbf)
model_svr_rbf = SVR(C=1.0, kernel='rbf')
model_svr_rbf.fit(X, y)
y_svr_pred = model_svr_rbf.predict(X)
print("============= SVR SCORE (RBF) ============= ")
print("mean_squared_error : {}".format(mean_squared_error(y, y_svr_pred)))
print("r2_score : {}".format(r2_score(y, y_svr_pred)))
Over Fitting
X_train, X_test = X[:400], X[400:]
y_train, y_test = y[:400], y[400:]
model_svr_rbf_1 = SVR(C=1.0, kernel='rbf')
model_svr_rbf_1.fit(X_train, y_train)
y_test_pred = model_svr_rbf_1.predict(X_test)
print("============= SVR SCORE (RBF) - OverFitting ============= ")
print("mean_squared_error : {}".format(mean_squared_error(y_test, y_test_pred)))
print("r2_score : {}".format(r2_score(y_test, y_test_pred)))
Prevent Over Fitting
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer() # the breast cancer data used above
X = data.data
y = data.target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # specify random_state if you want the same split every time
from sklearn.svm import SVC
model_svc = SVC()
model_svc.fit(X_train, y_train)
y_train_pred = model_svc.predict(X_train)
y_test_pred = model_svc.predict(X_test)
from sklearn.metrics import accuracy_score
print("============= ACCURACY SCORE (SVC) ============= ")
print("Train_Accuracy_score : {}".format(accuracy_score(y_train, y_train_pred))) # 1.0
print("Test_Accuracy_score : {}".format(accuracy_score(y_test, y_test_pred))) # 0.60
# Overfitting: the accuracy on the test data is far below the accuracy on the training data.
from sklearn.ensemble import RandomForestClassifier
model_rfc = RandomForestClassifier()
model_rfc.fit(X_train, y_train)
y_train_pred = model_rfc.predict(X_train)
y_test_pred = model_rfc.predict(X_test)
print("============= ACCURACY SCORE (RFC) ============= ")
print("Train_Accuracy_score : {}".format(accuracy_score(y_train, y_train_pred))) # 0.9974...
print("Test_Accuracy_score : {}".format(accuracy_score(y_test, y_test_pred))) # 0.9590...
# With Random Forest, the score on the test data improves.
Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True)
model_rfc_1 = RandomForestClassifier()
print("============= CROSS VALIDATION SCORE ============= ")
print("Cross_Valication_Score x5 Scoring=Accuracy : {}".format(cross_val_score(model_rfc_1, X, y, cv=cv, scoring='accuracy')))
print("Cross_Valication_Score x5 Scoring=F1 : {}".format(cross_val_score(model_rfc_1, X, y, cv=cv, scoring='f1')))
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
Xo = data.data
y = 1 - data.target # reverse label
X = Xo[:, :10]
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True)
param_grid = {'max_depth': [5, 10, 15], 'n_estimators': [10, 20, 30]}
model_rfc_2 = RandomForestClassifier()
# grid_search = GridSearchCV(model_rfc_2, param_grid, cv=cv, scoring='accuracy')
grid_search = GridSearchCV(model_rfc_2, param_grid, cv=cv, scoring='f1') # a second assignment would overwrite the first, so only one scoring is active; accuracy is left commented out here
grid_search.fit(X, y)
print("============= GRID SEARCH RESULTS ============= ")
print("GridSearch BEST SCORE : {}".format(grid_search.best_score_))
print("GridSearch BEST PARAMS : {}".format(grid_search.best_params_))
tf-idf
tf : Term Frequency
idf : Inverse Document Frequency
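To see what TfidfVectorizer actually produces before applying it to the newsgroups data, here is a toy corpus (a minimal sketch; the three sentences are made up for illustration):
from sklearn.feature_extraction.text import TfidfVectorizer
docs = ["the cat sat", "the cat ate", "the dog barked"]
vec = TfidfVectorizer()
tfidf = vec.fit_transform(docs) # sparse matrix of shape (3, vocabulary size)
print(vec.get_feature_names_out()) # get_feature_names() on older scikit-learn
print(tfidf.toarray()) # rarer words like 'dog' get higher weights than the ubiquitous 'the'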
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
remove = ('headers', 'footers', 'quotes')
twenty_train = fetch_20newsgroups(subset='train', remove=remove, categories=categories)
twenty_test = fetch_20newsgroups(subset='test', remove=remove, categories=categories)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_count = count_vect.transform(twenty_test.data)
model = LinearSVC()
model.fit(X_train_counts, twenty_train.target)
predicted = model.predict(X_test_count)
print("============= CountVectorizer ============= ")
print("predicted == twenty_test.target : {}".format(np.mean(predicted == twenty_test.target))) # 0.7423...
tf_vec = TfidfVectorizer()
X_train_tfidf = tf_vec.fit_transform(twenty_train.data)
X_test_tfidf = tf_vec.transform(twenty_test.data)
model = LinearSVC()
model.fit(X_train_tfidf, twenty_train.target)
predicted = model.predict(X_test_tfidf)
print("============= TfidfVectorizer ============= ")
print("predicted == twenty_test.target : {}".format(np.mean(predicted == twenty_test.target))) # 0.8149...
from PIL import Image
import numpy as np
Transform image data to vector data
img = Image.open('/Users/***/****.jpg').convert('L') # specify the full path of a local image; 'L' converts it to grayscale
width, height = img.size
img_pixels = []
for y in range(height):
    for x in range(width):
        img_pixels.append(img.getpixel((x, y))) # getpixel returns the pixel value at the specified position
print("============= Print Pixel ============= ")
print("len(img_pixels) : {}".format(len(img_pixels)))
print("img_pixels : {}".format(img_pixels)) # [70, 183, 191, 194, 191, 187, 180, 171, 157, 143, ....]
Predict number images
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
digits = datasets.load_digits() # type(digits) => sklearn.utils.Bunch type(digits.images) => numpy.ndarray
n_samples = len(digits.images) # 1797 digits.images.shape => (1797, 8, 8)
data = digits.images.reshape((n_samples, -1)) # digits.images.reshape((n_samples, -1)).shape => (1797, 64)
model = RandomForestClassifier()
model.fit(data[:n_samples // 2], digits.target[:n_samples // 2])
# data[:n_samples // 2].shape => (898, 64)
# digits.target[:n_samples // 2].shape => (898,)
expected = digits.target[n_samples // 2:]
predicted = model.predict(data[n_samples // 2:])
print(metrics.classification_report(expected, predicted))
I recommend Shoeisha's Machine Learning Encyclopedia; it is a very clear, well-made book. (I wish I had read it before the machine learning section of DataCamp...)
Shoeisha Machine Learning Encyclopedia