I recently completed the DataCamp Data Scientist track. Since I had studied it in English, I wanted to review the material in Japanese, and the book "Machine Learning Encyclopedia" (Shoeisha) turned out to be very easy to follow. Chapter 4, "Evaluation Methods and Handling Various Data," includes hands-on material you can work through by typing in the Python code, so I used it for my review.
I modified the code slightly in places, but it is essentially the same as the book's. To track how the data changes at each step, the shape of the data and similar details are noted in comments where appropriate.
The original sample program is published at the following URL, so refer to that for the original. You will need to sign up on Shoeisha's website to download it.
The code below is broken up the way I actually worked through it, so a single block may mix several topics.
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer() # Wisconsin Breast Cancer Data
Xo = data.data # Xo.shape = (569, 30)
y = 1 - data.target # reverse target parameter
X = Xo[:, :10] # the mean_* features are in the first 10 columns
LogisticRegression
from sklearn.linear_model import LogisticRegression
# model_lor = LogisticRegression() # FutureWarning: Default solver will be changed to 'lbfgs' in 0.22
model_lor = LogisticRegression(solver='liblinear') # the warning goes away when the solver is specified explicitly
model_lor.fit(X, y)
y_pred = model_lor.predict(X)
ConfusionMatrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y, y_pred)
print("============= CONFUSION MATRIX ============= ")
print(cm)
'''
The counts come out as a matrix of actual vs. predicted labels:
             Predict
              0    1
Actual 0  [341   16]
       1  [ 36  176]
'''
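By the way, to avoid misreading which cell is which, the four counts can be unpacked by name (a small sketch relying on scikit-learn's documented ravel() ordering of tn, fp, fn, tp):
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
print("TN={} FP={} FN={} TP={}".format(tn, fp, fn, tp)) # TN=341 FP=16 FN=36 TP=176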
Accuracy Score
from sklearn.metrics import accuracy_score
accs = accuracy_score(y, y_pred)
print("============= ACCURACY SCORE ============= ")
print(accs) # Accuracy: the proportion of correct predictions over all cases. (341 + 176) / 569
Precision Score
from sklearn.metrics import precision_score
pres = precision_score(y, y_pred)
print("============= PRECISION SCORE ============= ")
print(pres) # Precision: of the cases predicted positive (1), the proportion that are actually positive. 176 / (176 + 16)
Recall Score
from sklearn.metrics import recall_score
recs = recall_score(y, y_pred)
print("============= RECALL SCORE ============= ")
print(recs) # Recall: of the cases that are actually positive (1), the proportion predicted correctly. 176 / (36 + 176)
# A low recall is dangerous here, because it means positive cases are being missed.
F Score
from sklearn.metrics import f1_score
f1s = f1_score(y, y_pred)
print("============= F SCORE ============= ")
print(f1s) # F1: the harmonic mean of precision and recall; it reflects both at once.
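As a quick sanity check (a sketch reusing the pres and recs values computed above), the F score is the harmonic mean of precision and recall:
f1_manual = 2 * pres * recs / (pres + recs)
print(f1_manual) # should match f1s above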
# Predicted probability
y_pred_proba = model_lor.predict_proba(X)
print(y_pred_proba)
'''
Each row is one case: [probability of 0, probability of 1]
[4.41813067e-03 9.95581869e-01]
[4.87318129e-04 9.99512682e-01]
[3.31064287e-04 9.99668936e-01]
...
'''
# The standard .predict() classifies at prob > 0.5 (50%). Lower the threshold if you would rather accept more false positives in order to reduce missed positives.
# Extract the cases whose probability of being positive is 10% or more
import numpy as np
y_pred2 = (model_lor.predict_proba(X)[:, 1] > 0.1).astype(int) # plain int rather than np.int, which was removed in NumPy 1.24
print("============= CONFUSION MATRIX ( 1 prob > 10% ) ============= ")
print(confusion_matrix(y, y_pred2))
'''
Predict
0 1
Actual 0 [259 98]
1 [ 2 210]
'''
print("============= ACCURACY SCORE ( 1 prob > 10% ) ============= ")
print(accuracy_score(y, y_pred2)) # 0.8242...
print("============= RECALL SCORE ( 1 prob > 10% ) ============= ")
print(recall_score(y, y_pred2)) # 0.9905...
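Lowering the threshold trades precision for recall; as a check using the confusion matrix above, the precision drops to 210 / (210 + 98):
print("============= PRECISION SCORE ( 1 prob > 10% ) ============= ")
print(precision_score(y, y_pred2)) # 0.6818...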
ROC Curve, AUC
ROC : Receiver Operating Characteristic
AUC : Area Under the Curve
from sklearn.metrics import roc_curve
y_pred_proba = model_lor.predict_proba(X)
fpr, tpr, thresholds = roc_curve(y, y_pred_proba[:, 1])
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots() # returns the figure obj and axes obj at once
fig.set_size_inches(4.8, 5)
ax.step(fpr, tpr, 'gray')
ax.fill_between(fpr, tpr, 0, color='skyblue', alpha=0.8)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_facecolor('xkcd:white')
plt.show() # The closer the AUC (area) is to 1, the higher the accuracy.
# Compute the AUC
from sklearn.metrics import roc_auc_score
print("============= AUC(Area Under the Curve) SCORE ( 1 prob > 10% ) ============= ")
print(roc_auc_score(y, y_pred_proba[:, 1])) # 0.9767...
# For imbalanced data (when positives and negatives are heavily skewed), AUC is a better metric than accuracy.
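As a cross-check (a minimal sketch), the same AUC value can be obtained by integrating the ROC curve points from roc_curve with sklearn.metrics.auc:
from sklearn.metrics import auc
print(auc(fpr, tpr)) # should match roc_auc_score above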
from sklearn.datasets import load_boston
data = load_boston() # Boston house price data
Xo = data.data # Xo.shape = (506, 13)
X = Xo[:, [5,]] # X.shape = (506, 1); Xo[:, 5][0] => numpy.float64, Xo[:, [5,]][0] => numpy.ndarray
y = data.target
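Note that load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. If your version no longer includes it, one possible workaround (a sketch assuming the OpenML copy of the dataset) is:
# from sklearn.datasets import fetch_openml
# boston = fetch_openml(name='boston', version=1, as_frame=False)
# Xo, y = boston.data, boston.target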
from sklearn.linear_model import LinearRegression
model_lir = LinearRegression()
model_lir.fit(X, y)
y_pred = model_lir.predict(X)
print("============= LINEAR REGRESSION ============= ")
print ("model_lir.coef_: {}". format (model_lir.coef_)) # slope y = ax + b a
print ("model_lir.intercept_: {}". format (model_lir.intercept_)) # b in intercept y = ax + b
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.scatter(X, y, color='pink', marker='s', label='data set')
ax.plot(X, y_pred, color='blue', label='regression curve')
ax.legend()
plt.show()
# Mean squared error
# A number that indicates how far the predicted values are from the actual values: the y-axis errors are squared and averaged.
from sklearn.metrics import mean_squared_error
print("============= MEAN SQUARED ERROR ============= ")
print(mean_squared_error(y, y_pred)) # 43.600...
# Coefficient of determination (R ** 2)
# A value (at most 1) indicating how well the trained model's predictions fit, computed from the mean squared error. 1 means no error; for a terrible fit it can even go negative.
from sklearn.metrics import r2_score
print("============= R2 SCORE ============= ")
print(r2_score(y, y_pred))
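To make the two definitions above concrete, both metrics can be reproduced in a few lines of NumPy (a minimal sketch):
import numpy as np
mse_manual = np.mean((y - y_pred) ** 2) # mean of the squared errors
r2_manual = 1 - np.sum((y - y_pred) ** 2) / np.sum((y - np.mean(y)) ** 2) # 1 - SSE / total variance
print(mse_manual, r2_manual) # should match the sklearn values above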
SVR (Linear Regression)
Support Vector Machine (Kernel method)
from sklearn.svm import SVR
model_svr_linear = SVR(C=0.01, kernel='linear')
model_svr_linear.fit(X, y)
y_svr_pred = model_svr_linear.predict(X)
fig, ax = plt.subplots()
ax.scatter(X, y, color='pink', marker='s', label='data set')
ax.plot(X, y_pred, color='blue', label='regression curve')
ax.plot(X, y_svr_pred, color='red', label='SVR')
ax.legend()
plt.show()
# Verification of SVR (Linear Regression)
print("============= SVR SCORE (LINEAR REGRESSION) ============= ")
print("mean_squared_error : {}".format(mean_squared_error(y, y_svr_pred)))
print("r2_score : {}".format(r2_score(y, y_svr_pred)))
print("model_lir.coef_ : {}".format(model_svr_linear.coef_))
print("model_lir.coef_ : {}".format(model_svr_linear.intercept_))
SVR(rbf)
model_svr_rbf = SVR(C=1.0, kernel='rbf')
model_svr_rbf.fit(X, y)
y_svr_pred = model_svr_rbf.predict(X)
print("============= SVR SCORE (RBF) ============= ")
print("mean_squared_error : {}".format(mean_squared_error(y, y_svr_pred)))
print("r2_score : {}".format(r2_score(y, y_svr_pred)))
Over Fitting
X_train, X_test = X[:400], X[400:]
y_train, y_test = y[:400], y[400:]
model_svr_rbf_1 = SVR(C=1.0, kernel='rbf')
model_svr_rbf_1.fit(X_train, y_train)
y_test_pred = model_svr_rbf_1.predict(X_test)
print("============= SVR SCORE (RBF) - OverFitting ============= ")
print("mean_squared_error : {}".format(mean_squared_error(y_test, y_test_pred)))
print("r2_score : {}".format(r2_score(y_test, y_test_pred)))
Prevent Over Fitting
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer() # the breast cancer data used above
X = data.data
y = data.target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # specify random_state if you want the same split every time
from sklearn.svm import SVC
model_svc = SVC()
model_svc.fit(X_train, y_train)
y_train_pred = model_svc.predict(X_train)
y_test_pred = model_svc.predict(X_test)
from sklearn.metrics import accuracy_score
print("============= ACCURACY SCORE (SVC) ============= ")
print("Train_Accuracy_score : {}".format(accuracy_score(y_train, y_train_pred))) # 1.0
print("Test_Accuracy_score : {}".format(accuracy_score(y_test, y_test_pred))) # 0.60
# Overfitting: the accuracy on the test data is far below the accuracy on the training data.
from sklearn.ensemble import RandomForestClassifier
model_rfc = RandomForestClassifier()
model_rfc.fit(X_train, y_train)
y_train_pred = model_rfc.predict(X_train)
y_test_pred = model_rfc.predict(X_test)
print("============= ACCURACY SCORE (RFC) ============= ")
print("Train_Accuracy_score : {}".format(accuracy_score(y_train, y_train_pred))) # 0.9974...
print("Test_Accuracy_score : {}".format(accuracy_score(y_test, y_test_pred))) # 0.9590...
# With Random Forest, the score on the test data improves.
Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True)
model_rfc_1 = RandomForestClassifier()
print("============= CROSS VALIDATION SCORE ============= ")
print("Cross_Valication_Score x5 Scoring=Accuracy : {}".format(cross_val_score(model_rfc_1, X, y, cv=cv, scoring='accuracy')))
print("Cross_Valication_Score x5 Scoring=F1 : {}".format(cross_val_score(model_rfc_1, X, y, cv=cv, scoring='f1')))
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
Xo = data.data
y = 1 - data.target # reverse label
X = Xo[:, :10]
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True)
param_grid = {'max_depth': [5, 10, 15], 'n_estimators': [10, 20, 30]}
model_rfc_2 = RandomForestClassifier()
# grid_search = GridSearchCV(model_rfc_2, param_grid, cv=cv, scoring='accuracy')
grid_search = GridSearchCV(model_rfc_2, param_grid, cv=cv, scoring='f1') # a second assignment would overwrite the first, so only one scoring is active; accuracy is left commented out here
grid_search.fit(X, y)
print("============= GRID SEARCH RESULTS ============= ")
print("GridSearch BEST SCORE : {}".format(grid_search.best_score_))
print("GridSearch BEST PARAMS : {}".format(grid_search.best_params_))
tf-idf
tf : Term Frequency
idf : Inverse Document Frequency
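To see what TfidfVectorizer actually produces before applying it to the newsgroups data, here is a toy corpus (a minimal sketch; the three sentences are made up for illustration):
from sklearn.feature_extraction.text import TfidfVectorizer
docs = ["the cat sat", "the cat ate", "the dog barked"]
vec = TfidfVectorizer()
tfidf = vec.fit_transform(docs) # sparse matrix of shape (3, vocabulary size)
print(vec.get_feature_names_out()) # get_feature_names() on older scikit-learn
print(tfidf.toarray()) # rarer words like 'dog' get higher weights than the ubiquitous 'the'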
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
remove = ('headers', 'footers', 'quotes')
twenty_train = fetch_20newsgroups(subset='train', remove=remove, categories=categories)
twenty_test = fetch_20newsgroups(subset='test', remove=remove, categories=categories)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_count = count_vect.transform(twenty_test.data)
model = LinearSVC()
model.fit(X_train_counts, twenty_train.target)
predicted = model.predict(X_test_count)
print("============= CountVectorizer ============= ")
print("predicted == twenty_test.target : {}".format(np.mean(predicted == twenty_test.target))) # 0.7423...
tf_vec = TfidfVectorizer()
X_train_tfidf = tf_vec.fit_transform(twenty_train.data)
X_test_tfidf = tf_vec.transform(twenty_test.data)
model = LinearSVC()
model.fit(X_train_tfidf, twenty_train.target)
predicted = model.predict(X_test_tfidf)
print("============= TfidfVectorizer ============= ")
print("predicted == twenty_test.target : {}".format(np.mean(predicted == twenty_test.target))) # 0.8149...
from PIL import Image
import numpy as np
Transform image data to vector data
img = Image.open('/Users/***/****.jpg').convert('L') # specify the full path of a local image; 'L' converts it to grayscale
width, height = img.size
img_pixels = []
for y in range(height):
    for x in range(width):
        img_pixels.append(img.getpixel((x, y))) # getpixel returns the pixel value at the specified position
print("============= Print Pixel ============= ")
print("len(img_pixels) : {}".format(len(img_pixels)))
print("img_pixels : {}".format(img_pixels)) # [70, 183, 191, 194, 191, 187, 180, 171, 157, 143, ....]
Predict number images
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
digits = datasets.load_digits() # type(digits) => sklearn.utils.Bunch type(digits.images) => numpy.ndarray
n_samples = len(digits.images) # 1797 digits.images.shape => (1797, 8, 8)
data = digits.images.reshape((n_samples, -1)) # digits.images.reshape((n_samples, -1)).shape => (1797, 64)
model = RandomForestClassifier()
model.fit(data[:n_samples // 2], digits.target[:n_samples // 2])
# data[:n_samples // 2].shape => (898, 64)
# digits.target[:n_samples // 2].shape => (898,)
expected = digits.target[n_samples // 2:]
predicted = model.predict(data[n_samples // 2:])
print(metrics.classification_report(expected, predicted))
I recommend Shoeisha's Machine Learning Encyclopedia; it is a very clear, well-made book. (I wish I had read it before the machine learning section of DataCamp...)
Shoeisha Machine Learning Encyclopedia