First Pattern Recognition (Hajipata) is a book many people read as an introduction to machine learning. I was one of them: I thought I had understood it after one reading, but when I read it again I realized I still did not understand it at all [^1], so I decided to reread it while implementing it. This article collects Python code that reproduces the execution examples in the book (the code was run with numpy 1.16.2 and pandas 0.24.2). This time it is Chapter 4, "Probability Models and Discriminant Functions".
Standardization subtracts the mean from each feature and divides by its standard deviation, so that every feature has zero mean and unit variance. Below is the code.
--Loading the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
--Reading iris data
from sklearn.datasets import load_iris
iris = load_iris()
data = pd.DataFrame(iris['data'], columns=iris.feature_names)
target = pd.Series(iris['target']).map({0: 'setosa', 1:'versicolor', 2:'virginica'})
--Mean and covariance matrix of iris petal length and width
print('mu = ', np.mean(data[['petal length (cm)', 'petal width (cm)']]))
print('Sigma = ', np.cov(data[['petal length (cm)', 'petal width (cm)']], rowvar=0))
#output=>
# mu =
# petal length (cm) 3.758000
# petal width (cm) 1.199333
# dtype: float64
# Sigma =
# [[3.11627785 1.2956094 ]
# [1.2956094 0.58100626]]
--Scatter plot before standardization --Fig. 4.2 (a)
for t in target.unique():
    data_tmp = data[target==t]
    plt.scatter(data_tmp['petal length (cm)'], data_tmp['petal width (cm)'], label=t)
plt.legend()
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.xlim([1, 7])
plt.ylim([-1, 4])
plt.show()
--Scatter plot after standardization --Fig. 4.2 (b)
def standardize(X):
    return (X - np.mean(X)) / np.std(X)
std_data = standardize(data[['petal length (cm)', 'petal width (cm)']])
for t in target.unique():
    data_tmp = std_data[target==t]
    plt.scatter(data_tmp['petal length (cm)'], data_tmp['petal width (cm)'], label=t)
plt.legend()
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.xlim([-2, 2])
plt.ylim([-2, 2])
plt.show()
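As a quick sanity check (not in the book), the standardized columns should have mean roughly 0 and standard deviation roughly 1:
print(std_data.mean())       # both columns should be approximately 0
print(std_data.std(ddof=0))  # both columns should be approximately 1 (ddof=0 matches np.std's default)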
Decorrelation rotates the data by the matrix of eigenvectors of its covariance matrix, which removes the correlation between the features. Below is the code.
--Scatter plot after decorrelation --Fig. 4.4 (b)
def no_correlation(X):
    cov_mat = np.cov(X, rowvar=0)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    return np.dot(X, eig_vecs)
no_corr_data = no_correlation(data[['petal length (cm)', 'petal width (cm)']])
for t in target.unique():
    data_tmp = no_corr_data[target==t]
    plt.scatter(data_tmp[:, 0], data_tmp[:, 1], label=t)
plt.legend()
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.xlim([0, 8])
plt.ylim([-3, 3])
plt.show()
In this execution example, the sign of the y component of the eigenvector is reversed from the one given in the book, so the y-axis direction of the scatter plot is also reversed.
Sigma = np.cov(data[['petal length (cm)', 'petal width (cm)']], rowvar=0)
Lambda, S = np.linalg.eig(Sigma)
print('Lambda = \n', Lambda)
print('S = \n', S)
#output=>
# Lambda =
# [3.66123805 0.03604607]
# S =
# [[ 0.92177769 -0.38771882]
# [ 0.38771882 0.92177769]]
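If you want the scatter plot to match the book's orientation, one option (a minimal sketch, not in the book, using a hypothetical no_correlation_book variant) is to flip the sign of the second eigenvector before projecting. Eigenvectors are only determined up to sign, so this does not change the decorrelation itself.
def no_correlation_book(X):
    cov_mat = np.cov(X, rowvar=0)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    # flip the sign of the second eigenvector so the plotted y-axis matches the book
    eig_vecs[:, 1] *= -1
    return np.dot(X, eig_vecs)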
--Covariance matrix after decorrelation
print('Sigma = \n', np.cov(no_corr_data, rowvar=0))
#output=>
# Sigma =
# [[3.66123805e+00 6.01365790e-16]
# [6.01365790e-16 3.60460707e-02]]
Whitening centers the data, decorrelates it, and normalizes the standard deviation along each axis to 1. Below is the code.
--Scatter plot after whitening --Fig. 4.6 (b)
def whitening(X):
    cov_mat = np.cov(X, rowvar=0)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    diag_sqrt_eig_vals = np.diag(np.sqrt(eig_vals))
    return np.dot(np.dot(X-np.mean(X, axis=0), eig_vecs), np.linalg.inv(diag_sqrt_eig_vals.T))
whitened_data = whitening(data[['petal length (cm)', 'petal width (cm)']])
for t in target.unique():
    data_tmp = whitened_data[target==t]
    plt.scatter(data_tmp[:, 0], data_tmp[:, 1], label=t)
plt.legend()
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.xlim([-3, 3])
plt.ylim([-3, 4])
plt.show()
--Covariance matrix after whitening
print('Sigma = \n', np.cov(whitened_data, rowvar=0))
#output=>
# Sigma =
# [[1.00000000e+00 1.64411249e-15]
# [1.64411249e-15 1.00000000e+00]]
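As a cross-check (not in the book), scikit-learn's PCA with whiten=True performs an equivalent transformation; its covariance matrix should also be close to the identity, and the coordinates should agree with whitened_data up to the sign of each axis.
from sklearn.decomposition import PCA
pca = PCA(n_components=2, whiten=True)
pca_whitened = pca.fit_transform(data[['petal length (cm)', 'petal width (cm)']])
print(np.cov(pca_whitened, rowvar=0))  # expected to be approximately the identity matrix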
Next, we build discriminant functions that assign a class under the assumption that the class-conditional probabilities follow normal distributions. Below is the code.
--Reading Pima Indian data
The data is borrowed from R datasets.
pima_train = pd.read_csv('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/MASS/Pima.tr.csv')
pima_test = pd.read_csv('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/MASS/Pima.te.csv')
--Scatter plot of training data --Figure 4.8
for t in ['Yes', 'No']:
    pima_tmp = pima_train[pima_train['type']==t]
    plt.scatter(pima_tmp['glu'], pima_tmp['bmi'], label=t)
plt.legend()
plt.xlabel('glu')
plt.ylabel('bmi')
plt.xlim([50, 200])
plt.ylim([15, 50])
plt.show()
--Implementation of the quadratic discriminant function
class QuadraticDiscriminantFunction:
    def __init__(self, threshold=0):
        self._S = None
        self._c = None
        self._F = None
        self._threshold = threshold
    def fit(self, X, Y):
        X_pos = X[Y==1]
        X_neg = X[Y==0]
        mu_pos = np.mean(X_pos, axis=0)
        mu_neg = np.mean(X_neg, axis=0)
        Sigma_pos = np.cov(X_pos, rowvar=0)
        Sigma_neg = np.cov(X_neg, rowvar=0)
        p_pos = len(Y[Y==1])/len(Y)
        p_neg = len(Y[Y==0])/len(Y)
        # coefficients of the quadratic discriminant f(x) = x^T S x + 2 c^T x + F
        self._S = np.linalg.inv(Sigma_pos) - np.linalg.inv(Sigma_neg)
        self._c = np.dot(mu_neg, np.linalg.inv(Sigma_neg)) - np.dot(mu_pos, np.linalg.inv(Sigma_pos))
        self._F = np.dot(np.dot(mu_pos, np.linalg.inv(Sigma_pos)), mu_pos) - np.dot(np.dot(mu_neg, np.linalg.inv(Sigma_neg)), mu_neg)\
                  + np.log(np.linalg.det(Sigma_pos)/np.linalg.det(Sigma_neg))\
                  - 2*np.log(p_pos/p_neg)
    def predict(self, X):
        xSx = np.diag(np.dot(np.dot(X, self._S), X.T))
        cx = np.dot(self._c, X.T)
        Y_hat = xSx + 2*cx + self._F
        return np.where(Y_hat < self._threshold, 1, 0)
--Implementation of the linear discriminant function
class LinearDiscriminantFunction:
    def __init__(self, threshold=0):
        self._c = None
        self._F = None
        self._threshold = threshold
    def fit(self, X, Y):
        X_pos = X[Y==1]
        X_neg = X[Y==0]
        mu_pos = np.mean(X_pos, axis=0)
        mu_neg = np.mean(X_neg, axis=0)
        Sigma_pos = np.cov(X_pos, rowvar=0)
        Sigma_neg = np.cov(X_neg, rowvar=0)
        p_pos = len(Y[Y==1])/len(Y)
        p_neg = len(Y[Y==0])/len(Y)
        # pooled covariance matrix shared by both classes (weighted by the priors)
        Sigma_pool = p_pos*Sigma_pos + p_neg*Sigma_neg
        self._c = np.dot(mu_neg, np.linalg.inv(Sigma_pool)) - np.dot(mu_pos, np.linalg.inv(Sigma_pool))
        self._F = np.dot(np.dot(mu_pos, np.linalg.inv(Sigma_pool)), mu_pos) - np.dot(np.dot(mu_neg, np.linalg.inv(Sigma_pool)), mu_neg)\
                  - 2*np.log(p_pos/p_neg)
    def predict(self, X):
        cx = np.dot(self._c, X.T)
        Y_hat = 2*cx + self._F
        return np.where(Y_hat < self._threshold, 1, 0)
--Training and prediction for drawing the Bayes decision boundaries
X_train = pima_train[['glu', 'bmi']].values
Y_train = pima_train['type'].map({'Yes':1, 'No':0})
qdf = QuadraticDiscriminantFunction(threshold=0)
qdf.fit(X_train, Y_train)
ldf = LinearDiscriminantFunction(threshold=0)
ldf.fit(X_train, Y_train)
X1_min, X1_max = np.min(X_train[:, 0])-1, np.max(X_train[:, 0])+1
X2_min, X2_max = np.min(X_train[:, 1])-1, np.max(X_train[:, 1])+1
X1, X2 = np.meshgrid(np.arange(X1_min, X1_max, 1), np.arange(X2_min, X2_max, 0.5))
X = np.c_[X1.ravel(), X2.ravel()]
Y_qdf_pred = qdf.predict(X)
Y_ldf_pred = ldf.predict(X)
--Bayes decision boundary obtained from the quadratic discriminant function --Fig. 4.9 (a)
plt.contour(X1, X2, Y_qdf_pred.reshape(X1.shape), colors='limegreen')
for t in ['Yes', 'No']:
    pima_tmp = pima_train[pima_train['type']==t]
    plt.scatter(pima_tmp['glu'], pima_tmp['bmi'], label=t)
plt.legend()
plt.xlabel('glu')
plt.ylabel('bmi')
plt.xlim([50, 200])
plt.ylim([15, 50])
plt.show()
--Error rate of the quadratic discriminant function
X_test = pima_test[['glu', 'bmi']].values
Y_test = pima_test['type'].map({'Yes':1, 'No':0})
Y_train_qdf_pred = qdf.predict(X_train)
print('error rate (train) = {:.1f}%'.format(np.mean(Y_train_qdf_pred != Y_train)*100))
Y_test_qdf_pred = qdf.predict(X_test)
print('error rate (test) = {:.1f}%'.format(np.mean(Y_test_qdf_pred != Y_test)*100))
#output=>
# error rate (train) = 24.5%
# error rate (test) = 23.8%
--Bayes decision boundary obtained from the linear discriminant function --Fig. 4.9 (b)
plt.contour(X1, X2, Y_ldf_pred.reshape(X1.shape), colors='limegreen')
for t in ['Yes', 'No']:
    pima_tmp = pima_train[pima_train['type']==t]
    plt.scatter(pima_tmp['glu'], pima_tmp['bmi'], label=t)
plt.legend()
plt.xlabel('glu')
plt.ylabel('bmi')
plt.xlim([50, 200])
plt.ylim([15, 50])
plt.show()
--Error rate of the linear discriminant function
Y_train_ldf_pred = ldf.predict(X_train)
print('error rate (train) = {:.1f}%'.format(np.mean(Y_train_ldf_pred != Y_train)*100))
Y_test_ldf_pred = ldf.predict(X_test)
print('error rate (test) = {:.1f}%'.format(np.mean(Y_test_ldf_pred != Y_test)*100))
#output=>
# error rate (train) = 24.0%
# error rate (test) = 22.0%
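As a rough cross-check of both classifiers (not in the book), scikit-learn's discriminant analysis classes implement the same normal-distribution-based models; their test error rates should be in the same ballpark as the figures above, though not necessarily identical, since details such as how the pooled covariance is estimated differ.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
for name, clf in [('QDA', QuadraticDiscriminantAnalysis()), ('LDA', LinearDiscriminantAnalysis())]:
    clf.fit(X_train, Y_train)
    print('{} error rate (test) = {:.1f}%'.format(name, np.mean(clf.predict(X_test) != Y_test)*100))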
--ROC curves of the quadratic and linear discriminant functions --Fig. 4.10
qdf_tpr_list = []
qdf_fpr_list = []
ldf_tpr_list = []
ldf_fpr_list = []
# here class 0 ('No', non-diabetic) is treated as the positive class
for th in np.arange(-9, 7, 0.1):
    qdf = QuadraticDiscriminantFunction(threshold=th)
    qdf.fit(X_train, Y_train)
    Y_qdf_pred = qdf.predict(X_test)
    tpr_qdf = len(Y_test[(Y_test==0)&(Y_qdf_pred==0)]) / len(Y_test[Y_test==0])
    qdf_tpr_list.append(tpr_qdf)
    fpr_qdf = len(Y_test[(Y_test==1)&(Y_qdf_pred==0)]) / len(Y_test[Y_test==1])
    qdf_fpr_list.append(fpr_qdf)
    ldf = LinearDiscriminantFunction(threshold=th)
    ldf.fit(X_train, Y_train)
    Y_ldf_pred = ldf.predict(X_test)
    tpr_ldf = len(Y_test[(Y_test==0)&(Y_ldf_pred==0)]) / len(Y_test[Y_test==0])
    ldf_tpr_list.append(tpr_ldf)
    fpr_ldf = len(Y_test[(Y_test==1)&(Y_ldf_pred==0)]) / len(Y_test[Y_test==1])
    ldf_fpr_list.append(fpr_ldf)
plt.plot(qdf_fpr_list, qdf_tpr_list, label='Quadratic')
plt.plot(ldf_fpr_list, ldf_tpr_list, label='Linear')
plt.legend()
plt.xlabel('false positive ratio')
plt.ylabel('true positive ratio')
plt.show()
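The area under each ROC curve can also be estimated from the collected points with the trapezoidal rule (a sketch, not in the book, using a hypothetical auc_from_points helper); the points are sorted by false positive rate first, since the loop traverses thresholds rather than sorted rates.
def auc_from_points(fpr_list, tpr_list):
    # sort by false positive rate, then integrate with the trapezoidal rule
    order = np.argsort(fpr_list)
    return np.trapz(np.array(tpr_list)[order], np.array(fpr_list)[order])
print('AUC (quadratic) = {:.3f}'.format(auc_from_points(qdf_fpr_list, qdf_tpr_list)))
print('AUC (linear) = {:.3f}'.format(auc_from_points(ldf_fpr_list, ldf_tpr_list)))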
I have reproduced the execution examples of Chapter 4 of Hajipata in Python. As for the correctness of the code, I only checked that the drawn figures roughly match those in the book, so if you find any mistakes, I would appreciate an edit request.