**How do these differences affect the prediction results?** The case of the classification model was shown as an example last time.
**Furthermore, I would like to compare the case of the regression model.**
import numpy as np
import pandas as pd
# scikit-learn library
from sklearn.datasets import load_boston #Boston Home Price Dataset
from sklearn.model_selection import train_test_split #Data split utility
from sklearn.neighbors import KNeighborsRegressor # k-NR regression model method
#Visualization library
import matplotlib.pyplot as plt
import seaborn as sns
#Japanese display module of matplotlib
!pip install japanize-matplotlib
import japanize_matplotlib
# Load the Boston house-price dataset bundled with scikit-learn.
# NOTE(review): load_boston was removed in scikit-learn 1.2 - confirm the
# installed version still provides it.
boston = load_boston()
# Explanatory variables become the DataFrame columns, named after the features
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# Append the objective variable (house price) as the last column, 'MEDV'
df['MEDV'] = boston.target
print(df)
# Create a correlation matrix.
# np.corrcoef treats each ROW as one variable, so the DataFrame is transposed
# to obtain the correlation between columns (variables).
correlation_matrix = np.corrcoef(df.T)
# Row / column labels: taken from the DataFrame itself so they always match
# the order of the columns that were actually correlated.
names = list(df.columns)
# Convert correlation matrix to a labelled DataFrame
correlation_df = pd.DataFrame(correlation_matrix, columns=names, index=names)
# Draw heatmap; annot=True writes the coefficient inside every cell
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_df, annot=True, cmap='coolwarm')
The `corrcoef()` function is used to build the correlation matrix; the rows and columns of the passed data are transposed with `.T` so that the correlation is calculated between variables.
The `annot=True` argument of `heatmap()` displays the value for each cell in the figure.
#Extract only 2 variables
# Keep only the explanatory variable RM and the objective variable MEDV
df_extraction = df[['RM', 'MEDV']]
# Set variables X and y as single-column 2D arrays (one row per sample);
# reshape(-1, 1) lets NumPy infer the number of rows.
X = np.array(df_extraction['RM']).reshape(-1, 1)
y = np.array(df_extraction['MEDV']).reshape(-1, 1)
# Data division for training / testing (random_state fixed for a repeatable split).
# The split selects rows only, so the four results are already 2D column
# arrays and need no further reshaping.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Extract only the explanatory variable `RM` and the objective variable `MEDV`, set them as the variables X and y, and divide each of them into training and test data.
#Variables to store the correct answer rate
# Scores on the training and test data, one entry per k
train_accuracy = []
test_accuracy = []
for k in range(1, 21):
    kNR = KNeighborsRegressor(n_neighbors=k)  # Instance generation
    kNR.fit(X_train, y_train)  # Learning
    # For a regressor, score() is the coefficient of determination (R^2)
    train_accuracy.append(kNR.score(X_train, y_train))  # Training score
    test_accuracy.append(kNR.score(X_test, y_test))  # Test score
# Rebind BOTH lists to arrays under their own names, so train_accuracy and
# test_accuracy have the same type and can be subtracted element-wise.
train_accuracy = np.array(train_accuracy)
test_accuracy = np.array(test_accuracy)
# Plot how the training and test scores change as k goes from 1 to 20
plt.figure(figsize=(6, 4))
for scores, series_label in ((train_accuracy, 'Training'), (test_accuracy, 'test')):
    plt.plot(range(1, 21), scores, label=series_label)
plt.title('Transition of correct answer rate')
plt.xlabel('k number')
plt.ylabel('Correct answer rate')
plt.xticks(np.arange(0, 21, 1))  # one tick per integer on the x-axis
plt.grid()
plt.legend()
# Plot the absolute gap between the training and test scores for each k
plt.figure(figsize=(6, 4))
# Element-wise |train - test|
difference = np.abs(train_accuracy - test_accuracy)
plt.plot(range(1, 21), difference, label='Difference')
plt.title('Transition of difference in correct answer rate')
plt.xlabel('k number')
plt.ylabel('Difference(train - test)')
plt.xticks(np.arange(0, 21, 1))  # one tick per integer on the x-axis
plt.grid()
plt.legend()
# Display the figures created so far
plt.show()
# Evenly spaced query points: start value, end value, element count
t = np.linspace(1, 10, 1000)
# predict() is given a 2D array, so turn the points into a single column
T = t.reshape(-1, 1)
n_neighbors = 14
plt.figure(figsize=(12, 5))
# One panel per weighting scheme, drawn side by side
for i, w in enumerate(['uniform', 'distance']):
    # Fit on the full X, y (not only the training split) and predict along T
    model = KNeighborsRegressor(n_neighbors=n_neighbors, weights=w).fit(X, y)
    y_ = model.predict(T)
    plt.subplot(1, 2, i + 1)
    plt.scatter(X, y, color='limegreen', label='data')
    plt.plot(T, y_, color='navy', lw=1, label='Predicted value')
    plt.legend()
    plt.title("weights = '%s'" % w)
# Adjust spacing once both panels exist, then display
plt.tight_layout()
plt.show()
`tight_layout()` automatically adjusts the subplot parameters (axis scale, axis label, title range) so that the subplots fit snugly within the area of the figure.
Recommended Posts