**How do these differences affect the forecast results?** Last time, the case of a classification model was shown as an example.

**Furthermore, this time I would like to compare the case of a regression model.**
import numpy as np
import pandas as pd
# scikit-learn library
from sklearn.datasets import load_boston #Boston Home Price Dataset
from sklearn.model_selection import train_test_split #Data split utility
from sklearn.neighbors import KNeighborsRegressor # k-NR regression model method
#Visualization library
import matplotlib.pyplot as plt
import seaborn as sns
#Japanese display module of matplotlib
!pip install japanize-matplotlib
import japanize_matplotlib
# Load the Boston home-price dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this step needs an older scikit-learn release to run.
boston = load_boston()
# Explanatory variables as a DataFrame, one column per feature name
features = pd.DataFrame(boston.data, columns=boston.feature_names)
# Objective variable as a single-column DataFrame named MEDV
target = pd.DataFrame(boston.target, columns=['MEDV'])
# Join side by side so that MEDV becomes the last column
df = pd.concat([features, target], axis=1)
print(df)

# Correlation matrix between all columns of df.
# np.corrcoef treats each ROW as one variable by default, so the frame is
# transposed to make every column (feature) a variable.
correlation_matrix = np.corrcoef(df.T)
# Row / column labels: taken from df itself so they can never drift out of
# sync with the actual column order (previously a hard-coded list).
names = list(df.columns)
# Wrap the matrix in a labelled DataFrame so the heatmap axes show the names
correlation_df = pd.DataFrame(correlation_matrix, columns=names, index=names)
# Draw the heatmap; annot=True prints the coefficient inside each cell
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_df, annot=True, cmap='coolwarm')
NumPy's `corrcoef()` function is used for the correlation matrix. Because it treats each row as one variable, the rows and columns of the passed data are transposed with `.T` so that the correlation between the variables (columns) is calculated. The `annot=True` argument of `heatmap()` displays the value of each cell in the figure.
# Keep only the explanatory variable RM and the objective variable MEDV
df_extraction = df[['RM', 'MEDV']]
# Build X and y as 2-D column vectors of shape (n_samples, 1);
# -1 lets NumPy infer the row count instead of passing len() by hand.
X = np.array(df_extraction['RM']).reshape(-1, 1)
y = np.array(df_extraction['MEDV']).reshape(-1, 1)
# Split into training / test sets; random_state=0 makes the shuffle repeatable.
# train_test_split only selects rows, so the 2-D inputs come back 2-D and the
# former per-split reshape calls were no-ops and have been dropped.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Extract only the explanatory variable `RM` and the objective variable `MEDV`, and divide them into variables X and y for training and testing, respectively.
#Variable to store the correct answer rate
# Fit a k-NN regressor for every k in 1..20 and record its score on the
# training and test splits.
# NOTE: for a regressor, score() returns the coefficient of determination
# (R^2), not a classification accuracy; the plot labels below are kept as
# they were in the original.
k_values = range(1, 21)
train_accuracy = []
test_accuracy = []
for k in k_values:
    kNR = KNeighborsRegressor(n_neighbors=k)  # Instance generation
    kNR.fit(X_train, y_train)  # Learning
    train_accuracy.append(kNR.score(X_train, y_train))  # Training score
    test_accuracy.append(kNR.score(X_test, y_test))  # Test score

# Convert both score lists to arrays so they can be subtracted element-wise.
# The original bound the training array to `training_accuracy` only, leaving
# `train_accuracy` a plain list; both names now refer to the array.
train_accuracy = np.array(train_accuracy)
training_accuracy = train_accuracy  # alias kept for backward compatibility
test_accuracy = np.array(test_accuracy)

# Changes in the training and test scores as k grows
plt.figure(figsize=(6, 4))
plt.plot(k_values, train_accuracy, label='Training')
plt.plot(k_values, test_accuracy, label='test')
plt.xticks(np.arange(0, 21, 1))  # x-axis scale
plt.xlabel('k number')
plt.ylabel('Correct answer rate')
plt.title('Transition of correct answer rate')
plt.grid()
plt.legend()

# Transition of the absolute gap between training and test scores
plt.figure(figsize=(6, 4))
difference = np.abs(train_accuracy - test_accuracy)  # Calculate the difference
plt.plot(k_values, difference, label='Difference')
plt.xticks(np.arange(0, 21, 1))  # x-axis scale
plt.xlabel('k number')
plt.ylabel('Difference(train - test)')
plt.title('Transition of difference in correct answer rate')
plt.grid()
plt.legend()
plt.show()

# Evenly spaced query points along the RM axis at which to draw the
# prediction curve: start value, end value, element count.
t = np.linspace(1, 10, 1000)
# Convert to a 2-D column vector, as predict() expects (n_samples, n_features);
# -1 infers the row count so it cannot disagree with linspace above.
T = t.reshape(-1, 1)
n_neighbors = 14
plt.figure(figsize=(12, 5))
# One subplot per weighting scheme: 'uniform' weights all neighbours equally,
# 'distance' weights them by the inverse of their distance.
for i, w in enumerate(['uniform', 'distance']):
    model = KNeighborsRegressor(n_neighbors, weights=w)
    model = model.fit(X, y)  # fit on the full data set for visualisation
    y_ = model.predict(T)
    plt.subplot(1, 2, i + 1)
    plt.scatter(X, y, color='limegreen', label='data')
    plt.plot(T, y_, color='navy', lw=1, label='Predicted value')
    plt.legend()
    plt.title("weights = '%s'" % (w))
# Adjust subplot spacing once, after both panels have been drawn
plt.tight_layout()
plt.show()
`tight_layout()` automatically adjusts the subplot parameters so that each subplot, including its axis ticks, axis labels, and title, fits snugly within the figure area.
Recommended Posts