- Based on the scikit-learn [tutorial](http://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html#example-neighbors-plot-classification-py)
- Use a labeled dataset
- Change the value of k and the weights
- Implement the k-nearest neighbor method in Python
Use the iris dataset. The input data has four features, but only the first two are used here for easy visualization.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets, metrics
iris = datasets.load_iris()  # 4 features, 150 samples
# Sepal length and width, petal length and width, all in cm
iris_X = iris.data[:, :2]  # Use only the first 2 of the 4 features
iris_y = iris.target  # Correct labels: 3 classes, 0, 1, 2
np.random.seed(0)  # Set the random seed (it does not have to be 0)
indices = np.random.permutation(len(iris_X))  # Randomly permute the numbers 0 to 149
# Split the 150 samples into 115 training samples and 35 test samples
iris_X_train = iris_X[indices[:-35]]
iris_y_train = iris_y[indices[:-35]]
iris_X_test = iris_X[indices[-35:]]
iris_y_test = iris_y[indices[-35:]]
Create three colors, one for each class label. Two colormaps are prepared: a bold one for the training data points (cmap_bold) and a light one for the learned decision regions (cmap_light).
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
The classifier used this time is KNeighborsClassifier. Its most important parameter is k: the classifier finds the k training points closest to a sample and assigns the label by majority vote among them. The default is k = 5. The other parameter examined this time, weights, takes two values, 'uniform' and 'distance':

- uniform: every neighbor gets the same weight regardless of its distance. This is the default.
- distance: neighbors are weighted by the inverse of their distance, so closer points have a larger effect.
There are other parameters such as algorithm; see the [documentation](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html#sklearn.neighbors.RadiusNeighborsClassifier) for details.
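To make the difference between the two weightings concrete, here is a rough sketch in plain NumPy (this is not scikit-learn's actual implementation; the knn_vote helper and the query point are made up for illustration). A uniform vote simply counts the labels of the k nearest training points, while a distance-weighted vote sums 1/distance per label, so closer points count more.

def knn_vote(X_train, y_train, x_query, k=5, weights='uniform'):
    d = np.linalg.norm(X_train - x_query, axis=1)  # Euclidean distance to every training point
    nearest = np.argsort(d)[:k]  # Indices of the k closest points
    w = np.ones(k) if weights == 'uniform' else 1.0 / (d[nearest] + 1e-12)
    votes = np.bincount(y_train[nearest], weights=w, minlength=3)  # (Weighted) votes per class
    return np.argmax(votes)

x_query = np.array([5.8, 3.0])  # A hypothetical sepal length/width pair
print(knn_vote(iris_X_train, iris_y_train, x_query, k=5, weights='uniform'))
print(knn_vote(iris_X_train, iris_y_train, x_query, k=5, weights='distance'))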
h = 0.1 #Mesh size
k_list = [1, 5, 10, 30] #Number of k
weights_list =['uniform', 'distance']
score = np.zeros((len(k_list)*2, 5))  # Holds f1, precision, recall, and k for each (weights, k) combination
Since the input is 2D data, let's display the classification boundary with a color map.
plt.figure(figsize=(8*len(k_list), 12))
i = 1 #For subplot
for weights in weights_list:
    for k in k_list:
        clf = neighbors.KNeighborsClassifier(k, weights=weights)
        clf.fit(iris_X_train, iris_y_train)
        x1_min, x1_max = iris_X[:, 0].min() - 1, iris_X[:, 0].max() + 1  # Minimum and maximum of the first feature
        x2_min, x2_max = iris_X[:, 1].min() - 1, iris_X[:, 1].max() + 1  # Minimum and maximum of the second feature
        # Generate evenly spaced grids from x1_min to x1_max and from x2_min to x2_max in steps of h
        xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h), np.arange(x2_min, x2_max, h))
        # Predict for every mesh point: .ravel() flattens each grid to 1D, np.c_[] pairs xx1 and xx2 column-wise
        Z = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])
        Z = Z.reshape(xx1.shape)  # Reshape back to the grid shape
        plt.subplot(2, len(k_list), i)  # i-th graph in a grid of 2 rows x len(k_list) columns
        plt.pcolormesh(xx1, xx2, Z, cmap=cmap_light)  # Plot the learned decision regions
        plt.scatter(iris_X_train[:, 0], iris_X_train[:, 1], c=iris_y_train, cmap=cmap_bold)  # Plot the training data
        plt.scatter(iris_X_test[:, 0], iris_X_test[:, 1], c=iris_y_test, cmap=cmap_light)  # Plot the test data
        plt.xlim(xx1.min(), xx1.max())
        plt.ylim(xx2.min(), xx2.max())
        plt.title("k = %i, weights = '%s'" % (k, weights), fontsize=30)
        y_pred = clf.predict(iris_X_test)
        score[i-1, 3] = k
        score[i-1, 0] = metrics.f1_score(iris_y_test, y_pred, average='weighted')
        score[i-1, 1] = metrics.precision_score(iris_y_test, y_pred, average='weighted')
        score[i-1, 2] = metrics.recall_score(iris_y_test, y_pred, average='weighted')
        i = i + 1
plt.show()
The result looks like this. The upper row uses uniform weights, the lower row uses distance weights, and k increases toward the right. It's wonderful to be able to visualize it.
- When k is small, the model tends to overfit.
- The distance weighting looks more plausible visually.
- But judging by the f1 score, uniform is slightly better.
- When k = 1, the result is the same whichever weighting is used, since there is only a single neighbor to vote (a quick check is sketched below).
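To confirm the last point, with k = 1 the prediction is decided by the one nearest neighbor, so how that neighbor is weighted cannot matter. A quick check (an extra snippet, not part of the original code):

clf_u = neighbors.KNeighborsClassifier(1, weights='uniform').fit(iris_X_train, iris_y_train)
clf_d = neighbors.KNeighborsClassifier(1, weights='distance').fit(iris_X_train, iris_y_train)
print(np.array_equal(clf_u.predict(iris_X_test), clf_d.predict(iris_X_test)))  # True: with one neighbor, weighting is irrelevant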
As graphing practice, plot the f1 score (together with precision and recall) on the test data.
plt.figure(figsize=(10, 4))
i = 0
for weights in weights_list:
    plt.subplot(1, 2, i+1)
    plt.plot(score[i*len(k_list):(i+1)*len(k_list), 0])  # f1
    plt.plot(score[i*len(k_list):(i+1)*len(k_list), 1])  # precision
    plt.plot(score[i*len(k_list):(i+1)*len(k_list), 2])  # recall
    plt.xticks([0, 1, 2, 3], k_list)  # Relabel the x axis with the values of k
    plt.ylim(score[:, :3].min() - 0.05, 1.05)
    plt.title("weights = %s" % weights)
    plt.legend(('f1', 'prec', 'recall'), loc='upper right')
    plt.xlabel("k_neighbors")
    plt.ylabel("f1, prec, recall")
    i = i + 1
plt.show()
This time, I reduced the number of features to two for visualization, but I would like to see if increasing the number of features can improve accuracy while suppressing overfitting.
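As a starting point for that follow-up, a minimal sketch of what I have in mind (my own assumption of the next step, reusing the same random split but with all four features; the resulting score is not shown here):

iris_X_all = iris.data  # Use all 4 features this time
clf = neighbors.KNeighborsClassifier(5, weights='uniform')
clf.fit(iris_X_all[indices[:-35]], iris_y[indices[:-35]])
pred = clf.predict(iris_X_all[indices[-35:]])
print(metrics.f1_score(iris_y[indices[-35:]], pred, average='weighted'))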