"What do you think of this guy?"
"Very ... beautiful ..."
Feels like a punch line.
I recently had the occasion to implement a multilayer perceptron from scratch, so this article is a summary of that.
This time I wrote it to be reasonably general, so that it can be trained through an interface similar to scikit-learn's.
Almost every parameter, such as the number of hidden layers, the number of nodes per layer, the number of training iterations, and the learning rate, can be adjusted through arguments. There are traces of an attempt at batch normalization, but I never managed to get the normalization using the batch statistics to work.
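For reference, the following is a minimal sketch of the kind of per-mini-batch standardization I was aiming for; the function name standardize_batch and the small epsilon added for numerical stability are purely illustrative and do not appear in the class below.

import numpy as np

def standardize_batch(X_batch, eps=1e-8):
    # Standardize a mini-batch with its own mean and variance (illustration only)
    mean = X_batch.mean(axis=0)
    var = X_batch.var(axis=0)
    return (X_batch - mean) / np.sqrt(var + eps)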
A less general but more concise version is covered in another article, so please refer to that as well.
As you might expect, making it general enough to handle most situations also makes it complicated, and I would like to clean it up further. The following trains the multilayer perceptron on the (frequently seen) exclusive-OR problem.
mlp.py
import numpy as np


class MultilayerPerceptron:
    def __init__(self, featureNum, rho, classNum, hidden, normalization=True):
        self.featureNum = featureNum
        self.rho = rho
        self.classNum = classNum
        self.hidden = hidden
        self.normalization = normalization
        rc = [self.featureNum]+self.hidden+[self.classNum]
        self.W = [np.random.randn(rc[i]+1, rc[i+1]) for i in range(len(hidden)+1)]
        self.h = [[] for _ in range(len(self.hidden)+1)]
        self.g = [[] for _ in range(len(self.hidden)+1)]

    def fit(self, X, y, learn_times=1000, batch_size="full"):
        self.X = np.array(X)
        one = np.ones((len(self.X), 1))
        self.learn_times = learn_times
        self.X_train_mean = np.sum(self.X, axis=0)/len(self.X)
        self.X_train_sigma = np.var(self.X, axis=0)**0.5
        if (self.normalization):
            self.X = (self.X - self.X_train_mean)/self.X_train_sigma
        self.X = np.hstack((self.X, one))
        self.y = np.tile(y, (1, 1))
        self.eps = [[] for _ in range(len(self.hidden)+1)]
        self.sigmoid = np.vectorize(lambda x: 1.0 / (1.0 + np.exp(-x)))
        self.batch_size = len(self.X) if (batch_size == "full") else int(batch_size)
        for _ in range(self.learn_times):
            self.shuffled_index = np.random.permutation(len(self.X))
            self.X_shuffled = self.X[self.shuffled_index]
            self.y_shuffled = self.y[self.shuffled_index]
            self.X_batch_list = np.array_split(self.X_shuffled, len(self.X)//self.batch_size, 0)
            self.y_batch_list = np.array_split(self.y_shuffled, len(self.y)//self.batch_size, 0)
            for p in range(len(self.X_batch_list)):
                self.X_batchp = self.X_batch_list[p]
                self.y_batchp = self.y_batch_list[p]
                # Mini-batch mean; a leftover from the batch-normalization attempt (not used below)
                self.X_batch_mean = np.sum(self.X_batchp, axis=0) / self.batch_size
                one_batchp = np.ones((len(self.X_batchp), 1))
                # The input layer has no activation function to evaluate; compute the outputs of every layer after it
                self.h[0] = self.X_batchp @ self.W[0]
                self.g[0] = np.hstack((self.sigmoid(self.h[0]), one_batchp))
                for j in range(1, len(self.hidden)):
                    # The weight matrices were created with matching shapes in __init__, so forward propagation is just a chain of matrix products
                    self.h[j] = self.g[j-1] @ self.W[j]
                    # For every layer except the last, append a column of 1s so the output matches the extended (bias-augmented) weight matrix of the next layer
                    self.g[j] = np.hstack((self.sigmoid(self.h[j]), one_batchp))
                self.h[-1] = self.g[-2] @ self.W[-1]
                self.g[-1] = self.sigmoid(self.h[-1])
                # The input layer's output is the raw input (identity activation), so the only special case is to use the input values themselves when updating the first weight matrix
                self.eps[-1] = np.array((self.g[-1] - self.y_batchp) * self.g[-1]*(1-self.g[-1]))
                for j in range(1, len(self.eps)):
                    # Propagate the error backwards: matrix product with the weights, then element-wise product with the sigmoid derivative g*(1-g)
                    # The last column corresponds to the bias node, which no earlier layer feeds into, so delete that column
                    self.eps[-(j+1)] = self.eps[-j] @ self.W[-j].T * self.g[-(j+1)]*(1-self.g[-(j+1)])
                    self.eps[-(j+1)] = np.delete(self.eps[-(j+1)], -1, axis=1)
                self.W[0] -= self.rho * self.X_batchp.T @ self.eps[0] / len(self.X_batchp)
                for j in range(1, len(self.hidden)+1):
                    self.W[j] -= self.rho * self.g[j-1].T @ self.eps[j] / len(self.X_batchp)

    def pred(self, X_test):
        self.X_test = np.array(X_test)
        one = np.ones((len(self.X_test), 1))
        if (self.normalization):
            self.X_test = (self.X_test - self.X_train_mean)/self.X_train_sigma
        self.X_test = np.hstack((self.X_test, one))
        self.h[0] = self.X_test @ self.W[0]
        self.g[0] = np.hstack((self.sigmoid(self.h[0]), one))
        for j in range(1, len(self.hidden)):
            self.h[j] = self.g[j-1] @ self.W[j]
            self.g[j] = np.hstack((self.sigmoid(self.h[j]), one))
        self.h[-1] = self.g[-2] @ self.W[-1]
        self.g[-1] = self.sigmoid(self.h[-1])
        return np.argmax(self.g[-1], axis=1)

    def score(self, X_test, y_test):
        self.X_test = np.array(X_test)
        self.y_test = np.array(y_test)
        self.loss_vector = (np.argmax(np.array(self.y_test), axis=1) == self.pred(self.X_test))
        return np.count_nonzero(self.loss_vector)/len(self.X_test)


# A bias node (constant input of 1) is appended to the output of the input layer and of each hidden layer
mlp = MultilayerPerceptron(featureNum=2, rho=1, classNum=2, hidden=[4, 3])
x = [[0, 0], [0, 1], [1, 0], [1, 1]]
y = [[1, 0], [0, 1], [0, 1], [1, 0]]
mlp.fit(x, y, 1000, 2)
print(mlp.pred(x))
print(mlp.score(x, y))
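To make the backpropagation step easier to follow, here is a minimal standalone sketch of the same delta recursion for a single hidden layer, using the same extended-weight convention (a column of 1s appended to each layer's output). The variable names (W0, W1, g0, g1, eps0, eps1) are chosen purely for illustration and are not part of the class above.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.default_rng(0)
X = np.hstack((rng.random((4, 2)), np.ones((4, 1))))  # inputs with a bias column of 1s
y = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])

W0 = rng.standard_normal((3, 4))  # (2 features + bias) -> 4 hidden nodes
W1 = rng.standard_normal((5, 2))  # (4 hidden + bias)   -> 2 outputs

# Forward pass with the extended-weight convention
g0 = np.hstack((sigmoid(X @ W0), np.ones((len(X), 1))))
g1 = sigmoid(g0 @ W1)

# Output-layer error: (prediction - target) * sigmoid'(h)
eps1 = (g1 - y) * g1 * (1 - g1)
# Hidden-layer error: propagate through W1, multiply by sigmoid', then drop the bias column
eps0 = np.delete(eps1 @ W1.T * g0 * (1 - g0), -1, axis=1)

# Gradient-descent update averaged over the batch (rho is the learning rate)
rho = 1.0
W1 -= rho * g0.T @ eps1 / len(X)
W0 -= rho * X.T @ eps0 / len(X)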
Getting carried away, momentum, and escapism before an exam.