Your multi-layer perceptron is dirty

"By the way, look at my multi-layer perceptron."

"What do you think of this guy?"

"Very ... beautiful ..."

Main subject

The punch line was already used up in the opening.

I had the opportunity to implement a multilayer perceptron from scratch, so this article is a summary of that.

This time, I wrote it so that it can be trained through a scikit-learn-like interface while keeping it reasonably general.

Almost all parameters, such as the number of hidden layers, the number of nodes per layer, the number of training iterations, and the learning rate, can be adjusted by passing arguments. There are traces of an attempt at batch normalization, but normalization using the variance has not been implemented successfully.

This article contains a less general but more concise implementation, so please refer to it as well.

After all, once you make something general enough to handle almost anything, it gets complicated. I want to make it more beautiful. The following trains the multilayer perceptron on the exclusive OR (XOR) problem, a commonly seen example.

mlp.py



import numpy as np

class MultilayerPerceptron:
    """Fully connected sigmoid network trained by mini-batch gradient descent.

    Every layer (hidden and output) applies a sigmoid. A constant 1 is
    appended to the input of every layer, so the last row of each weight
    matrix acts as that layer's bias. The output delta used in training is
    ``(output - target) * output * (1 - output)``, i.e. the gradient of a
    squared error through the sigmoid output.

    Args:
        featureNum: Number of input features.
        rho: Learning rate applied to every weight update.
        classNum: Number of output units (targets are one row per sample
            with ``classNum`` columns).
        hidden: Sequence with the number of units of each hidden layer;
            may be empty for a network without hidden layers.
        normalization: If true, inputs are standardized with the mean and
            standard deviation of the training data.
    """

    def __init__(self, featureNum, rho, classNum, hidden, normalization=True):
        self.featureNum = featureNum
        self.rho = rho
        self.classNum = classNum
        self.hidden = list(hidden)
        self.normalization = normalization
        rc = [self.featureNum] + self.hidden + [self.classNum]
        # One extra input row per layer holds the bias weights.
        self.W = [np.random.randn(n_in + 1, n_out)
                  for n_in, n_out in zip(rc, rc[1:])]
        # Per-layer pre-activations (h) and activations (g) of the last pass.
        self.h = [[] for _ in self.W]
        self.g = [[] for _ in self.W]

    @staticmethod
    def sigmoid(z):
        """Return the element-wise logistic function of the array *z*."""
        return 1.0 / (1.0 + np.exp(-z))

    def _prepare(self, X):
        """Standardize *X* (if enabled) and append the constant bias column."""
        data = np.array(X)
        if self.normalization:
            data = (data - self.X_train_mean) / self._X_scale
        return np.hstack((data, np.ones((len(data), 1))))

    def _forward(self, inputs):
        """Propagate bias-augmented *inputs*, filling ``self.h``/``self.g``."""
        ones = np.ones((len(inputs), 1))
        last = len(self.W) - 1
        layer_in = inputs
        for j, weights in enumerate(self.W):
            self.h[j] = layer_in @ weights
            activated = self.sigmoid(self.h[j])
            # Hidden outputs get a column of ones to match the bias row of
            # the next weight matrix; the output layer does not.
            self.g[j] = activated if j == last else np.hstack((activated, ones))
            layer_in = self.g[j]
        return self.g[-1]

    def _backward(self, targets):
        """Compute the per-layer deltas ``self.eps`` for the last forward pass."""
        out = self.g[-1]
        self.eps[-1] = (out - targets) * out * (1 - out)
        for j in range(len(self.W) - 1, 0, -1):
            back = self.eps[j] @ self.W[j].T * self.g[j - 1] * (1 - self.g[j - 1])
            # The last column belongs to the constant bias input, which has
            # no incoming weights, so it is dropped.
            self.eps[j - 1] = back[:, :-1]

    def _update(self, inputs):
        """Apply one gradient step averaged over the current batch."""
        n = len(inputs)
        layer_inputs = [inputs] + self.g[:-1]
        for j, layer_in in enumerate(layer_inputs):
            self.W[j] -= self.rho * layer_in.T @ self.eps[j] / n

    def fit(self, X, y, learn_times=1000, batch_size="full"):
        """Train the network.

        Args:
            X: Training samples, one row per sample.
            y: Targets, one row per sample with ``classNum`` columns.
            learn_times: Number of passes over the (reshuffled) data.
            batch_size: ``"full"`` for one batch per pass, otherwise the
                approximate number of samples per mini-batch. Values larger
                than the data set fall back to a single batch.

        Returns:
            The fitted estimator itself.

        Raises:
            ValueError: If *X* is empty, *X* and *y* have different numbers
                of rows, or *batch_size* is smaller than 1.
        """
        raw = np.array(X)
        n_samples = len(raw)
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        self.y = np.array(y, ndmin=2)
        if len(self.y) != n_samples:
            raise ValueError("X and y must have the same number of rows")
        self.learn_times = learn_times
        self.batch_size = n_samples if batch_size == "full" else int(batch_size)
        if self.batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        self.X_train_mean = raw.mean(axis=0)
        self.X_train_sigma = raw.std(axis=0)
        # A constant feature has zero deviation; divide by 1 instead so the
        # standardized value is 0 rather than NaN.
        self._X_scale = np.where(self.X_train_sigma == 0, 1.0, self.X_train_sigma)
        self.X = self._prepare(raw)

        self.eps = [[] for _ in self.W]
        # np.array_split needs at least one section, also when batch_size
        # exceeds the number of samples.
        n_batches = max(1, n_samples // self.batch_size)

        for _ in range(self.learn_times):
            order = np.random.permutation(n_samples)
            X_batches = np.array_split(self.X[order], n_batches, 0)
            y_batches = np.array_split(self.y[order], n_batches, 0)
            for X_batch, y_batch in zip(X_batches, y_batches):
                # Leftover of a batch-normalization experiment; the value is
                # stored but not used by the update.
                self.X_batch_mean = X_batch.mean(axis=0)
                self._forward(X_batch)
                self._backward(y_batch)
                self._update(X_batch)
        return self

    def pred(self, X_test):
        """Return the index of the largest output unit for each row of *X_test*."""
        self.X_test = self._prepare(X_test)
        return np.argmax(self._forward(self.X_test), axis=1)

    def score(self, X_test, y_test):
        """Return the fraction of rows whose prediction matches ``argmax(y_test)``."""
        self.X_test = np.array(X_test)
        self.y_test = np.array(y_test)
        n_rows = len(self.X_test)

        self.loss_vector = (np.argmax(self.y_test, axis=1) == self.pred(X_test))

        return np.count_nonzero(self.loss_vector) / n_rows


# XOR demo with one-hot targets. Inside the class a constant 1 is appended
# to the input and to every hidden layer's output to serve as the bias input.
x = [[0, 0], [0, 1], [1, 0], [1, 1]]
y = [[1, 0], [0, 1], [0, 1], [1, 0]]
mlp = MultilayerPerceptron(featureNum=2, rho=1, classNum=2, hidden=[4, 3])
mlp.fit(x, y, learn_times=1000, batch_size=2)
print(mlp.pred(x))
print(mlp.score(x, y))

Click here for the version without comments

mlp.py


import numpy as np

class MultilayerPerceptron:
    """Fully connected sigmoid network trained by mini-batch gradient descent.

    Every layer (hidden and output) applies a sigmoid. A constant 1 is
    appended to the input of every layer, so the last row of each weight
    matrix acts as that layer's bias. The output delta used in training is
    ``(output - target) * output * (1 - output)``, i.e. the gradient of a
    squared error through the sigmoid output.

    Args:
        featureNum: Number of input features.
        rho: Learning rate applied to every weight update.
        classNum: Number of output units (targets are one row per sample
            with ``classNum`` columns).
        hidden: Sequence with the number of units of each hidden layer;
            may be empty for a network without hidden layers.
        normalization: If true, inputs are standardized with the mean and
            standard deviation of the training data.
    """

    def __init__(self, featureNum, rho, classNum, hidden, normalization=True):
        self.featureNum = featureNum
        self.rho = rho
        self.classNum = classNum
        self.hidden = list(hidden)
        self.normalization = normalization
        rc = [self.featureNum] + self.hidden + [self.classNum]
        # One extra input row per layer holds the bias weights.
        self.W = [np.random.randn(n_in + 1, n_out)
                  for n_in, n_out in zip(rc, rc[1:])]
        # Per-layer pre-activations (h) and activations (g) of the last pass.
        self.h = [[] for _ in self.W]
        self.g = [[] for _ in self.W]

    @staticmethod
    def sigmoid(z):
        """Return the element-wise logistic function of the array *z*."""
        return 1.0 / (1.0 + np.exp(-z))

    def _prepare(self, X):
        """Standardize *X* (if enabled) and append the constant bias column."""
        data = np.array(X)
        if self.normalization:
            data = (data - self.X_train_mean) / self._X_scale
        return np.hstack((data, np.ones((len(data), 1))))

    def _forward(self, inputs):
        """Propagate bias-augmented *inputs*, filling ``self.h``/``self.g``."""
        ones = np.ones((len(inputs), 1))
        last = len(self.W) - 1
        layer_in = inputs
        for j, weights in enumerate(self.W):
            self.h[j] = layer_in @ weights
            activated = self.sigmoid(self.h[j])
            # Hidden outputs get a column of ones to match the bias row of
            # the next weight matrix; the output layer does not.
            self.g[j] = activated if j == last else np.hstack((activated, ones))
            layer_in = self.g[j]
        return self.g[-1]

    def _backward(self, targets):
        """Compute the per-layer deltas ``self.eps`` for the last forward pass."""
        out = self.g[-1]
        self.eps[-1] = (out - targets) * out * (1 - out)
        for j in range(len(self.W) - 1, 0, -1):
            back = self.eps[j] @ self.W[j].T * self.g[j - 1] * (1 - self.g[j - 1])
            # The last column belongs to the constant bias input, which has
            # no incoming weights, so it is dropped.
            self.eps[j - 1] = back[:, :-1]

    def _update(self, inputs):
        """Apply one gradient step averaged over the current batch."""
        n = len(inputs)
        layer_inputs = [inputs] + self.g[:-1]
        for j, layer_in in enumerate(layer_inputs):
            self.W[j] -= self.rho * layer_in.T @ self.eps[j] / n

    def fit(self, X, y, learn_times=1000, batch_size="full"):
        """Train the network.

        Args:
            X: Training samples, one row per sample.
            y: Targets, one row per sample with ``classNum`` columns.
            learn_times: Number of passes over the (reshuffled) data.
            batch_size: ``"full"`` for one batch per pass, otherwise the
                approximate number of samples per mini-batch. Values larger
                than the data set fall back to a single batch.

        Returns:
            The fitted estimator itself.

        Raises:
            ValueError: If *X* is empty, *X* and *y* have different numbers
                of rows, or *batch_size* is smaller than 1.
        """
        raw = np.array(X)
        n_samples = len(raw)
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        self.y = np.array(y, ndmin=2)
        if len(self.y) != n_samples:
            raise ValueError("X and y must have the same number of rows")
        self.learn_times = learn_times
        self.batch_size = n_samples if batch_size == "full" else int(batch_size)
        if self.batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        self.X_train_mean = raw.mean(axis=0)
        self.X_train_sigma = raw.std(axis=0)
        # A constant feature has zero deviation; divide by 1 instead so the
        # standardized value is 0 rather than NaN.
        self._X_scale = np.where(self.X_train_sigma == 0, 1.0, self.X_train_sigma)
        self.X = self._prepare(raw)

        self.eps = [[] for _ in self.W]
        # np.array_split needs at least one section, also when batch_size
        # exceeds the number of samples.
        n_batches = max(1, n_samples // self.batch_size)

        for _ in range(self.learn_times):
            order = np.random.permutation(n_samples)
            X_batches = np.array_split(self.X[order], n_batches, 0)
            y_batches = np.array_split(self.y[order], n_batches, 0)
            for X_batch, y_batch in zip(X_batches, y_batches):
                # Leftover of a batch-normalization experiment; the value is
                # stored but not used by the update.
                self.X_batch_mean = X_batch.mean(axis=0)
                self._forward(X_batch)
                self._backward(y_batch)
                self._update(X_batch)
        return self

    def pred(self, X_test):
        """Return the index of the largest output unit for each row of *X_test*."""
        self.X_test = self._prepare(X_test)
        return np.argmax(self._forward(self.X_test), axis=1)

    def score(self, X_test, y_test):
        """Return the fraction of rows whose prediction matches ``argmax(y_test)``."""
        self.X_test = np.array(X_test)
        self.y_test = np.array(y_test)
        n_rows = len(self.X_test)

        self.loss_vector = (np.argmax(self.y_test, axis=1) == self.pred(X_test))

        return np.count_nonzero(self.loss_vector) / n_rows

# XOR demo: four 2-bit inputs with one-hot targets, trained in mini-batches of 2.
x = [[0, 0], [0, 1], [1, 0], [1, 1]]
y = [[1, 0], [0, 1], [0, 1], [1, 0]]
mlp = MultilayerPerceptron(featureNum=2, rho=1, classNum=2, hidden=[4, 3])
mlp.fit(x, y, learn_times=1000, batch_size=2)
print(mlp.pred(x))
print(mlp.score(x, y))


Summary

Written out of over-the-top joking and momentum, as escapism before exams.

Recommended Posts

Your multi-layer perceptron is dirty
Your threading.Event is used incorrectly
What is your "Tanimoto coefficient"?
Multilayer Perceptron with Chainer: Function Fitting
[Chainer] Learning XOR with multi-layer perceptron
Implement the multi-layer perceptron very neatly