I'm reading a masterpiece, **Deep Learning from Scratch**. This post is my memo for Chapter 5. To run the code, download the whole repository from GitHub and run it with Jupyter Notebook in the ch05 directory.
At the end of Chapter 5 there is code (twoLayerNet.py & train_neuralnet.py) that generates layers and trains the network by backpropagation. However, code that does backpropagation without generating layers already appears in Chapter 4 (also twoLayerNet.py & train_neuralnet.py), so I will start from that.
As in the Chapter 4 memo, I combine the two files into a single script for readability. It is almost identical to last time; only the `# ----------------- Gradient calculation -------------------` part is different. Let's run it first.
```python
import sys, os
sys.path.append(os.pardir)      # Make files in the parent directory importable
from common.functions import *  # Use all functions defined in common/functions.py
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist


class TwoLayerNet:

    # Parameter initialization
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    # Forward propagation
    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        return y

    # Loss calculation
    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)

    # Accuracy calculation
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # ----------------- Gradient calculation -------------------
    def gradient(self, x, t):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}
        batch_num = x.shape[0]

        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)

        # backward
        dy = (y - t) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)
        dz1 = np.dot(dy, W2.T)
        da1 = sigmoid_grad(a1) * dz1
        grads['W1'] = np.dot(x.T, da1)
        grads['b1'] = np.sum(da1, axis=0)
        return grads
    # ------------------------------------------------


# Data loading
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# Instantiate TwoLayerNet
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Initial settings
iters_num = 10000  # number of iterations
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
train_loss_list, train_acc_list, test_acc_list = [], [], []
iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient calculation
    grad = network.gradient(x_batch, t_batch)

    # Parameter update
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Accuracy display
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))

# Drawing the graph
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()
```
Great! The execution speed is in a completely different league from numerical differentiation. It is blazingly fast! (Or rather, numerical differentiation is extremely slow.)
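If you want to see the difference for yourself, here is a rough timing sketch. It assumes that `common/gradient.py` provides `numerical_gradient(f, x)`, as in the book's repository; it is not part of the script above, and the exact numbers will depend on your machine.

```python
# Rough timing sketch (assumption: common/gradient.py provides numerical_gradient(f, x),
# as in the book's repository). Run this after the script above has defined `network`,
# `x_train` and `t_train`. Numerical differentiation is slow, so use a tiny batch.
import time
from common.gradient import numerical_gradient

x_batch, t_batch = x_train[:3], t_train[:3]

# Backpropagation
start = time.time()
grad_backprop = network.gradient(x_batch, t_batch)
print("backprop      :", time.time() - start, "sec")

# Numerical differentiation: perturb each parameter element and re-evaluate the loss
loss_W = lambda W: network.loss(x_batch, t_batch)
start = time.time()
grad_numerical = {key: numerical_gradient(loss_W, network.params[key])
                  for key in ('W1', 'b1', 'W2', 'b2')}
print("numerical diff:", time.time() - start, "sec")  # expect this to take much longer
```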
Everything in this code except the `# ----------------- Gradient calculation -------------------` part is explained in the [Chapter 4 memo](https://qiita.com/jun40vn/items/be171ff7626d370072d1), so please refer to it if needed. In this chapter, only the gradient-calculation part is explained.
If you trace part of the gradient-calculation code as a computational graph, the role of each line becomes clear. The sum in `grads['b2'] = np.sum(dy, axis=0)` is what handles batch processing: each row of `dy` is one sample's error signal, and the bias gradient adds them up over the batch. Even so, it is impressive that the seemingly complicated backpropagation reduces to plain matrix operations.
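To see what that sum does, here is a tiny example with made-up numbers.

```python
# Tiny illustration with made-up numbers: the bias gradient is the per-sample
# error signals summed over the batch axis.
import numpy as np

dy = np.array([[0.1, -0.2,  0.1],    # sample 1
               [0.0,  0.3, -0.3]])   # sample 2
print(dy.shape)            # (batch_size, output_size) = (2, 3)
print(np.sum(dy, axis=0))  # [ 0.1  0.1 -0.2] -> becomes grads['b2']
```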
First, the chain rule gives $\frac{\partial L}{\partial W_2} = z_1^T \cdot dy$, which is exactly `grads['W2'] = np.dot(z1.T, dy)`.
Next, it gives $\frac{\partial L}{\partial z_1} = \frac{\partial L}{\partial a_2} W_2^T = dy \cdot W_2^T$, which corresponds to `dz1 = np.dot(dy, W2.T)`.
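For reference, here is the whole backward pass written out, with $N$ for the batch size (`batch_num` in the code), $\sigma$ for the sigmoid, and $\odot$ for the element-wise product. Each line corresponds to one line of the `# backward` block above.

```math
\begin{align}
dy &\equiv \frac{\partial L}{\partial a_2} = \frac{y - t}{N} \\
\frac{\partial L}{\partial W_2} &= z_1^T \, dy, &
\frac{\partial L}{\partial b_2} &= \textstyle\sum_{\mathrm{batch}} dy \\
\frac{\partial L}{\partial z_1} &= dy \, W_2^T, &
\frac{\partial L}{\partial a_1} &= \sigma'(a_1) \odot \frac{\partial L}{\partial z_1} \\
\frac{\partial L}{\partial W_1} &= x^T \, \frac{\partial L}{\partial a_1}, &
\frac{\partial L}{\partial b_1} &= \textstyle\sum_{\mathrm{batch}} \frac{\partial L}{\partial a_1}
\end{align}
```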
The shift from the world of numerical differentiation to the world of backpropagation with matrix operations really is revolutionary, isn't it?
As we just saw, writing the matrix-operation expressions directly into the code for both forward propagation and backpropagation gives practical execution speed, but writing the code that way is a bit tedious. So there is a way to write it more simply: generate layers. To generate layers, first import `OrderedDict` with `from collections import OrderedDict`.
```python
class TwoLayerNet:

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # ................ (parameter initialization, same as before)

        # Layer generation
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        self.lastLayer = SoftmaxWithLoss()
```
During the initialization of the `TwoLayerNet` class, an `OrderedDict` is instantiated with `self.layers = OrderedDict()`. An `OrderedDict` remembers insertion order, so when the layer names and their processing (`Affine1`, `Relu1`, `Affine2`) are registered in the dictionary `self.layers` one after another, the order is remembered as well. Only the final layer has a different backpropagation formula, so it is handled separately as `self.lastLayer`.
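For reference, here is a minimal sketch of what these layer classes roughly look like. The real implementations are in the book's `common/layers.py` and differ in some details (for example, the real `Affine` also handles tensor inputs and the real `SoftmaxWithLoss` also accepts labels that are not one-hot); this sketch assumes 2-D inputs and one-hot labels.

```python
# Minimal sketch of the layer classes, assuming 2-D inputs and one-hot labels.
# The real versions live in the book's common/layers.py.
import numpy as np
from common.functions import softmax, cross_entropy_error

class Relu:
    def forward(self, x):
        self.mask = (x <= 0)      # remember where the input was non-positive
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0       # no gradient flows where the input was clipped
        return dout

class Affine:
    def __init__(self, W, b):
        self.W, self.b = W, b
        self.x = None
        self.dW, self.db = None, None

    def forward(self, x):
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        self.dW = np.dot(self.x.T, dout)   # same x.T @ dout pattern as before
        self.db = np.sum(dout, axis=0)     # sum over the batch, as for grads['b2']
        return np.dot(dout, self.W.T)      # gradient passed on to the previous layer

class SoftmaxWithLoss:
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        return cross_entropy_error(self.y, t)

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        return (self.y - self.t) * dout / batch_size   # the familiar (y - t) / batch_num
```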
```python
    # Forward propagation
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    # Forward propagation / loss calculation
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)
```
Forward propagation `predict` then simply takes the layers from the dictionary `self.layers` one by one and applies each one's `forward` method, so no matter how many layers there are, this code is all you need. The loss calculation `loss` likewise propagates forward and then only runs the `forward` of the final layer via `self.lastLayer.forward(y, t)`, so even if the processing in the final layer changes, this code stays the same.
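As a quick sanity check (a sketch with dummy data, using the complete `TwoLayerNet` class defined further down in this post), `predict` returns the class scores produced by the forward loop and `loss` returns a single cross-entropy value:

```python
# Sketch with made-up data, using the full TwoLayerNet defined later in this post.
import numpy as np

net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
x_dummy = np.random.rand(5, 784)                    # 5 fake "images"
t_dummy = np.eye(10)[np.random.randint(0, 10, 5)]   # 5 one-hot labels
print(net.predict(x_dummy).shape)   # (5, 10): scores from Affine1 -> Relu1 -> Affine2
print(net.loss(x_dummy, t_dummy))   # scalar cross-entropy from SoftmaxWithLoss
```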
```python
    # Gradient calculation
    def gradient(self, x, t):
        # Forward propagation / loss calculation
        self.loss(x, t)

        # Error backpropagation
        dout = 1
        dout = self.lastLayer.backward(dout)  # backward of the last layer
        layers = list(self.layers.values())   # get the layers from the dictionary self.layers
        layers.reverse()                      # reverse their order
        for layer in layers:                  # go through the reversed layers
            dout = layer.backward(dout)       # run each layer's backward
        # .................. (collect dW and db into grads, shown in the full code below)
        return grads
```
And the gradient calculation stays exactly the same even if the combination of layers changes. Very convenient, isn't it? As an illustration, a three-layer variant is sketched below.
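Here is a hypothetical three-layer `__init__` (not from the book): `predict`, `loss` and the backward loop in `gradient` are untouched; only the parameter dictionary, the `grads` extraction at the end of `gradient`, and the update loop need the extra `'W3'`/`'b3'` keys.

```python
# Hypothetical sketch: a three-layer version of __init__. The forward loop, the loss
# and the backward loop work unchanged; only the grads extraction and the parameter
# update loop need the additional 'W3'/'b3' entries.
def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
    self.params = {}
    self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
    self.params['b1'] = np.zeros(hidden_size)
    self.params['W2'] = weight_init_std * np.random.randn(hidden_size, hidden_size)
    self.params['b2'] = np.zeros(hidden_size)
    self.params['W3'] = weight_init_std * np.random.randn(hidden_size, output_size)
    self.params['b3'] = np.zeros(output_size)

    self.layers = OrderedDict()
    self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
    self.layers['Relu1'] = Relu()
    self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
    self.layers['Relu2'] = Relu()
    self.layers['Affine3'] = Affine(self.params['W3'], self.params['b3'])
    self.lastLayer = SoftmaxWithLoss()
```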
Now let's run the version with layer generation. As before, `twoLayerNet.py` and `train_neuralnet.py` are combined into a single script for readability, and the graph-drawing part is added at the end.
```python
import sys, os
sys.path.append(os.pardir)           # Make files in the parent directory importable
from common.layers import *          # Use all layer classes defined in common/layers.py
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from collections import OrderedDict  # Import OrderedDict


class TwoLayerNet:

    # Initialization
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Parameter initialization
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # Layer generation
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        self.lastLayer = SoftmaxWithLoss()

    # Forward propagation
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    # Forward propagation / loss calculation
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)

    # Accuracy calculation
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # Gradient calculation
    def gradient(self, x, t):
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # Collect the gradients
        grads = {}
        grads['W1'], grads['b1'] = self.layers['Affine1'].dW, self.layers['Affine1'].db
        grads['W2'], grads['b2'] = self.layers['Affine2'].dW, self.layers['Affine2'].db
        return grads


# Data loading
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# Instantiate TwoLayerNet
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Initial settings
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
train_loss_list, train_acc_list, test_acc_list = [], [], []
iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient calculation
    grad = network.gradient(x_batch, t_batch)

    # Parameter update
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Accuracy display
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

# Drawing the graph
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()
```
The accuracy improves by about 3 points compared to the version without layer generation. The reason, of course, is not the layer generation itself: it is that the activation function changed from `sigmoid` to `ReLU`.
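If you want to check this, one way (a sketch, assuming `common/layers.py` also provides a `Sigmoid` layer, as in the book's repository) is to swap only the activation layer and rerun the training loop:

```python
# Hypothetical experiment: rebuild the same network with a Sigmoid layer in place of
# Relu and retrain, to confirm that the activation function is what changed.
# Assumes common/layers.py also provides a Sigmoid class, as in the book's repository.
from common.layers import Sigmoid

network_sig = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
network_sig.layers['Relu1'] = Sigmoid()   # keep the dictionary key, swap the layer object

# gradient() only reads dW/db from 'Affine1' and 'Affine2', so nothing else changes:
# rerun the training loop above with network_sig instead of network and compare.
```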