Chapters 5 to 7 seem to be the most important parts of this book, so I will describe each of them separately and in detail.
There are two ways to understand the backpropagation method:
・Understanding through mathematical formulas
・Understanding through computational graphs
This book explains the latter.
Computational graph: a graph that represents the process of a calculation.
Graph: a data structure made up of nodes and edges.
The figure below is the computational graph for buying an apple at 100 yen each with a 10% consumption tax.
Forward propagation: propagation from the start of the computational graph to its end.
Back propagation: propagation in the reverse direction of forward propagation.
Feature of computational graphs: the final result is obtained by propagating "local calculations". The figure above uses only apples, but the calculation becomes more complicated when other items are purchased. Even so, no matter what the graph as a whole is doing, each node can produce its result using only the information related to itself (the apple, in this example).
Advantage of computational graphs: derivatives can be computed efficiently by propagating in the reverse direction.
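As a concrete illustration, here is a minimal numeric sketch of the apple example above (one apple at 100 yen, 10% tax); the back propagation multiplies local derivatives from right to left:

# minimal sketch of the apple example (1 apple at 100 yen, 10% consumption tax)
apple_price, apple_num, tax = 100, 1, 1.1

# forward propagation: left to right through the graph
apple_total = apple_price * apple_num   # 100
price = apple_total * tax               # 110.0

# back propagation: start from d(price)/d(price) = 1 and go right to left
dprice = 1
dapple_total = dprice * tax             # 1.1  -> change in price per 1 yen of apple total
dtax = dprice * apple_total             # 100  -> change in price per unit of tax rate
dapple_price = dapple_total * apple_num # 1.1
dapple_num = dapple_total * apple_price # 110.0

print(price, dapple_price, dapple_num, dtax)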
Computing the gradient of the loss function with respect to the neural network's weight parameters by numerical differentiation takes a long time, so the error backpropagation method is used instead. Error backpropagation: a method for computing the gradients of the weight parameters efficiently.
Backpropagation is explained assuming a calculation y = f(x): the signal coming from downstream is multiplied by the local derivative ∂y/∂x and passed on upstream.
The derivatives of z = x + y are
\frac{\partial z}{\partial x} = 1 \\
\frac{\partial z}{\partial y} = 1
Expressed as a computational graph:
In code
class AddLayer:
    # constructor
    def __init__(self):
        self.x = None
        self.y = None

    # forward propagation: simply add the two inputs
    def forward(self, x, y):
        self.x = x
        self.y = y
        out = x + y
        return out

    # back propagation: the upstream derivative is passed through unchanged
    def backward(self, dout):
        dx = dout * 1
        dy = dout * 1
        return dx, dy
The derivatives of z = xy are
\frac{\partial z}{\partial x} = y \\
\frac{\partial z}{\partial y} = x
Expressed as a computational graph:
In code
class MulLayer:
    # constructor
    # self corresponds to `this` in Java
    def __init__(self):
        self.x = None
        self.y = None

    # forward propagation: multiply the two inputs (keep them for backward)
    def forward(self, x, y):
        self.x = x
        self.y = y
        out = x * y
        return out

    # back propagation: swap the inputs and multiply by the upstream derivative
    def backward(self, dout):
        dx = dout * self.y
        dy = dout * self.x
        return dx, dy
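As a usage sketch, the MulLayer and AddLayer classes above can be chained into a slightly larger purchase graph. The quantities below (2 apples at 100 yen, 3 oranges at 150 yen, 10% tax) are my own example values:

# forward: build the graph left to right
apple_price, apple_num = 100, 2
orange_price, orange_num = 150, 3
tax = 1.1

mul_apple = MulLayer()
mul_orange = MulLayer()
add_fruit = AddLayer()
mul_tax = MulLayer()

apple_total = mul_apple.forward(apple_price, apple_num)      # 200
orange_total = mul_orange.forward(orange_price, orange_num)  # 450
fruit_total = add_fruit.forward(apple_total, orange_total)   # 650
price = mul_tax.forward(fruit_total, tax)                    # 715.0

# backward: call backward in the reverse order of forward
dprice = 1
dfruit_total, dtax = mul_tax.backward(dprice)                # 1.1, 650
dapple_total, dorange_total = add_fruit.backward(dfruit_total)
dapple_price, dapple_num = mul_apple.backward(dapple_total)  # 2.2, 110
dorange_price, dorange_num = mul_orange.backward(dorange_total)

print(price, dapple_num, dapple_price, dorange_price, dorange_num, dtax)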
ReLU layer: the forward function and its derivative are
y = \left\{
\begin{array}{ll}
x & (x \gt 0) \\
0 & (x \leq 0)
\end{array}
\right.
\frac{\partial y}{\partial x} = \left\{
\begin{array}{ll}
1 & (x \gt 0) \\
0 & (x \leq 0)
\end{array}
\right.
In code
# ReLU layer
class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)   # boolean mask of the elements to zero out
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0    # the gradient does not flow where the input was <= 0
        dx = dout
        return dx
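A quick usage sketch with a small NumPy array (the values are my own example) shows how the mask blocks the gradient wherever the input was less than or equal to zero:

import numpy as np

x = np.array([[1.0, -0.5], [-2.0, 3.0]])
relu = Relu()

out = relu.forward(x)        # [[1.0, 0.0], [0.0, 3.0]]
dout = np.ones_like(x)
dx = relu.backward(dout)     # [[1.0, 0.0], [0.0, 1.0]] -> gradient blocked where x <= 0
print(out)
print(dx)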
Sigmoid layer: the sigmoid function is
y=\frac{1}{1+\exp(-x)}
Supplement for the computational graph (local derivatives of each node):
\begin{align}
f(x)&=-x \\
\Rightarrow f'(x)&=-1\\
f(x)&=\exp(x) \\
\Rightarrow f'(x)&=\exp(x)\\
f(x)&=x+1 \\
\Rightarrow f'(x)&=1\\
f(x)&=1/x \\
\Rightarrow f'(x)&=-1/x^2=-f(x)^2\\
\end{align}
The backward output of the sigmoid's computational graph simplifies as follows:
\begin{align}
\frac{\partial L}{\partial y}y^2\exp(-x) &=\frac{\partial L}{\partial y}y\frac{\exp(-x)}{1+\exp(-x)} \\
&=\frac{\partial L}{\partial y}y(1-y) \\
\end{align}
In code
# Sigmoid layer
import numpy as np

class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out   # keep the forward output for use in backward
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out   # dL/dx = dL/dy * y(1 - y)
        return dx
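As a small sanity check (with my own example values), the layer's backward output can be compared against a centered numerical derivative of the sigmoid:

import numpy as np

x = np.array([0.5, -1.0, 2.0])
sig = Sigmoid()
sig.forward(x)
analytic = sig.backward(np.ones_like(x))      # y * (1 - y)

h = 1e-4
numeric = (1/(1 + np.exp(-(x + h))) - 1/(1 + np.exp(-(x - h)))) / (2*h)
print(np.max(np.abs(analytic - numeric)))     # should be tiny (around 1e-9 or smaller)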
Affine layer: the forward pass computes Y = X·W + B. In the backward pass, ∂L/∂X = ∂L/∂Y · W^T, ∂L/∂W = X^T · ∂L/∂Y, and ∂L/∂B is the sum of ∂L/∂Y over the batch axis (the first two are derived below for N = 1). In code
# Batch version of the Affine layer
import numpy as np

class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)        # dL/dX = dL/dY . W^T
        self.dW = np.dot(self.x.T, dout)   # dL/dW = X^T . dL/dY
        self.db = np.sum(dout, axis=0)     # dL/db = sum of dL/dY over the batch
        return dx
Proof (the simple case N = 1, with two inputs x_1, x_2 and three outputs y_1, y_2, y_3)
\begin{align}
\frac{\partial L}{\partial Y} \cdot W^T&=
\bigr(\begin{matrix}
\frac{\partial L}{\partial y_1} &
\frac{\partial L}{\partial y_2}&
\frac{\partial L}{\partial y_3}
\end{matrix}\bigr)
\Biggl(
\begin{matrix}w_{11} & w_{21} \\
w_{12} & w_{22} \\
w_{13} & w_{23}
\end{matrix}\Biggr)\\
&=\bigl(\begin{matrix}
\frac{\partial L}{\partial y_1}w_{11}+
\frac{\partial L}{\partial y_2}w_{12}+
\frac{\partial L}{\partial y_3}w_{13} &
\frac{\partial L}{\partial y_1}w_{21}+
\frac{\partial L}{\partial y_2}w_{22}+
\frac{\partial L}{\partial y_3}w_{23}
\end{matrix}\bigr)\\
&=\bigl(\begin{matrix}
\frac{\partial L}{\partial y_1}\frac{\partial y_1}{\partial x_1}
+\frac{\partial L}{\partial y_2}\frac{\partial y_2}{\partial x_1}
+\frac{\partial L}{\partial y_3}\frac{\partial y_3}{\partial x_1} &
\frac{\partial L}{\partial y_1}\frac{\partial y_1}{\partial x_2}
+\frac{\partial L}{\partial y_2}\frac{\partial y_2}{\partial x_2}
+\frac{\partial L}{\partial y_3}\frac{\partial y_3}{\partial x_2}
\end{matrix}\bigr)\\
&=\bigl(
\begin{matrix}
\frac{\partial L}{\partial Y}\frac{\partial Y}{\partial x_1} &
\frac{\partial L}{\partial Y}\frac{\partial Y}{\partial x_2}
\end{matrix}\bigr)\\
&=\frac{\partial L}{\partial X}\\
X^T \cdot \frac{\partial L}{\partial Y}
&=\Bigl(\begin{matrix}
x_1\\
x_2
\end{matrix}\Bigr)
\cdot
\bigr(\begin{matrix}
\frac{\partial L}{\partial y_1} &
\frac{\partial L}{\partial y_2} &
\frac{\partial L}{\partial y_3}
\end{matrix}\bigr)\\
&=
\bigr(\begin{matrix}
x_1\frac{\partial L}{\partial y_1} &
x_1\frac{\partial L}{\partial y_2} &
x_1\frac{\partial L}{\partial y_3}\\
x_2\frac{\partial L}{\partial y_1} &
x_2\frac{\partial L}{\partial y_2} &
x_2\frac{\partial L}{\partial y_3}
\end{matrix}\bigr)\\
&=
\bigr(\begin{matrix}
\frac{\partial L}{\partial y_1}x_1 &
\frac{\partial L}{\partial y_2}x_1 &
\frac{\partial L}{\partial y_3}x_1\\
\frac{\partial L}{\partial y_1}x_2 &
\frac{\partial L}{\partial y_2}x_2 &
\frac{\partial L}{\partial y_3}x_2
\end{matrix}\bigr)\\
&=
\bigr(\begin{matrix}
\frac{\partial L}{\partial y_1}\frac{\partial y_1}{\partial w_{11}} &
\frac{\partial L}{\partial y_2}\frac{\partial y_2}{\partial w_{12}} &
\frac{\partial L}{\partial y_3}\frac{\partial y_3}{\partial w_{13}}\\
\frac{\partial L}{\partial y_1}\frac{\partial y_1}{\partial w_{21}} &
\frac{\partial L}{\partial y_2}\frac{\partial y_2}{\partial w_{22}} &
\frac{\partial L}{\partial y_3}\frac{\partial y_3}{\partial w_{23}}
\end{matrix}\bigr)\\
&=
\bigr(\begin{matrix}
\frac{\partial L}{\partial w_{11}} &
\frac{\partial L}{\partial w_{12}} &
\frac{\partial L}{\partial w_{13}}\\
\frac{\partial L}{\partial w_{21}} &
\frac{\partial L}{\partial w_{22}} &
\frac{\partial L}{\partial w_{23}}
\end{matrix}\bigr)\\
&=\frac{\partial L}{\partial W}\\
\text{Calculation supplement:}\\
Y&=X \cdot W+B\\
y_i&=x_1w_{1i}+x_2w_{2i}+b_i\\
\text{Example: }\frac{\partial y_3}{\partial w_{23}}&=x_2
\end{align}
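A small numerical sketch of the N = 1 case above (the concrete numbers are my own) confirms, for example, that ∂L/∂w_23 = (∂L/∂y_3)·x_2:

import numpy as np

# shapes as in the proof: X is (1, 2), W is (2, 3), Y is (1, 3)
X = np.array([[1.0, 2.0]])            # x_1, x_2
W = np.random.randn(2, 3)
b = np.zeros(3)
Y = np.dot(X, W) + b                  # shape (1, 3)

dY = np.array([[0.1, 0.2, 0.3]])      # pretend dL/dY coming from downstream
dX = np.dot(dY, W.T)                  # dL/dX, shape (1, 2)
dW = np.dot(X.T, dY)                  # dL/dW, shape (2, 3)

# dL/dw_23 = (dL/dy_3) * x_2 = 0.3 * 2.0
print(dW[1, 2])                       # 0.6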
The backward output of the Softmax-with-Loss layer is (y_1 - t_1, y_2 - t_2, y_3 - t_3), where (y_1, y_2, y_3) is the output of the Softmax layer and (t_1, t_2, t_3) is the teacher data. In other words, the gradient passed backward is simply the difference between the Softmax output and the teacher label.
# SoftmaxWithLoss layer
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None   # loss value
        self.y = None      # output of softmax
        self.t = None      # teacher data (one-hot)

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        # the gradient is (y - t), averaged over the batch
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx
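The class above relies on softmax and cross_entropy_error helpers (in the book's repository they live in common/functions.py). Minimal simplified sketches, assuming 2-D batched inputs with one-hot labels, might look like this:

import numpy as np

def softmax(x):
    # subtract the row-wise max for numerical stability
    x = x - np.max(x, axis=-1, keepdims=True)
    return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

def cross_entropy_error(y, t):
    # assumes y and t are 2-D with one-hot teacher labels t of the same shape
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + 1e-7)) / batch_size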
A neural network can be built simply by combining the layers above like Lego blocks. I have added some comments to the source below.
# coding: utf-8
import sys, os
sys.path.append(os.pardir) #Settings for importing files in the parent directory
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict
class TwoLayerNet:

    #-------------------------------------------------
    # __init__: initialize
    # @self
    # @input_size: number of neurons in the input layer
    # @hidden_size: number of neurons in the hidden layer
    # @output_size: number of neurons in the output layer
    # @weight_init_std: scale of the Gaussian distribution used for weight initialization
    #-------------------------------------------------
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # params: dictionary that holds the neural network parameters
        # weight initialization
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # layers: *ordered* dictionary that holds the layers of the network
        # Layer generation: the point is to store the layers in order.
        # Forward propagation can then simply call the layers in this order,
        # and back propagation calls them in reverse order.
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])

        # last layer of the neural network: here the SoftmaxWithLoss layer
        self.lastLayer = SoftmaxWithLoss()

    #-------------------------------------------------
    # predict: perform recognition (inference)
    # @self
    # @x: image data (input data)
    #-------------------------------------------------
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    #-------------------------------------------------
    # loss: compute the loss function
    # @self
    # @x: image data (input data)
    # @t: teacher data
    #-------------------------------------------------
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)

    #-------------------------------------------------
    # accuracy: compute recognition accuracy
    # @self
    # @x: image data (input data)
    # @t: teacher data
    #-------------------------------------------------
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1: t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    #-------------------------------------------------
    # numerical_gradient: compute the gradient of the weight parameters
    #                     by numerical differentiation (same as up to Chapter 4)
    # @self
    # @x: image data (input data)
    # @t: teacher data
    #-------------------------------------------------
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads

    #-------------------------------------------------
    # gradient: compute the gradient of the weight parameters by backpropagation
    # @self
    # @x: image data (input data)
    # @t: teacher data
    #-------------------------------------------------
    def gradient(self, x, t):
        # point: the propagation is actually carried out by the layers implemented above
        # forward propagation
        self.loss(x, t)

        # back propagation
        dout = 1
        dout = self.lastLayer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # collect the gradients
        grads = {}
        grads['W1'], grads['b1'] = self.layers['Affine1'].dW, self.layers['Affine1'].db
        grads['W2'], grads['b2'] = self.layers['Affine2'].dW, self.layers['Affine2'].db
        return grads
The following script is a gradient check: it confirms that the gradients obtained by backpropagation agree, up to a tiny numerical error, with the gradients obtained by numerical differentiation.
# coding: utf-8
import sys, os
sys.path.append(os.pardir) #Settings for importing files in the parent directory
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet
#Data reading
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
x_batch = x_train[:3]
t_batch = t_train[:3]
grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)
for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))
Execution result in my environment:
W1: 2.61413510374e-13 (≈ 2.61 × 10^-13)
W2: 1.04099504538e-12 (≈ 1.04 × 10^-12)
b1: 9.1090807423e-13 (≈ 9.11 × 10^-13)
b2: 1.20348173094e-10 (≈ 1.20 × 10^-10)
The following script performs mini-batch training: it iteratively updates the weights and biases using the gradients obtained by backpropagation.
# coding: utf-8
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet
#Data reading
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
train_loss_list = []
train_acc_list = []
test_acc_list = []
iter_per_epoch = max(train_size / batch_size, 1)
for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # gradient (by backpropagation; the numerical version is left as a comment)
    # grad = network.numerical_gradient(x_batch, t_batch)
    grad = network.gradient(x_batch, t_batch)

    # update the parameters
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)
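The script records train_acc_list and test_acc_list but never uses them; a minimal plotting sketch (assuming matplotlib is installed) to visualize them as a continuation of the script above could look like this:

import numpy as np
import matplotlib.pyplot as plt

# plot training / test accuracy per epoch
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()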