I'm reading the masterpiece **"Deep Learning from Zero"**. This post is my memo for Chapter 4. To run the code, download the complete code from GitHub and use Jupyter Notebook in the ch04 folder.
At the end of Chapter 4 there is code (`train_neuralnet.py`) that trains a two-layer neural network by computing the gradients of its parameters with numerical differentiation. This time I'll run that code and then walk through the details. However, my code is not identical to the one on GitHub; I made the following modifications and additions.
**1) Working around the slow execution speed**
Numerical differentiation takes a huge amount of time, so if accuracy were displayed every 600 iterations (every 1 epoch), the second accuracy value would only appear after several hours. To see results in a little over an hour, I changed the accuracy display to every 1 iteration, the iteration count from 10000 to 100, and the learning rate from 0.1 to 1.0.
**2) Reducing external calls to improve readability**
A characteristic of the book is that code that has already been explained is called externally as much as possible, so that each listing stays as concise as possible. However, having `train_neuralnet.py` call `two_layer_net.py`, which in turn calls `gradient.py` in the common folder, is quite confusing. So I folded that code into `train_neuralnet.py` to make it easier to follow.
Only `functions.py` in the common folder is still used as external code (sigmoid, softmax, cross_entropy_error, etc.).
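For reference, these helpers look roughly like the following (a sketch only, not the exact code in `functions.py`):

```python
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x):
    x = x - np.max(x, axis=-1, keepdims=True)  # subtract the row max for numerical stability
    return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

def cross_entropy_error(y, t):
    if y.ndim == 1:                  # treat a single sample as a batch of one
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    if t.size == y.size:             # if t is one-hot, convert it to label indices
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
```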
Let's execute the following code first.
import sys, os
sys.path.append(os.pardir)  # Setting for importing files in the parent directory
from common.functions import *  # Use all functions in functions.py in the common folder
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist


class TwoLayerNet:

    # Parameter initialization
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    # Forward propagation
    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        return y

    # Loss calculation
    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)

    # Accuracy calculation
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # Gradient calculation
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_gradient2(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient2(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient2(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient2(loss_W, self.params['b2'])
        return grads


# Numerical differentiation
def numerical_gradient2(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = tmp_val + h
        fxh1 = f(x)  # f(x+h)

        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2*h)

        x[idx] = tmp_val  # Restore the value
        it.iternext()

    return grad


# Data loading
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# Instantiate TwoLayerNet
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Initial settings
iters_num = 100  # Iteration count changed from 10000 to 100
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 1  # Learning rate changed from 0.1 to 1.0
train_loss_list, train_acc_list, test_acc_list = [], [], []
iter_per_epoch = 1  # Accuracy display changed from every 1 epoch to every 1 iter

for i in range(iters_num):
    # Get mini-batch data
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient calculation
    grad = network.numerical_gradient(x_batch, t_batch)

    # Parameter update
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Accuracy display
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        # Display iter, train_loss, train_acc, and test_acc
        print('[iter='+str(i)+'] '+'train_loss='+str(loss)+', '+'train_acc='+str(train_acc)+', '+'test_acc='+str(test_acc))

# Drawing the graph
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("iter")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()
It took 75 minutes to run 100 iterations on my MacBook Air. It makes me wonder how the book managed to include an accuracy graph for 16 epochs (= 9600 iterations) computed with numerical differentiation. In any case, numerical differentiation takes an enormous amount of time.
# Parameter initialization
def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
    self.params = {}
    self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
    self.params['b1'] = np.zeros(hidden_size)
    self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
    self.params['b2'] = np.zeros(output_size)
This part is executed only once, when the class is instantiated, and initializes each parameter. `np.random.randn()` generates samples from a normal distribution with mean 0 and variance 1, and `np.zeros()` generates a zero matrix. The size of each matrix is as follows.
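They can be confirmed with a quick check (a minimal sketch using the TwoLayerNet class defined above):

```python
net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
for key, value in net.params.items():
    print(key, value.shape)
# W1 (784, 50)
# b1 (50,)
# W2 (50, 10)
# b2 (10,)
```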
# Forward propagation
def predict(self, x):
    W1, W2 = self.params['W1'], self.params['W2']
    b1, b2 = self.params['b1'], self.params['b2']
    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    y = softmax(a2)
    return y
This is the forward propagation part. The weight matrices 'W1' and 'W2' and the biases 'b1' and 'b2' stored in the dictionary `params` are read out, and the input is propagated forward by matrix operations.
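As a quick sanity check (a minimal sketch, assuming the full listing above has been run), each row of the output y is a probability distribution over the 10 classes:

```python
x_sample = x_train[:5]                # 5 MNIST images, shape (5, 784)
y_sample = network.predict(x_sample)  # shape (5, 10)
print(y_sample.sum(axis=1))           # each row sums to (approximately) 1.0
```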
# Loss calculation
def loss(self, x, t):
    y = self.predict(x)
    return cross_entropy_error(y, t)
This is the part that calculates the loss. It computes the cross entropy between y, obtained by the forward propagation above, and the teacher data t. The cross entropy uses the function in `functions.py` in the common folder. For reference, `y[np.arange(batch_size), t]` slices out of y the element corresponding to the correct label, in the order given by t.
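A toy example of this fancy indexing (hypothetical values, with t already converted to label indices):

```python
y = np.array([[0.1, 0.7, 0.2],
              [0.8, 0.1, 0.1]])       # predicted probabilities for 2 samples
t = np.array([1, 0])                  # correct labels as indices
batch_size = y.shape[0]
print(y[np.arange(batch_size), t])    # [0.7 0.8]: probability assigned to each correct class
print(-np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size)  # about 0.29
```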
# Accuracy calculation
def accuracy(self, x, t):
    y = self.predict(x)
    y = np.argmax(y, axis=1)
    t = np.argmax(t, axis=1)
    accuracy = np.sum(y == t) / float(x.shape[0])
    return accuracy
This is the accuracy calculation part. The input data x is used to infer y, the index of the maximum element is extracted from both y and the correct labels t, and the number of samples where the two indices match is divided by the number of samples in x to obtain the accuracy.
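A toy example of this comparison (hypothetical values):

```python
y = np.array([[0.1, 0.8, 0.1],
              [0.3, 0.3, 0.4]])          # predictions for 2 samples
t = np.array([[0, 1, 0],
              [1, 0, 0]])                # one-hot labels
acc = np.sum(np.argmax(y, axis=1) == np.argmax(t, axis=1)) / float(y.shape[0])
print(acc)  # 0.5: the first sample is correct, the second is not
```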
# Gradient calculation
def numerical_gradient(self, x, t):
    loss_W = lambda W: self.loss(x, t)
    grads = {}
    grads['W1'] = numerical_gradient2(loss_W, self.params['W1'])
    grads['b1'] = numerical_gradient2(loss_W, self.params['b1'])
    grads['W2'] = numerical_gradient2(loss_W, self.params['W2'])
    grads['b2'] = numerical_gradient2(loss_W, self.params['b2'])
    return grads
This is the gradient calculation part. It uses the `numerical_gradient2` function that appears below and collects the gradient results for each parameter in a dictionary. The arguments are the loss function (a lambda that computes the cross entropy from x and the correct labels t) and the parameter whose gradient is to be taken. Note that the lambda ignores its argument W: `numerical_gradient2` perturbs the parameter array in place, so calling `self.loss(x, t)` again automatically uses the perturbed values.
By the way, the original code reads `grads['W1'] = numerical_gradient(loss_W, self.params['W1'])`, so you might wonder whether this is a recursive call. I thought so at first, but it isn't. In the original, `two_layer_net.py` has `from common.gradient import numerical_gradient` at the top, so the `numerical_gradient` function from `gradient.py` in the common folder is imported and used there. Since giving it a different name avoids mistakes, I renamed it `numerical_gradient2`.
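A minimal illustration of why this works even though the lambda ignores its argument (a hypothetical two-element parameter):

```python
params = {'W': np.array([1.0, 2.0])}
loss = lambda W: np.sum(params['W'] ** 2)    # ignores W and reads params['W'] directly
g = numerical_gradient2(loss, params['W'])   # perturbs params['W'] in place, element by element
print(g)  # approximately [2. 4.], the gradient of sum(W**2)
```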
# Numerical differentiation
def numerical_gradient2(f, x):
    h = 1e-4  # 0.0001

    # Prepare a zero matrix grad (same shape as x) to store the gradient results
    grad = np.zeros_like(x)

    # Iterate over the indices (row, column) of the matrix x one by one
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])

    # Continue until every row and column of x has been visited
    while not it.finished:
        idx = it.multi_index  # assign (row, column) to idx
        tmp_val = x[idx]      # save the value of x at idx in tmp_val

        # Add the small value h and compute the loss by forward propagation
        x[idx] = tmp_val + h
        fxh1 = f(x)  # f(x+h)

        # Subtract the small value h and compute the loss by forward propagation
        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)

        # Compute the gradient for this index
        grad[idx] = (fxh1 - fxh2) / (2*h)

        x[idx] = tmp_val  # restore the saved value
        it.iternext()     # move to the next index

    return grad
This is the part that performs numerical differentiation. Simply put, for each parameter the small value h is added and the loss is computed by forward propagation, then h is subtracted and the loss is computed again, and the gradient is determined from how the loss changed between the two calculations.
This time, the number of parameters is 784 * 50 = 39,200 for W1, 50 * 10 = 500 for W2, 50 for b1, and 10 for b2, for a total of 39,760. Since forward propagation and the loss are computed twice for each parameter, a single parameter update of the network requires 79,520 forward-propagation/loss calculations. No wonder the operation is extremely slow.
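A quick check of that parameter count (assuming the `network` instantiated in the listing above):

```python
total = sum(p.size for p in network.params.values())
print(total)  # 39760 = 784*50 + 50 + 50*10 + 10
```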
Now, the code uses `np.nditer`, which I wasn't familiar with. Normally, iterating over a matrix's indices requires a double loop: a for loop over the rows and another over the columns, but `np.nditer` can do it with a single loop.
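A minimal sketch of how `np.nditer` with `multi_index` visits every element of a 2-D array in one loop (hypothetical 2x3 array):

```python
x = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])
it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
while not it.finished:
    print(it.multi_index, x[it.multi_index])  # (0, 0) 1.0, (0, 1) 2.0, ..., (1, 2) 6.0
    it.iternext()
```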
As explained earlier, this code is originally in `gradient.py` in the common folder. Its original name, `numerical_gradient`, is easy to confuse with the method of the same name used in the gradient calculation, so this time I renamed it `numerical_gradient2`.
# Data loading
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# Instantiate TwoLayerNet
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Initial settings
iters_num = 100  # Iteration count changed from 10000 to 100
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 1  # Learning rate changed from 0.1 to 1.0
train_loss_list, train_acc_list, test_acc_list = [], [], []
iter_per_epoch = 1  # Accuracy display changed from every 1 epoch to every 1 iter
After loading the data, the class `TwoLayerNet` is instantiated. Since input_size=784, hidden_size=50, and output_size=10, the model and its matrix operations are as follows.
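The shapes can be traced through one forward pass (a minimal sketch, assuming the listing above has been run, with a mini-batch of 100 images):

```python
x = x_train[:100]                                             # (100, 784): 100 flattened 28x28 images
a1 = np.dot(x, network.params['W1']) + network.params['b1']   # (100, 784) @ (784, 50) -> (100, 50)
z1 = sigmoid(a1)                                              # (100, 50)
a2 = np.dot(z1, network.params['W2']) + network.params['b2']  # (100, 50) @ (50, 10) -> (100, 10)
y = softmax(a2)                                               # (100, 10): class probabilities per image
print(a1.shape, z1.shape, a2.shape, y.shape)
```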
for i in range(iters_num):
    # Get mini-batch data
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient calculation
    grad = network.numerical_gradient(x_batch, t_batch)

    # Parameter update
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Accuracy display
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        # Display iter, train_loss, train_acc, and test_acc
        print('[iter='+str(i)+'] '+'train_loss='+str(loss)+', '+'train_acc='+str(train_acc)+', '+'test_acc='+str(test_acc))
First, the data for mini-batch learning is prepared. `np.random.choice(train_size, batch_size)` randomly selects 100 indices out of the 60,000 training samples and assigns them to `batch_mask`, which is then used to pick out the training data and the corresponding correct labels.
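A quick look at what this selection produces (the actual indices are random on every run):

```python
batch_mask = np.random.choice(60000, 100)                    # 100 random indices in 0..59999
print(batch_mask[:5])                                        # e.g. [12034   567 48291  3310 59002]
print(x_train[batch_mask].shape, t_train[batch_mask].shape)  # (100, 784) (100, 10)
```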
Next comes the gradient calculation. The `numerical_gradient` method of the `TwoLayerNet` class is called and, as explained earlier, the gradient of each parameter is found by numerical differentiation.
The parameters are then updated with the calculated gradients, the loss is computed and recorded, and the accuracy is displayed.
#Drawing a graph
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("iter")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()
After the learning is completed, the accuracy graph will be displayed. This doesn't need any explanation.