This article is an easy-to-understand summary of **Deep Learning from Scratch, Chapter 7: Learning Techniques**. I come from a humanities background and was still able to follow it, so I hope it reads comfortably. I would also be delighted if you used it as a reference while studying this book.
Until now, the initial weights of the neural network were simply generated as random numbers, but the success of learning can vary widely depending on those values.
The initial weights and the learning of a neural network are very closely related: if the initial values are appropriate, the learning results are good, and if they are inappropriate, the results are poor.
Therefore, this time I would like to implement a way of setting appropriate initial weights in a neural network that uses the sigmoid function.
The initial value best suited to a neural network that uses the sigmoid function is the Xavier initial value.
```python
scale = np.sqrt(1.0 / all_size_list[idx - 1])
scale * np.random.randn(all_size_list[idx-1], all_size_list[idx])
```
The Xavier initial value is created by taking the square root of 1 divided by the number of nodes in the previous layer, and multiplying that scale by standard normal random numbers.
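As a quick check, here is a minimal sketch (the 100-node layer size and the comparison with the He value are my own additions, not from the book) that builds Xavier- and He-scaled weights for one layer and prints their standard deviations:

```python
import numpy as np

node_num = 100  # number of nodes in the previous layer (arbitrary example)

# Xavier: standard deviation sqrt(1 / n) -- suited to the sigmoid function
w_xavier = np.sqrt(1.0 / node_num) * np.random.randn(node_num, node_num)

# He: standard deviation sqrt(2 / n) -- suited to ReLU (also used in the class below)
w_he = np.sqrt(2.0 / node_num) * np.random.randn(node_num, node_num)

print(w_xavier.std())  # roughly 0.1  (= sqrt(1/100))
print(w_he.std())      # roughly 0.14 (= sqrt(2/100))
```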
Below is a sample neural network that can use both the He and Xavier initial values.
```python
import numpy as np
from collections import OrderedDict
# Affine, Relu, Sigmoid and SoftmaxWithLoss are assumed to come from the book's
# common/layers.py (the "Deep Learning from Scratch" repository).
from common.layers import Affine, Relu, Sigmoid, SoftmaxWithLoss


# Neural network that applies the initial weight values and implements Weight decay
class MultiLayerNet:
    def __init__(self, input_size, hidden_size_list, output_size,
                 activation='relu', weight_init_std='relu', weight_decay_lambda=0):
        # The larger weight_decay_lambda is, the stronger the Weight decay
        self.input_size = input_size                    # number of neurons in the input layer
        self.output_size = output_size                  # number of neurons in the output layer
        self.hidden_size_list = hidden_size_list        # number of neurons in each hidden layer
        self.hidden_layer_num = len(hidden_size_list)   # number of hidden layers
        self.weight_decay_lambda = weight_decay_lambda  # strength of Weight decay
        self.params = {}                                # parameters

        # Weight initialization
        self.__init_weight(weight_init_std)

        # Layer creation
        activation_layer = {'sigmoid': Sigmoid, 'relu': Relu}
        self.layers = OrderedDict()  # ordered dictionary that stores the layers
        for idx in range(1, self.hidden_layer_num + 1):  # repeat for the number of hidden layers
            self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)],
                                                      self.params['b' + str(idx)])
            # select the activation layer (Relu or Sigmoid)
            self.layers['Activation_function' + str(idx)] = activation_layer[activation]()
        idx = self.hidden_layer_num + 1  # Affine layer just before the output layer
        self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)],
                                                  self.params['b' + str(idx)])
        self.last_layer = SoftmaxWithLoss()  # layer from the output layer to the loss function

    def __init_weight(self, weight_init_std):  # initialize the weights and biases
        # number of neurons in every layer, from input to output
        all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size]
        for idx in range(1, len(all_size_list)):
            scale = weight_init_std  # factor to multiply the random weights by
            if str(weight_init_std).lower() in ('relu', 'he'):
                scale = np.sqrt(2.0 / all_size_list[idx - 1])  # He value: recommended when using ReLU
            elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
                scale = np.sqrt(1.0 / all_size_list[idx - 1])  # Xavier value: recommended when using sigmoid
            self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx - 1], all_size_list[idx])  # weight initialization
            self.params['b' + str(idx)] = np.zeros(all_size_list[idx])  # bias initialization

    def predict(self, x):  # forward propagation of the network
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):  # forward propagation up to the loss function + Weight decay
        y = self.predict(x)
        weight_decay = 0
        # Weight decay: add 0.5 * lambda * (sum of squared weights) for every layer
        for idx in range(1, self.hidden_layer_num + 2):
            W = self.params['W' + str(idx)]
            weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W ** 2)
        return self.last_layer.forward(y, t) + weight_decay

    def accuracy(self, x, t):  # compute the accuracy
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, x, t):  # gradients by numerical differentiation
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        for idx in range(1, self.hidden_layer_num + 2):
            # slopeing_grad is the numerical-gradient helper defined elsewhere in this series
            grads['W' + str(idx)] = slopeing_grad(loss_W, self.params['W' + str(idx)])
            grads['b' + str(idx)] = slopeing_grad(loss_W, self.params['b' + str(idx)])
        return grads

    def gradient(self, x, t):  # gradients by backpropagation
        # forward
        self.loss(x, t)
        # backward
        dout = 1
        dout = self.last_layer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
        # collect the gradients (the weight gradients also include the Weight decay term)
        grads = {}
        for idx in range(1, self.hidden_layer_num + 2):
            grads['W' + str(idx)] = (self.layers['Affine' + str(idx)].dW
                                     + self.weight_decay_lambda * self.layers['Affine' + str(idx)].W)
            grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db
        return grads
```
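As a usage sketch, here is one training step of a sigmoid network with the Xavier initial values and Weight decay. The layer sizes, the dummy data, and the learning rate are made-up values for illustration, not taken from the book:

```python
# Dummy data shaped like MNIST: 100 samples of 784 features, 10 one-hot classes
x = np.random.rand(100, 784)
t = np.eye(10)[np.random.randint(0, 10, 100)]

network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100], output_size=10,
                        activation='sigmoid', weight_init_std='xavier',
                        weight_decay_lambda=0.1)

grads = network.gradient(x, t)               # gradients by backpropagation (Weight decay included)
for key in network.params.keys():
    network.params[key] -= 0.1 * grads[key]  # plain SGD update with learning rate 0.1

print(network.loss(x, t), network.accuracy(x, t))
```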