#Forward propagation (single layer / multiple units)
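The snippets on this page assume that numpy, the course's `common.functions` module, and a `print_vec` helper are already available in the notebook. A minimal stand-in so the block below runs on its own (assuming `print_vec` simply prints a label followed by the array):

```python
import numpy as np
from common import functions  # course module providing sigmoid, relu, etc.

def print_vec(text, vec):
    # Assumed helper: print a label, then the array contents
    print("*** " + text + " ***")
    print(vec)
    print("")
```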
#weight
W = np.array([
[0.1, 0.2, 0.3],
[0.2, 0.3, 0.4],
[0.3, 0.4, 0.5],
[0.4, 0.5, 0.6]
])
## Let's try: array initialization
#W = np.zeros((4,3))
#W = np.ones((4,3))
#W = np.random.rand(4,3)
#W = np.random.randint(5, size=(4,3))
print_vec("weight", W)
#bias
b = np.array([0.1, 0.2, 0.3])
print_vec("bias", b)
#Input value
x = np.array([1.0, 5.0, 2.0, -1.0])
print_vec("input", x)
#Total input
u = np.dot(x, W) + b
print_vec("Total input", u)
#Intermediate layer output
z = functions.sigmoid(u)
print_vec("Intermediate layer output", z)
The following lines of the code correspond to the total-input formula $u = xW + b$:
u = np.dot(x, W) + b
print_vec("Total input", u)
- The input layer receives the data.
- The weights determine which inputs are used and how strongly they contribute.
- Both the input and the weights can be represented as matrices (vectors).
- The bias shifts the whole total input.
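As a shape check for the snippet above (treating the vectors as row vectors):

$$
u = xW + b, \qquad (1 \times 4)\,(4 \times 3) + (1 \times 3) \;\rightarrow\; (1 \times 3)
$$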
import numpy as np
from common import functions
"""
# functions.py
import numpy as np
def relu(x):
    return np.maximum(0, x)
"""
#Set weights and biases
# Create the network
def init_network():
    print("#####Network initialization#####")
    network = {}

    network['W1'] = np.array([
        [0.1, 0.3, 0.5],
        [0.2, 0.4, 0.6]
    ])
    network['W2'] = np.array([
        [0.1, 0.4],
        [0.2, 0.5],
        [0.3, 0.6]
    ])
    network['W3'] = np.array([
        [0.1, 0.3],
        [0.2, 0.4]
    ])
    network['b1'] = np.array([0.1, 0.2, 0.3])
    network['b2'] = np.array([0.1, 0.2])
    network['b3'] = np.array([1, 2])

    print_vec("Weight 1", network['W1'])
    print_vec("Weight 2", network['W2'])
    print_vec("Weight 3", network['W3'])
    print_vec("Bias 1", network['b1'])
    print_vec("Bias 2", network['b2'])
    print_vec("Bias 3", network['b3'])

    return network
# Create the forward-propagation process
# x: input values
def forward(network, x):
    print("#####Start forward propagation#####")

    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']

    # Total input of layer 1
    u1 = np.dot(x, W1) + b1
    # Total output of layer 1
    z1 = functions.relu(u1)
    # Total input of layer 2
    u2 = np.dot(z1, W2) + b2
    # Total output of layer 2
    z2 = functions.relu(u2)
    # Total input of the output layer
    u3 = np.dot(z2, W3) + b3
    # Total output of the output layer
    y = u3

    print_vec("Total input 1", u1)
    print_vec("Intermediate layer output 1", z1)
    print_vec("Total input 2", u2)
    print_vec("Output 1", z1)
    print("Output total: " + str(np.sum(z1)))

    return y, z1, z2
#Input value
x = np.array([1., 2.])
print_vec("input", x)
#Network initialization
network = init_network()
y, z1, z2 = forward(network, x)
Because of the activation function, some of the outputs are weakened while others are propagated strongly, so the features are propagated more effectively.
import numpy as np
def step(x):
    return np.where(x > 0, 1, 0)
- A function that fires when the input exceeds the threshold; the output is always 0 or 1.
- It cannot express values between 0 and 1, so only linearly separable problems can be learned.
import numpy as np
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
- A function that varies smoothly between 0 and 1. Unlike the step function, which only expresses ON/OFF, it can convey the strength of the signal, which helped trigger the spread of neural networks.
- Because it changes smoothly, it can be differentiated.
- For large inputs the change in the output becomes very small, which can cause the vanishing gradient problem. Also, since the output is never exactly 0, computational resources are always consumed.
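To see concretely why large inputs lead to vanishing gradients, here is a small sketch of the sigmoid derivative (a stand-in for the `functions.d_sigmoid` used in the backpropagation code below, assuming it is the standard derivative $\sigma'(x)=\sigma(x)(1-\sigma(x))$):

```python
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def d_sigmoid(x):
    # Standard sigmoid derivative: peaks at 0.25 (x = 0) and shrinks toward 0 for large |x|
    s = sigmoid(x)
    return s * (1 - s)

print(d_sigmoid(np.array([0.0, 2.0, 10.0])))  # approx. [0.25, 0.105, 0.000045]
```

Multiplying many such small values together across layers is what makes the gradient vanish.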
import numpy as np
def relu(x):
    return np.maximum(0, x)
- The most widely used activation function today.
- It avoids the vanishing gradient problem, and because the output is exactly 0 for non-positive inputs it contributes to sparsification, which gives good results.
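By contrast, the ReLU gradient is either exactly 1 or exactly 0, which is why it does not shrink the gradient for active units and produces exact zeros for inactive ones. A sketch (a stand-in for the `functions.d_relu` referenced in the backpropagation code later):

```python
import numpy as np

def relu(x):
    return np.maximum(0, x)

def d_relu(x):
    # Gradient is 1 where the unit is active (x > 0) and exactly 0 elsewhere
    return np.where(x > 0, 1.0, 0.0)

x = np.array([-2.0, 0.0, 3.0])
print(relu(x))    # [0. 0. 3.]
print(d_relu(x))  # [0. 0. 1.]
```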
- For classification, the output layer gives the probability of each class.
- The error function compares the output values with the correct values and expresses how well they match.
If the raw errors were simply added, positive and negative errors would cancel out to 0, so they are squared before being summed. The factor $\frac{1}{2}$ makes the coefficient 1 after differentiation, which simplifies the calculation.
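Written out, the squared-error loss and its derivative with respect to an output value are:

$$
E = \frac{1}{2}\sum_{j}(y_j - d_j)^2, \qquad \frac{\partial E}{\partial y_j} = y_j - d_j
$$

so the $\frac{1}{2}$ cancels the 2 that comes down when differentiating.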
- Middle layers: adjust the signal strength before and after the threshold.
- Output layer: the signal magnitude is kept as it is.
- For classification problems, the output layer values should be limited to the range 0 to 1 and sum to 1.
| | Regression | Binary classification | Multi-class classification |
|---|---|---|---|
| Activation function | Identity map | Sigmoid function | Softmax function |
| Activation function (formula) | $f(u)=u$ | $f(u)=\frac{1}{1+e^{-u}}$ | $f(\boldsymbol{u})_i=\frac{e^{u_i}}{\sum_{k=1}^{K}e^{u_k}}$ |
| Error function | Squared error | Cross entropy | Cross entropy |
- Squared error
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)  # Overflow countermeasure
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)  # Overflow countermeasure
    return np.exp(x) / np.sum(np.exp(x))
def mean_squared_error(d, y):
    return np.mean(np.square(d - y)) / 2
def cross_entropy_error(d, y):
    if y.ndim == 1:
        d = d.reshape(1, d.size)
        y = y.reshape(1, y.size)

    # If the teacher data is a one-hot vector, convert it to the index of the correct label
    if d.size == y.size:
        d = d.argmax(axis=1)

    batch_size = y.shape[0]
    # + 1e-7 keeps the argument of log from becoming 0
    return -np.sum(np.log(y[np.arange(batch_size), d] + 1e-7)) / batch_size
# Target function that generates the training targets
def f(x):
    y = 3 * x[0] + 2 * x[1]
    return y
#Initial setting
def init_network():
    # print("#####Network initialization#####")
    network = {}
    nodesNum = 10
    network['W1'] = np.random.randn(2, nodesNum)
    network['W2'] = np.random.randn(nodesNum)
    network['b1'] = np.random.randn(nodesNum)
    network['b2'] = np.random.randn()

    # print_vec("Weight 1", network['W1'])
    # print_vec("Weight 2", network['W2'])
    # print_vec("Bias 1", network['b1'])
    # print_vec("Bias 2", network['b2'])

    return network
#Forward propagation
def forward(network, x):
    # print("#####Start forward propagation#####")
    W1, W2 = network['W1'], network['W2']
    b1, b2 = network['b1'], network['b2']

    u1 = np.dot(x, W1) + b1
    z1 = functions.relu(u1)

    ## Let's try
    # z1 = functions.sigmoid(u1)

    u2 = np.dot(z1, W2) + b2
    y = u2
    return z1, y
#Error back propagation
def backward(x, d, z1, y):
    # print("\n#####Error back propagation start#####")
    grad = {}
    W1, W2 = network['W1'], network['W2']
    b1, b2 = network['b1'], network['b2']

    # Delta at the output layer
    delta2 = functions.d_mean_squared_error(d, y)
    # Gradient of b2
    grad['b2'] = np.sum(delta2, axis=0)
    # Gradient of W2
    grad['W2'] = np.dot(z1.T, delta2)

    # Delta in the middle layer
    # delta1 = np.dot(delta2, W2.T) * functions.d_relu(z1)
    ## Let's try
    delta1 = np.dot(delta2, W2.T) * functions.d_sigmoid(z1)
    delta1 = delta1[np.newaxis, :]
    # Gradient of b1
    grad['b1'] = np.sum(delta1, axis=0)
    x = x[np.newaxis, :]
    # Gradient of W1
    grad['W1'] = np.dot(x.T, delta1)

    return grad
#Create sample data
data_sets_size = 100000
data_sets = [0 for i in range(data_sets_size)]
for i in range(data_sets_size):
    data_sets[i] = {}

    # Set a random input value
    data_sets[i]['x'] = np.random.rand(2)

    ## Let's try: other input value settings
    # data_sets[i]['x'] = np.random.rand(2) * 10 - 5  # random numbers from -5 to 5

    # Set the target output
    data_sets[i]['d'] = f(data_sets[i]['x'])
losses = []
#Learning rate
learning_rate = 0.07
# Number of samples to draw (update steps)
epoch = 1000
#Parameter initialization
network = init_network()
#Random sampling of data
random_datasets = np.random.choice(data_sets, epoch)
# Repeat gradient descent
for dataset in random_datasets:
    x, d = dataset['x'], dataset['d']
    z1, y = forward(network, x)
    grad = backward(x, d, z1, y)

    # Apply the gradients to the parameters
    for key in ('W1', 'W2', 'b1', 'b2'):
        network[key] -= learning_rate * grad[key]

    # Error
    loss = functions.mean_squared_error(d, y)
    losses.append(loss)
print("#####Result display#####")
lists = range(epoch)
plt.plot(lists, losses, '.')
#Graph display
plt.show()
- The purpose of deep learning is to build, through learning, a network that minimizes the error.
- That is, to find the parameters $w$ that minimize the error $E(w)$.
Gradient descent uses the average error over all samples. In the code, the corresponding parameter update is:

for key in ('W1', 'W2', 'b1', 'b2'):
    network[key] -= learning_rate * grad[key]
Stochastic gradient descent (SGD) instead uses the error of a single randomly selected sample. Its advantages:

- Reduces the computational cost when the data is redundant.
- Reduces the risk of converging to an undesirable local minimum.
- Makes online learning possible.
Mini-batch gradient descent is the method commonly used today: the data set is randomly split into mini-batches $D_t$, and the average error of the samples belonging to $D_t$ is used. It makes effective use of computing resources without losing the advantages of stochastic gradient descent.
$w^{(t+1)}=w^{(t)}-\epsilon\nabla{E_t} $
- Online learning: the parameters are updated each time a new piece of training data arrives.
- Batch learning: the parameters are updated using all of the training data at once.
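A minimal sketch of how these approaches differ only in how many samples the gradient is averaged over. The linear model and the helper `grad_E` here are hypothetical, used only to illustrate the update $w^{(t+1)} = w^{(t)} - \epsilon\nabla E_t$:

```python
import numpy as np

def grad_E(w, X, D):
    # Hypothetical gradient of the mean squared error of a linear model y = X @ w
    return X.T @ (X @ w - D) / len(X)

X = np.random.rand(1000, 2)       # all training inputs
D = 3 * X[:, 0] + 2 * X[:, 1]     # targets from the same f(x) as above
w = np.random.randn(2)
epsilon = 0.1

w -= epsilon * grad_E(w, X, D)    # batch: gradient over all data at once

idx = np.random.choice(len(X), 32, replace=False)
w -= epsilon * grad_E(w, X[idx], D[idx])        # mini-batch: random subset D_t

i = np.random.randint(len(X))
w -= epsilon * grad_E(w, X[i:i+1], D[i:i+1])    # SGD / online: one sample at a time
```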
Numerical differentiation is a generic method: generate a tiny value $h$ in the program and approximate the derivative numerically.
$ \frac{\partial E}{\partial w_m} = \frac{E(w_m+h)-E(w_m-h)}{2h}$
Drawbacks:

- The computational load is heavy because the loss must be evaluated separately for each parameter $w_m$.
- Much of the computation is wasted.
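A sketch of what numerical differentiation looks like in code (not the course implementation, just the central-difference formula above applied to each parameter in turn), which makes the cost obvious: two loss evaluations per parameter per update:

```python
import numpy as np

def numerical_gradient(E, w, h=1e-4):
    # Central difference (E(w_m + h) - E(w_m - h)) / (2h) for every parameter w_m
    grad = np.zeros_like(w)
    for m in range(w.size):
        tmp = w.flat[m]
        w.flat[m] = tmp + h
        e_plus = E(w)          # loss with w_m nudged up
        w.flat[m] = tmp - h
        e_minus = E(w)         # loss with w_m nudged down
        grad.flat[m] = (e_plus - e_minus) / (2 * h)
        w.flat[m] = tmp        # restore the original value
    return grad

w = np.array([1.0, 2.0])
print(numerical_gradient(lambda w: np.sum(w ** 2), w))  # approx. [2. 4.]
```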
Therefore
__Use the error backpropagation method instead.__

The error computed at the output is differentiated and propagated from the output layer toward the earlier layers, one layer at a time, so that the derivative with respect to each parameter is obtained analytically with a minimum of computation.
def backward(x, d, z1, y):
    # print("\n#####Error back propagation start#####")
    grad = {}
    W1, W2 = network['W1'], network['W2']
    b1, b2 = network['b1'], network['b2']

    # Delta at the output layer
    delta2 = functions.d_mean_squared_error(d, y)
    # Gradient of b2
    grad['b2'] = np.sum(delta2, axis=0)
    # Gradient of W2
    grad['W2'] = np.dot(z1.T, delta2)

    # Delta in the middle layer
    # delta1 = np.dot(delta2, W2.T) *
① Input value [X] ② Output value [Y] ③ Weight [W] ④ Bias [b] ⑤ Total input [u] ⑥ Intermediate layer input [z] ⑦ Learning rate [ρ]
The final goal is ② optimization of the output values: the output must be appropriate even for unknown inputs.
- Input layer: 2 nodes, 1 layer
- Middle layer: 3 nodes, 2 layers
- Output layer: 1 node, 1 layer
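One way to read that specification is in terms of array shapes (a sketch with arbitrary random initial values, not the course's reference answer):

```python
import numpy as np

network = {
    'W1': np.random.rand(2, 3),  # input layer (2 nodes) -> middle layer 1 (3 nodes)
    'b1': np.random.rand(3),
    'W2': np.random.rand(3, 3),  # middle layer 1 (3 nodes) -> middle layer 2 (3 nodes)
    'b2': np.random.rand(3),
    'W3': np.random.rand(3, 1),  # middle layer 2 (3 nodes) -> output layer (1 node)
    'b3': np.random.rand(1),
}
```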
u = np.dot(x, W) + b
print_vec("Total input", u)
#Total output of 2 layers
z2 = functions.relu(u2)
z1 = functions.sigmoid(u)
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)  # Overflow countermeasure
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)  # Overflow countermeasure
    return np.exp(x) / np.sum(np.exp(x))
① corresponds to `def softmax(x):`, ② to `np.exp(x)` (the numerator), and ③ to `np.sum(np.exp(x), axis=0)` (the denominator).
- The body of `if x.ndim == 2:` handles processing at mini-batch time (2-D input).
- `x = x - np.max(x, axis=0)` is an overflow countermeasure.
#### **` return np.exp(x) / np.sum(np.exp(x))`**
Each element's `exp(x)` is divided by the sum of `exp(x)` over all elements.
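For example, applied to a small 1-D input, the outputs are valid class probabilities: positive and summing to 1 (values chosen arbitrarily):

```python
import numpy as np

x = np.array([0.3, 2.9, 4.0])
y = np.exp(x - np.max(x)) / np.sum(np.exp(x - np.max(x)))
print(y)          # approx. [0.018 0.245 0.737]
print(np.sum(y))  # 1.0
```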
def cross_entropy_error(d, y):
    if y.ndim == 1:
        d = d.reshape(1, d.size)
        y = y.reshape(1, y.size)

    # If the teacher data is a one-hot vector, convert it to the index of the correct label
    if d.size == y.size:
        d = d.argmax(axis=1)

    batch_size = y.shape[0]
    # + 1e-7 keeps the argument of log from becoming 0
    return -np.sum(np.log(y[np.arange(batch_size), d] + 1e-7)) / batch_size
① corresponds to `def cross_entropy_error(d, y):` and ② to `return -np.sum(np.log(y[np.arange(batch_size), d] + 1e-7))`.
`np.log(y[np.arange(batch_size), d] + 1e-7)` represents $d \log y$ for the correct class; the `+ 1e-7` keeps the argument of the log from becoming 0.
#### **`-np.sum(np.log(y[np.arange(batch_size), d] + 1e-7)) / batch_size`**

The cross entropy is obtained by summing $d \log y$ over the batch and dividing by `batch_size`.
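A small worked example of what that indexing does for a batch of two samples (values chosen arbitrarily):

```python
import numpy as np

y = np.array([[0.1, 0.7, 0.2],   # predicted probabilities, sample 1
              [0.8, 0.1, 0.1]])  # predicted probabilities, sample 2
d = np.array([1, 0])             # correct class indices

batch_size = y.shape[0]
picked = y[np.arange(batch_size), d]  # [0.7 0.8]: probability assigned to the correct class
print(-np.sum(np.log(picked + 1e-7)) / batch_size)  # approx. 0.290
```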
for key in ('W1', 'W2', 'b1', 'b2'):
    network[key] -= learning_rate * grad[key]
Online learning is a way of proceeding with learning in which the parameters are updated each time a piece of training data enters the model. The data does not have to be prepared in advance; data that accumulates can be used as learning proceeds.
The weights are updated using the error of each epoch (each update step $t$).
# Error back propagation
def backward(x, d, z1, y):
    print("\n#####Error back propagation start#####")

    grad = {}
    W1, W2 = network['W1'], network['W2']
    b1, b2 = network['b1'], network['b2']

    # Delta at the output layer
    delta2 = functions.d_mean_squared_error(d, y)
    # Gradient of b2
    grad['b2'] = np.sum(delta2, axis=0)
    # Gradient of W2
    grad['W2'] = np.dot(z1.T, delta2)

    # Delta in the middle layer
    # delta1 = np.dot(delta2, W2.T) * functions.d_relu(z1)
    ## Let's try
    delta1 = np.dot(delta2, W2.T) * functions.d_sigmoid(z1)
    delta1 = delta1[np.newaxis, :]
    # Gradient of b1
    grad['b1'] = np.sum(delta1, axis=0)
    x = x[np.newaxis, :]
    # Gradient of W1
    grad['W1'] = np.dot(x.T, delta1)

    print_vec("Partial derivative_Weight 1", grad["W1"])
    print_vec("Partial derivative_Weight 2", grad["W2"])
    print_vec("Partial derivative_Bias 1", grad["b1"])
    print_vec("Partial derivative_Bias 2", grad["b2"])

    return grad
It can be seen that the already computed result `delta2` is reused in this calculation.
① `delta1 = np.dot(delta2, W2.T) * functions.d_relu(z1)`

② `grad['W1'] = np.dot(x.T, delta1)`
https://github.com/Tomo-Horiuchi/rabbit/blob/master/part2/1Day/1_1_forward_propagation.ipynb https://github.com/Tomo-Horiuchi/rabbit/blob/master/part2/1Day/1_2_back_propagation.ipynb https://github.com/Tomo-Horiuchi/rabbit/blob/master/part2/1Day/1_3_stochastic_gradient_descent.ipynb
- I was able to change the shape of the network.
- I learned how to manipulate arrays with numpy.
- I realized that how well the NN converges changes greatly depending on the training and test data, the shape of the network, and the weights.