[Deep Learning from Scratch: The theory and implementation of deep learning learned with Python](https://www.amazon.co.jp/%E3%82%BC%E3%83%AD%E3%81%8B%E3%82%89%E4%BD%9C%E3%82%8BDeep-Learning-Python%E3%81%A7%E5%AD%A6%E3%81%B6%E3%83%87%E3%82%A3%E3%83%BC%E3%83%97%E3%83%A9%E3%83%BC%E3%83%8B%E3%83%B3%E3%82%B0%E3%81%AE%E7%90%86%E8%AB%96%E3%81%A8%E5%AE%9F%E8%A3%85-%E6%96%8E%E8%97%A4-%E5%BA%B7%E6%AF%85/dp/4873117585/ref=sr_1_1?s=digital-text&ie=UTF8&qid=1483316946&sr=8-1&keywords=deep+learning+%E3%82%BC%E3%83%AD%E3%81%8B%E3%82%89)
Sigmoid function
from sigmoid import *
import numpy as np

def sigmoid(x):
    # Squash any real-valued input into the range (0, 1)
    return 1 / (1 + np.exp(-x))
sigmoid(7)
0.9990889488055994
x = np.array([4,6,-2,-1, 2])
sigmoid(x)
array([ 0.98201379, 0.99752738, 0.11920292, 0.26894142, 0.88079708])
Step function
from step_function import *
import numpy as np

def step_function(x):
    # 1 where x > 0, otherwise 0 (np.int is deprecated in recent NumPy, so plain int is used)
    return np.array(x > 0, dtype=int)
step_function(5)
array(1)
step_function(-5)
array(0)
x = np.array([3,-6,4,-1])
step_function(x)
array([1, 0, 1, 0])
x = np.random.randn(2,3)
print(x)
print(step_function(x))
[[ 0.21780529 -0.05316613 1.28802155]
[-0.55119659 -1.23515555 0.6576237 ]]
[[1 0 1]
[0 0 1]]
ReLU (Rectified Linear Unit) function
A function that outputs the input unchanged when it is greater than 0, and outputs 0 when it is 0 or less.
from relu import *
import numpy as np

def relu(x):
    # Element-wise maximum of 0 and x
    return np.maximum(0, x)
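As a quick check (not in the original notebook), relu can be applied element-wise to an array just like sigmoid and step_function:

```python
relu(np.array([-3.0, 0.0, 4.5]))  # -> array([ 0. ,  0. ,  4.5])
```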
B = np.array([[1,2], [3,4], [5,6]]); B
array([[1, 2],
[3, 4],
[5, 6]])
np.ndim(B)
2
B.shape
(3, 2)
A = np.array([[3,2,1], [6,5,4]])
A.dot(B)
array([[14, 20],
[41, 56]])
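As an added shape check (not in the original): a (2, 3) matrix times a (3, 2) matrix yields a (2, 2) matrix, because the inner dimensions (3 and 3) match.

```python
print(A.shape, B.shape, np.dot(A, B).shape)  # (2, 3) (3, 2) (2, 2)
```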
x = np.array([1., .5])
W1 = np.array([[.1, .3, .5], [.2, .4, .6]])
B1 = np.array([.1, .2, .3])
print('Input signal x=', x)
print('Weight W1=', W1)
print('Bias B1=', B1)
Input signal x= [ 1.   0.5]
Weight W1= [[ 0.1  0.3  0.5]
 [ 0.2  0.4  0.6]]
Bias B1= [ 0.1  0.2  0.3]
x.shape, W1.shape
((2,), (2, 3))
The last dimension of x (2 elements) must match the first dimension of W1 (2 rows)!
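A minimal sketch (not in the original) of what happens when the dimensions do not match; W_bad is a hypothetical weight matrix with the wrong shape:

```python
W_bad = np.array([[.1, .3], [.2, .4], [.5, .6]])  # shape (3, 2)
try:
    np.dot(x, W_bad)  # (2,) . (3, 2): inner dimensions 2 and 3 do not align
except ValueError as e:
    print(e)
```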
A1 = np.dot(x, W1) + B1; A1
array([ 0.3, 0.7, 1.1])
The weighted sum (weighted input signals plus the bias) in the hidden layer is represented by $a$.
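Written out with the book's layer-superscript notation (the superscript is added here for clarity), the first-layer weighted sum computed above is:

$$A^{(1)} = XW^{(1)} + B^{(1)}$$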
Z1 = sigmoid(A1); Z1
array([ 0.57444252, 0.66818777, 0.75026011])
The signal converted by the activation function is represented by $z$.
The sigmoid function corresponds to $h()$ in the figure.
W2 = np.array([[.1, .4], [.2, .5], [.3, .6]])
B2 = np.array([.1, .2])
print('W2=', W2)
print('B2=', B2)
W2= [[ 0.1 0.4]
[ 0.2 0.5]
[ 0.3 0.6]]
B2= [ 0.1 0.2]
A2 = np.dot(Z1, W2) + B2; A2
array([ 0.51615984, 1.21402696])
Z2 = sigmoid(A2); Z2
array([ 0.62624937, 0.7710107 ])
def identity_function(x):
    """Identity function: outputs the input as it is, doing nothing else"""
    return x
W3 = np.array([[.1, .3], [.2, .4]])
B3 = np.array([.1, .2])
print('W3=', W3)
print('B3=', B3)
W3= [[ 0.1 0.3]
[ 0.2 0.4]]
B3= [ 0.1 0.2]
A3 = np.dot(Z2, W3) + B3
Y= identity_function(A3); Y
array([ 0.31682708, 0.69627909])
A function called identity_function() is used as the activation function of the output layer.
The activation function of the output layer is written $\sigma()$ to distinguish it from the hidden-layer activation function $h()$.
def init_network():
    """Initialize the weights and biases"""
    network = {}
    network['W1'] = np.array([[.1, .3, .5], [.2, .4, .6]])
    network['W2'] = np.array([[.1, .4], [.2, .5], [.3, .6]])
    network['W3'] = np.array([[.1, .3], [.2, .4]])
    network['b1'] = np.array([.1, .2, .3])
    network['b2'] = np.array([.1, .2])
    network['b3'] = np.array([.1, .2])
    return network
def forward(network, x):
    """Propagate the input signal x forward through the network to the output"""
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    # Input layer -> first hidden layer
    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    # First hidden layer -> second hidden layer
    a2 = np.dot(z1, W2) + b2
    z2 = sigmoid(a2)
    # Second hidden layer -> output layer
    a3 = np.dot(z2, W3) + b3
    return identity_function(a3)
x
array([ 1. , 0.5])
network = init_network()
x = np.array([1., .5])
forward(network, x)
array([ 0.31682708, 0.69627909])
def softmax(a):
    """Convert the input vector a into probabilities that sum to 1"""
    exp_a = np.exp(a - np.max(a))  # Subtract the maximum as an overflow countermeasure
    return exp_a / np.sum(exp_a)
If the exponent is too large, $e^a$ overflows to `inf` ("infinity"), so the result hits the ceiling and an accurate calculation becomes impossible (this is called **overflow**). Therefore, shift the inputs to smaller values before exponentiating and then divide by the sum; the result is mathematically equivalent, because adding the same constant to every $a_k$ inside the exponentials of both the numerator and the denominator leaves $y_k$ unchanged. In practice the constant $-\max(a)$ is used.
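A minimal demonstration (not in the original notebook) of why the shift matters: exponentiating large values directly overflows, while subtracting the maximum first gives the same probabilities without overflow.

```python
import numpy as np

a = np.array([1010, 1000, 990])
print(np.exp(a) / np.sum(np.exp(a)))          # naive version: [nan nan nan] (inf / inf)
c = np.max(a)
print(np.exp(a - c) / np.sum(np.exp(a - c)))  # stable version: approx. [1.0e+00 4.5e-05 2.1e-09]
```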
softmax(np.array([.3, 2.9, 4.]))
array([ 0.01821127, 0.24519181, 0.73659691])
import os, sys
sys.path.append(os.pardir)  # Add the parent directory to the module search path
from dataset.mnist import load_mnist
Running the following downloads the MNIST dataset as .gz files, unpacks it, and caches it in a pickle (.pkl) file.
The first run takes a few minutes to finish.
(x_train, t_train), (x_test, t_test) = load_mnist(flatten=True, normalize=False)
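As an added sanity check of what load_mnist returns: with flatten=True each image becomes a 784-element vector, and MNIST contains 60,000 training images and 10,000 test images.

```python
print(x_train.shape)  # (60000, 784)
print(t_train.shape)  # (60000,)
print(x_test.shape)   # (10000, 784)
print(t_test.shape)   # (10000,)
```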
The source is ch03/mnist_show.py.
The image is displayed in .BMP format.
# %load mnist_show.py
import sys, os
sys.path.append(os.pardir) #Settings for importing files in the parent directory
import numpy as np
from dataset.mnist import load_mnist
from PIL import Image
def img_show(img):
    pil_img = Image.fromarray(np.uint8(img))  # Convert the NumPy array to a PIL image
    pil_img.show()
(x_train, t_train), (x_test, t_test) = load_mnist(flatten=True, normalize=False)
img = x_train[0]
label = t_train[0]
print(label) # 5
print(img.shape) # (784,)
img = img.reshape(28, 28) #Transform the shape to the original image size
print(img.shape) # (28, 28)
img_show(img)
5
(784,)
(28, 28)
# %load neuralnet_mnist.py
import sys, os
sys.path.append(os.pardir) #Settings for importing files in the parent directory
import numpy as np
import pickle
from dataset.mnist import load_mnist
from common.functions import sigmoid, softmax
def get_data():
    """Load MNIST and return the test set used for evaluation"""
    (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, flatten=True, one_hot_label=False)
    return x_test, t_test
def init_network():
    """Load the trained weight parameters stored in sample_weight.pkl"""
    with open("sample_weight.pkl", 'rb') as f:
        network = pickle.load(f)
    return network
def predict(network, x):
    """Forward pass of the neural network.
    Unlike the earlier example, the output layer uses
    the softmax function instead of the identity function."""
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, W3) + b3
    y = softmax(a3)
    return y
%%timeit
x, t = get_data()
network = init_network()
accuracy_cnt = 0
for i in range(len(x)):
    y = predict(network, x[i])
    p = np.argmax(y)  # Index of the element with the highest probability
    if p == t[i]:
        accuracy_cnt += 1
print("Accuracy:" + str(float(accuracy_cnt) / len(x)))
Accuracy:0.9352
Accuracy:0.9352
Accuracy:0.9352
Accuracy:0.9352
1 loop, best of 3: 1.25 s per loop
Transition of array shapes in batch processing
Batch processing raises the ratio of computation to data loading, which reduces the load on the bus bandwidth and shortens the processing time per image.
Reading one large array and computing on it all at once finishes faster than computing on many small arrays piece by piece. A sketch of the resulting shape transition is shown below.
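A minimal sketch (not in the original notebook), assuming the trained network in sample_weight.pkl with layer sizes 784 → 50 → 100 → 10; for a batch of 100 images, the shapes change as (100, 784) → (100, 50) → (100, 100) → (100, 10):

```python
x, t = get_data()
network = init_network()
x_batch = x[0:100]                   # shape (100, 784): 100 flattened images
y_batch = predict(network, x_batch)  # shape (100, 10): one score vector per image
print(x_batch.shape, y_batch.shape)  # (100, 784) (100, 10)
```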
%%timeit
# %load neuralnet_mnist_batch.py
x, t = get_data()
network = init_network()
batch_size = 100  # Number of images processed per batch
accuracy_cnt = 0
for i in range(0, len(x), batch_size):
    x_batch = x[i:i+batch_size]
    y_batch = predict(network, x_batch)
    p = np.argmax(y_batch, axis=1)  # Most probable class for each image in the batch
    accuracy_cnt += np.sum(p == t[i:i+batch_size])
print("Accuracy:" + str(float(accuracy_cnt) / len(x)))
Accuracy:0.9352
Accuracy:0.9352
Accuracy:0.9352
Accuracy:0.9352
1 loop, best of 3: 269 ms per loop
Batch processing made the evaluation run roughly 4–5 times faster (1.25 s → 269 ms per loop).