I'm reading the masterpiece **"Deep Learning from Scratch 2"**. This post is my notes on Chapter 5. To run the code, download the whole repository from GitHub and use Jupyter Notebook in the ch05 directory.
2.RNNLM
First, try running the following code, ch05/train_custom_loop.py, which learns the word order of the PTB dataset.
import sys
sys.path.append('..')
import matplotlib.pyplot as plt
import numpy as np
from common.optimizer import SGD
from dataset import ptb
from simple_rnnlm import SimpleRnnlm
#Hyperparameter settings
batch_size = 10
wordvec_size = 100
hidden_size = 100
time_size = 5  # Number of time steps to unroll for Truncated BPTT
lr = 0.1
max_epoch = 100
#Read training data (make the data set smaller)
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_size = 1000
corpus = corpus[:corpus_size]
vocab_size = int(max(corpus) + 1)
xs = corpus[:-1] #input
ts = corpus[1:] #Output (teacher label)
data_size = len(xs)
print('corpus size: %d, vocabulary size: %d' % (corpus_size, vocab_size))
#Variables used during learning
max_iters = data_size // (batch_size * time_size)
time_idx = 0
total_loss = 0
loss_count = 0
ppl_list = []
#Model generation
model = SimpleRnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)
#Calculate the loading start position of each sample in the mini-batch
jump = (corpus_size - 1) // batch_size
offsets = [i * jump for i in range(batch_size)]
for epoch in range(max_epoch):
    for iter in range(max_iters):
        # Get a mini-batch
        batch_x = np.empty((batch_size, time_size), dtype='i')
        batch_t = np.empty((batch_size, time_size), dtype='i')
        for t in range(time_size):
            for i, offset in enumerate(offsets):
                batch_x[i, t] = xs[(offset + time_idx) % data_size]
                batch_t[i, t] = ts[(offset + time_idx) % data_size]
            time_idx += 1

        # Find the gradients and update the parameters
        loss = model.forward(batch_x, batch_t)
        model.backward()
        optimizer.update(model.params, model.grads)
        total_loss += loss
        loss_count += 1

    # Evaluate the perplexity for each epoch
    ppl = np.exp(total_loss / loss_count)
    print('| epoch %d | perplexity %.2f'
          % (epoch+1, ppl))
    ppl_list.append(float(ppl))
    total_loss, loss_count = 0, 0
#Drawing a graph
x = np.arange(len(ppl_list))
plt.plot(x, ppl_list, label='train')
plt.xlabel('epochs')
plt.ylabel('perplexity')
plt.show()
The vertical axis of the graph is a metric called **perplexity**, which measures how well the model predicts the next word. It is defined as $perplexity = e^L$, where $L = -\frac{1}{N}\sum_n\sum_k t_{nk}\log y_{nk}$. The closer the perplexity is to 1, the better the prediction. Intuitively, perplexity is the effective number of candidates for the next word.
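As a quick numerical illustration (my own sketch, not from the book): a model that guesses uniformly over V words has cross-entropy loss $\log V$ and therefore perplexity V, while a perfect model has loss 0 and perplexity 1.

```python
import numpy as np

V = 10000                        # hypothetical vocabulary size
uniform_loss = -np.log(1.0 / V)  # cross entropy of a uniform guess
print(np.exp(uniform_loss))      # 10000.0 -> "10000 choices"
print(np.exp(0.0))               # 1.0 -> a perfect predictor
```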
Now, let's take a quick look at the part that prepares the data. **corpus** uses only the first 1,000 words of the PTB dataset; the input data **xs** and the teacher data **ts** each contain 999 words, shifted from each other by one word.
Then `offsets` fixes the batch_size (10) read positions, and a mini-batch is built by cutting out time_size (5) items starting from each position. Whenever `offset + time_idx` reaches the data size of 999, the read position wraps around to 0 and data keeps being taken from the start.
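As a small worked example of how the read positions move (my own sketch, using the hyperparameters above):

```python
corpus_size, batch_size, data_size = 1000, 10, 999

jump = (corpus_size - 1) // batch_size           # 99
offsets = [i * jump for i in range(batch_size)]  # [0, 99, 198, ..., 891]
print(offsets)

# For the last sample (offset 891), the read position wraps around
# once offset + time_idx reaches data_size (999):
for time_idx in (107, 108, 109):
    print((891 + time_idx) % data_size)          # 998, 0, 1
```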
Let's take a look at the **SimpleRnnlm** class, which builds the model.
3.SimpleRnnlm
class SimpleRnnlm:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        # Weight initialization
        embed_W = (rn(V, D) / 100).astype('f')
        rnn_Wx = (rn(D, H) / np.sqrt(D)).astype('f')
        rnn_Wh = (rn(H, H) / np.sqrt(H)).astype('f')
        rnn_b = np.zeros(H).astype('f')
        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        # Layer generation
        self.layers = [
            TimeEmbedding(embed_W),
            TimeRNN(rnn_Wx, rnn_Wh, rnn_b, stateful=True),
            TimeAffine(affine_W, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.rnn_layer = self.layers[1]

        # Collect all weights and gradients into lists
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, ts):
        for layer in self.layers:
            xs = layer.forward(xs)
        loss = self.loss_layer.forward(xs, ts)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        self.rnn_layer.reset_state()
**class SimpleRnnlm** is a stack of four **Time layers**: **TimeEmbedding, TimeRNN, TimeAffine, TimeSoftmaxWithLoss**. Let's look at each Time layer in order.
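Before going through the layers one by one, here is a quick sanity check I find useful (my own sketch; it assumes it is run from the ch05 directory with the repository on the path, as in the script above):

```python
import sys
sys.path.append('..')
import numpy as np
from simple_rnnlm import SimpleRnnlm

model = SimpleRnnlm(vocab_size=100, wordvec_size=100, hidden_size=100)
xs = np.random.randint(0, 100, (10, 5))      # (batch_size, time_size) word ids
ts = np.random.randint(0, 100, (10, 5))      # teacher labels, same shape
print(model.forward(xs, ts))                 # a single scalar loss
print(len(model.params), len(model.grads))   # 6 weight arrays and their gradients
```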
class TimeEmbedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.layers = None
        self.W = W

    def forward(self, xs):
        N, T = xs.shape
        V, D = self.W.shape

        out = np.empty((N, T, D), dtype='f')
        self.layers = []

        for t in range(T):
            layer = Embedding(self.W)
            out[:, t, :] = layer.forward(xs[:, t])
            self.layers.append(layer)

        return out

    def backward(self, dout):
        N, T, D = dout.shape

        grad = 0
        for t in range(T):
            layer = self.layers[t]
            layer.backward(dout[:, t, :])
            grad += layer.grads[0]

        self.grads[0][...] = grad
        return None
The **TimeEmbedding** layer slices the data out of xs one time step at a time, feeds each slice to an **Embedding** layer, and stores the result in **out (N, T, D)**, repeating this T times in the loop.
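For reference, the Embedding layer used here comes from the book's common code; a minimal sketch of its behavior (my own paraphrase, not a verbatim copy) looks like this:

```python
import numpy as np

class Embedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        W, = self.params
        self.idx = idx
        return W[idx]                   # pick rows of W by word id: (N,) -> (N, D)

    def backward(self, dout):
        dW, = self.grads
        dW[...] = 0
        np.add.at(dW, self.idx, dout)   # scatter-add handles repeated ids correctly
        return None
```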
Before looking at the TimeRNN layer, let's look at the RNN layer it is built from.
class RNN:
    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.cache = None

    def forward(self, x, h_prev):
        Wx, Wh, b = self.params
        t = np.dot(h_prev, Wh) + np.dot(x, Wx) + b
        h_next = np.tanh(t)

        self.cache = (x, h_prev, h_next)
        return h_next
The RNN layer has two weights: **W_x**, which is multiplied (MatMul) with the input $x_t$, and **W_h**, which is multiplied with the previous hidden state $h_{prev}$.
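In formula form the forward pass is $h_{next} = \tanh(h_{prev} W_h + x W_x + b)$; a one-step shape check (my own sketch) looks like this:

```python
import numpy as np

N, D, H = 2, 3, 4                 # batch size, input dim, hidden dim
x = np.random.randn(N, D)
h_prev = np.zeros((N, H))
Wx = np.random.randn(D, H)
Wh = np.random.randn(H, H)
b = np.zeros(H)

h_next = np.tanh(np.dot(h_prev, Wh) + np.dot(x, Wx) + b)
print(h_next.shape)               # (2, 4) -- the same shape as h_prev
```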
    def backward(self, dh_next):
        Wx, Wh, b = self.params
        x, h_prev, h_next = self.cache

        dt = dh_next * (1 - h_next ** 2)
        db = np.sum(dt, axis=0)
        dWh = np.dot(h_prev.T, dt)
        dh_prev = np.dot(dt, Wh.T)
        dWx = np.dot(x.T, dt)
        dx = np.dot(dt, Wx.T)

        self.grads[0][...] = dWx
        self.grads[1][...] = dWh
        self.grads[2][...] = db

        return dx, dh_prev
Backpropagation looks like this. It is essentially the Affine layer's backward pass with the derivative of tanh folded in, so there is nothing complicated about it.
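To convince myself the formulas are right, here is a small numerical check of dx (my own sketch, not from the book); it uses sum(h_next) as a scalar loss, so dh_next is all ones:

```python
import numpy as np

np.random.seed(0)
N, D, H = 2, 3, 4
x = np.random.randn(N, D)
h_prev = np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, H), np.random.randn(H, H), np.random.randn(H)

def forward(x):
    return np.tanh(np.dot(h_prev, Wh) + np.dot(x, Wx) + b)

# Analytic gradient of loss = sum(h_next) with respect to x
h_next = forward(x)
dt = np.ones_like(h_next) * (1 - h_next ** 2)
dx_analytic = np.dot(dt, Wx.T)

# Numerical gradient by central differences
eps = 1e-5
dx_numeric = np.zeros_like(x)
for i in range(N):
    for j in range(D):
        x1, x2 = x.copy(), x.copy()
        x1[i, j] += eps
        x2[i, j] -= eps
        dx_numeric[i, j] = (forward(x1).sum() - forward(x2).sum()) / (2 * eps)

print(np.allclose(dx_analytic, dx_numeric, atol=1e-5))  # True
```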
class TimeRNN:
    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.layers = None

        self.h, self.dh = None, None
        self.stateful = stateful

    def forward(self, xs):
        Wx, Wh, b = self.params
        N, T, D = xs.shape
        D, H = Wx.shape

        self.layers = []
        hs = np.empty((N, T, H), dtype='f')

        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')

        for t in range(T):
            layer = RNN(*self.params)
            self.h = layer.forward(xs[:, t, :], self.h)
            hs[:, t, :] = self.h
            self.layers.append(layer)

        return hs
The **TimeRNN** layer is a network that connects T RNN layers. The `stateful` argument controls whether the hidden state h is carried over between blocks.
For forward propagation, first prepare the output container hs of shape (N, T, H). Then, inside the for loop, the t-th slice of the data is cut out with `xs[:, t, :]` and fed to an ordinary RNN layer; the output is stored at the corresponding position `hs[:, t, :]`, and the layer itself is appended to layers.
In other words, the TimeRNN layer wraps the RNN layer's input and output with the slicing and gathering of the time dimension.
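Stripped of the layer bookkeeping, what the forward pass computes is just the tanh recurrence unrolled over T steps (a minimal sketch of mine, not the book's code):

```python
import numpy as np

N, T, D, H = 2, 5, 3, 4
xs = np.random.randn(N, T, D).astype('f')
Wx = np.random.randn(D, H).astype('f')
Wh = np.random.randn(H, H).astype('f')
b = np.zeros(H, dtype='f')

h = np.zeros((N, H), dtype='f')   # initial state (the stateful=False case)
hs = np.empty((N, T, H), dtype='f')
for t in range(T):
    h = np.tanh(np.dot(h, Wh) + np.dot(xs[:, t, :], Wx) + b)
    hs[:, t, :] = h               # keep the hidden state of every time step

print(hs.shape)                   # (2, 5, 4); with stateful=True, h survives to the next call
```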
    def backward(self, dhs):
        Wx, Wh, b = self.params
        N, T, H = dhs.shape
        D, H = Wx.shape

        dxs = np.empty((N, T, D), dtype='f')
        dh = 0
        grads = [0, 0, 0]
        for t in reversed(range(T)):
            layer = self.layers[t]
            dx, dh = layer.backward(dhs[:, t, :] + dh)  # Combined gradient
            dxs[:, t, :] = dx

            for i, grad in enumerate(layer.grads):
                grads[i] += grad

        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        self.dh = dh

        return dxs

    def set_state(self, h):
        self.h = h

    def reset_state(self):
        self.h = None
The forward propagation of TimeRNN has two outputs per cell (one to the layer above, one to the next time step), so in backpropagation each cell receives their sum, $dh_t + dh_{next}$, as input.
First, create the container dxs that flows downstream. Then, in the reverse order of forward propagation, call backward() of each RNN layer to obtain the gradient dx at that time step and store it at the corresponding index of dxs. For the weights, the gradients of every layer are accumulated, and the final result overwrites self.grads.
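The reason for the sum `dhs[:, t, :] + dh` is that $h_t$ fans out to two consumers, the Affine layer above and the next RNN cell, and a value used by two branches receives the sum of their gradients. A tiny numerical check of that rule (my own sketch):

```python
import numpy as np

np.random.seed(0)
h = np.random.randn(3)
w_up, w_next = np.random.randn(3), np.random.randn(3)

def loss(h):
    return h @ w_up + h @ w_next   # two branches consume the same h

eps = 1e-6
num = np.array([(loss(h + eps * np.eye(3)[i]) - loss(h - eps * np.eye(3)[i])) / (2 * eps)
                for i in range(3)])
print(np.allclose(num, w_up + w_next))  # True: the two branch gradients simply add
```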
class TimeAffine:
    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None

    def forward(self, x):
        N, T, D = x.shape
        W, b = self.params

        rx = x.reshape(N*T, -1)
        out = np.dot(rx, W) + b
        self.x = x
        return out.reshape(N, T, -1)

    def backward(self, dout):
        x = self.x
        N, T, D = x.shape
        W, b = self.params

        dout = dout.reshape(N*T, -1)
        rx = x.reshape(N*T, -1)

        db = np.sum(dout, axis=0)
        dW = np.dot(rx.T, dout)
        dx = np.dot(dout, W.T)
        dx = dx.reshape(*x.shape)

        self.grads[0][...] = dW
        self.grads[1][...] = db

        return dx
The **TimeAffine** layer wraps the input and output of an ordinary **Affine** layer with reshapes so that all T time steps along the time axis can be handled at once.
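The reshape trick in isolation (my own sketch): fold N and T together, apply a single matrix product, and unfold again.

```python
import numpy as np

N, T, H, V = 2, 5, 4, 7
x = np.random.randn(N, T, H).astype('f')
W = np.random.randn(H, V).astype('f')
b = np.zeros(V, dtype='f')

out = (np.dot(x.reshape(N * T, H), W) + b).reshape(N, T, V)
print(out.shape)   # (2, 5, 7) -- one Affine applied to all T time steps at once
```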
8.TimeSoftmaxWithLoss
class TimeSoftmaxWithLoss:
    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None
        self.ignore_label = -1

    def forward(self, xs, ts):
        N, T, V = xs.shape

        if ts.ndim == 3:  # When the teacher labels are one-hot vectors
            ts = ts.argmax(axis=2)

        mask = (ts != self.ignore_label)

        # Flatten the batch and time dimensions (reshape)
        xs = xs.reshape(N * T, V)
        ts = ts.reshape(N * T)
        mask = mask.reshape(N * T)

        ys = softmax(xs)
        ls = np.log(ys[np.arange(N * T), ts])
        ls *= mask  # Set the loss to 0 for data whose label is ignore_label
        loss = -np.sum(ls)
        loss /= mask.sum()

        self.cache = (ts, ys, mask, (N, T, V))
        return loss

    def backward(self, dout=1):
        ts, ys, mask, (N, T, V) = self.cache

        dx = ys
        dx[np.arange(N * T), ts] -= 1
        dx *= dout
        dx /= mask.sum()
        dx *= mask[:, np.newaxis]  # Set the gradient to 0 for data whose label is ignore_label

        dx = dx.reshape((N, T, V))

        return dx
The **TimeSoftmaxWithLoss** layer computes the Softmax-with-Loss of each pair $x_t$, $t_t$, adds up the T losses, and divides by T; in other words, it averages the loss over the time steps.
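Ignoring the ignore_label masking, the computed loss is simply the mean cross entropy over all N*T positions; a minimal sketch (mine, not the book's code):

```python
import numpy as np

N, T, V = 2, 5, 7
xs = np.random.randn(N, T, V)                        # scores from TimeAffine
ts = np.random.randint(0, V, (N, T))                 # teacher word ids

xs2 = xs.reshape(N * T, V)
ys = np.exp(xs2 - xs2.max(axis=1, keepdims=True))
ys /= ys.sum(axis=1, keepdims=True)                  # softmax at every position
loss = -np.mean(np.log(ys[np.arange(N * T), ts.reshape(N * T)]))
print(loss)                                          # roughly log(7) ≈ 1.95 for random scores
```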
To get a better feel for the whole thing, let's try it on a Japanese dataset. Working word by word would require morphological analysis, so I work at the **character level** instead. This time, I downloaded "The Old Man and the Sea" from Aozora Bunko.
import numpy as np
import io
def load_data():
    # Read the first 1000 characters of file_name into text as UTF-8
    file_name = './data_rojinto_umi.txt'
    length = 1000
    with io.open(file_name, encoding='utf-8') as f:
        text = f.read().lower()
        text = text[:length]

    # Create word_to_id and id_to_word
    word_to_id, id_to_word = {}, {}
    for word in text:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # Create the corpus
    corpus = np.array([word_to_id[W] for W in text])

    return text, corpus, word_to_id, id_to_word
This function reads the first 1000 characters of the text file specified by `file_name` as UTF-8 and returns text, corpus, word_to_id, and id_to_word. Let's run it briefly.
text, corpus, word_to_id, id_to_word = load_data()
print('text_length = ', len(text))
print(text)
The length of `text` is 1000, as specified. Since the data is prepared character by character, 1,000 characters is only a short, compact piece of text.
print('vocab_size = ', len(word_to_id))
print(word_to_id)
This is `word_to_id`. `vocab_size` is 236, smaller than I expected. It is used to replace each character of `text` with its id to create `corpus`.
print('corpus_length = ', len(corpus))
print(corpus[:500])
This is `corpus`. Only the first 500 entries are displayed.
text2 = ''.join([id_to_word[id] for id in corpus])
print(text2)
Converting the ids in `corpus` back to characters with `id_to_word` recovers the original text, as shown.
Now, while I'm at it, I'd like to prepare test data and answers and check how well the model predicts at each epoch.
#Sample from corpus
x = corpus[:50]
t = corpus[1:51]
print('x = ', x)
print('t = ', t)
# Check the contents as characters
text_x = ''.join([id_to_word[id] for id in x])
text_t = ''.join([id_to_word[id] for id in t])
print('text_x = ', text_x)
print('text_t = ', text_t)
#Convert to batch format
test_x = x.reshape(10, 5)
test_t = t.reshape(10, 5)
print(test_x)
print(test_t)
x and t are taken from `corpus` as 50-character slices shifted by one character. To check the contents, I first convert them back to characters. They are then reshaped to the shape (10, 5) used by the model, creating the test data test_x and test_t.
    def generate(self, xs):
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs
This method is added at the end of simple_rnnlm.py (inside the SimpleRnnlm class) so that a prediction can be made at each epoch.
Then, starting from the ch05/train_custom_loop.py code executed first, two parts are modified or added: **reading the training data** and **running inference on the test data**. The number of epochs is set to 1000.
import sys
sys.path.append('..')
import matplotlib.pyplot as plt
import numpy as np
from common.optimizer import SGD
from dataset import ptb
from simple_rnnlm import SimpleRnnlm
#Hyperparameter settings
batch_size = 10
wordvec_size = 100
hidden_size = 100
time_size = 5  # Number of time steps to unroll for Truncated BPTT
lr = 0.1
max_epoch = 1000
# -----------Reading training data-------------
text, corpus, word_to_id, id_to_word = load_data()
corpus_size = 1000
vocab_size = int(max(corpus) + 1)
# ----------------------------------------------
xs = corpus[:-1] #input
ts = corpus[1:] #Output (teacher label)
data_size = len(xs)
print('corpus size: %d, vocabulary size: %d' % (corpus_size, vocab_size))
#Variables used during learning
max_iters = data_size // (batch_size * time_size)
time_idx = 0
total_loss = 0
loss_count = 0
ppl_list = []
#Model generation
model = SimpleRnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)
#Calculate the loading start position of each sample in the mini-batch
jump = (corpus_size - 1) // batch_size
offsets = [i * jump for i in range(batch_size)]
for epoch in range(max_epoch):
    for iter in range(max_iters):
        # Get a mini-batch
        batch_x = np.empty((batch_size, time_size), dtype='i')
        batch_t = np.empty((batch_size, time_size), dtype='i')
        for t in range(time_size):
            for i, offset in enumerate(offsets):
                batch_x[i, t] = xs[(offset + time_idx) % data_size]
                batch_t[i, t] = ts[(offset + time_idx) % data_size]
            time_idx += 1

        # Find the gradients and update the parameters
        loss = model.forward(batch_x, batch_t)
        model.backward()
        optimizer.update(model.params, model.grads)
        total_loss += loss
        loss_count += 1

    # Evaluate the perplexity for each epoch
    ppl = np.exp(total_loss / loss_count)
    print('| epoch %d | perplexity %.2f'
          % (epoch+1, ppl))
    ppl_list.append(float(ppl))
    total_loss, loss_count = 0, 0

    # ---------- Predict on the test data ------------
    pred = model.generate(test_x)
    predict = np.argmax(pred, axis=2)
    print(predict)
    # ------------------------------------------------
#Drawing a graph
x = np.arange(len(ppl_list))
plt.plot(x, ppl_list, label='train')
plt.xlabel('epochs')
plt.ylabel('perplexity')
plt.show()
After 1000 epochs, the perplexity dropped to 1.08. Since the prediction on the test data is printed at every epoch, we can watch it improve: after 1 epoch the prediction is all 5s (the hiragana "ta"), which is what a barely trained model looks like, while after 1000 epochs the prediction looks quite plausible. Now let's check the final prediction result.
Matching the predictions against the answers (the correct characters are circled in red), 24 out of 50 are correct, an accuracy of 48%, which is not as high as I expected. The accuracy seems higher in the latter half of each 5-character block, presumably because those characters can be predicted from the characters earlier in the block. Expressing the prediction of the first row in terms of the model,
I see, so that is the kind of mistake it makes.