I'm reading the masterpiece **"Deep Learning from Zero 2"**. This post is my memo for Chapter 6. To run the code, download the entire repository from GitHub and use Jupyter Notebook inside the ch06 directory.
This is the code for ch06/train_rnnlm.py, which learns the word ordering of the PTB dataset. The simple RNN model in Chapter 5 was trained on only the first 1,000 words of the training set; this time we train on the entire training set of about 900,000 words.
Run with time_size = 35, batch_size = 20, wordvec_size = hidden_size = 100, max_epoch = 4.
import sys
sys.path.append('..')
from common.optimizer import SGD
from common.trainer import RnnlmTrainer
from common.util import eval_perplexity
from dataset import ptb
from rnnlm import Rnnlm
#Hyperparameter settings
batch_size = 20
wordvec_size = 100
hidden_size = 100 #Number of elements of hidden state vector of RNN
time_size = 35 #The size to deploy the RNN
lr = 20.0
max_epoch = 4
max_grad = 0.25
#Reading training data
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_test, _, _ = ptb.load_data('test')
vocab_size = len(word_to_id)
xs = corpus[:-1]
ts = corpus[1:]
#Model generation
model = Rnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)
#Learn by applying gradient clipping
trainer.fit(xs, ts, max_epoch, batch_size, time_size, max_grad,
            eval_interval=20)
trainer.plot(ylim=(0, 500))
#Evaluate with test data
model.reset_state()
ppl_test = eval_perplexity(model, corpus_test)
print('test perplexity: ', ppl_test)
#Save parameters
model.save_params()
It took 33 minutes to finish on my MacBook Air. The training perplexity after 4 epochs was 111.47, and the perplexity on the test data was 136.3. A perplexity of 136 means that when predicting the next word, the model is effectively choosing among about 136 candidates. The conditions are completely different from Chapter 5, so the numbers can't be compared directly, but it is larger than I expected.
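As a reminder (my own note, not part of the code above): perplexity is the exponential of the average cross-entropy loss over the evaluated words, which is why it can be read as an effective number of next-word candidates:

$$
\text{perplexity} = \exp\!\left(-\frac{1}{N}\sum_{n=1}^{N} \log P(w_n \mid w_1, \dots, w_{n-1})\right)
$$

So a test perplexity of 136.3 corresponds to an average cross-entropy of about $\log 136.3 \approx 4.9$ nats per word.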
In the simple RNN model of Chapter 5, exploding gradients are likely to occur, so this LSTM model suppresses them with gradient clipping, applied inside the trainer class: when the L2 norm of all the gradients $\|\hat{g}\|$ exceeds the threshold (max_grad), the gradients are rescaled as $\hat{g} \leftarrow \frac{\text{threshold}}{\|\hat{g}\|}\,\hat{g}$.
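A minimal sketch of that clipping rule (the repository provides this as clip_grads in common/util.py; the version below is my own illustration and assumes a list of NumPy gradient arrays):

```python
import numpy as np

def clip_grads(grads, max_norm):
    # L2 norm over all gradient arrays combined
    total_norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    # Rescale in place only when the norm exceeds the threshold
    rate = max_norm / (total_norm + 1e-6)
    if rate < 1:
        for g in grads:
            g *= rate
```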
import sys
sys.path.append('..')
from common.time_layers import *
from common.base_model import BaseModel
class Rnnlm(BaseModel):
    def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        #Weight initialization
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        #Layer generation
        self.layers = [
            TimeEmbedding(embed_W),
            TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
            TimeAffine(affine_W, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layer = self.layers[1]

        #List all weights and gradients
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs):
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts):
        score = self.predict(xs)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        self.lstm_layer.reset_state()
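As a quick shape check of the Rnnlm class above (my own sketch, assuming it is run inside the ch06 directory of the repository), a mini-batch of word IDs of shape (N, T) produces scores of shape (N, T, V):

```python
import sys
sys.path.append('..')
import numpy as np
from rnnlm import Rnnlm

model = Rnnlm(vocab_size=10000, wordvec_size=100, hidden_size=100)
xs = np.random.randint(0, 10000, (20, 35))   # (N, T) word IDs
ts = np.random.randint(0, 10000, (20, 35))   # next-word targets
score = model.predict(xs)                    # (20, 35, 10000) scores
loss = model.forward(xs, ts)                 # scalar cross-entropy loss
```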
The layer structure of **Rnnlm** simply replaces the Time RNN layer of the Chapter 5 model with a Time LSTM layer. Before looking at the Time LSTM, let's first look at the LSTM layer it uses internally.
class LSTM:
    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.cache = None

    def forward(self, x, h_prev, c_prev):
        Wx, Wh, b = self.params
        N, H = h_prev.shape

        A = np.dot(x, Wx) + np.dot(h_prev, Wh) + b

        f = A[:, :H]
        g = A[:, H:2*H]
        i = A[:, 2*H:3*H]
        o = A[:, 3*H:]

        f = sigmoid(f)
        g = np.tanh(g)
        i = sigmoid(i)
        o = sigmoid(o)

        c_next = f * c_prev + g * i
        h_next = o * np.tanh(c_next)

        self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
        return h_next, c_next
This is the forward propagation part of the **LSTM layer**. Since the formulas for the memory cell candidate and the three gates all have the same form, the weights $W_x$, $W_h$, and $b$ for the four of them are concatenated so that $A$ can be computed with a single matrix product; $A$ is then sliced and passed through the activation functions, which is efficient.
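Written out, these are the standard LSTM equations, with the slicing order f, g, i, o matching the code above:

$$
\begin{aligned}
A &= x W_x + h_{prev} W_h + b \qquad (W_x \in \mathbb{R}^{D \times 4H},\; W_h \in \mathbb{R}^{H \times 4H})\\
f &= \sigma(A[:, :H]),\quad g = \tanh(A[:, H\!:\!2H]),\quad i = \sigma(A[:, 2H\!:\!3H]),\quad o = \sigma(A[:, 3H\!:])\\
c_{next} &= f \odot c_{prev} + g \odot i\\
h_{next} &= o \odot \tanh(c_{next})
\end{aligned}
$$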
    def backward(self, dh_next, dc_next):
        Wx, Wh, b = self.params
        x, h_prev, c_prev, i, f, g, o, c_next = self.cache

        tanh_c_next = np.tanh(c_next)

        ds = dc_next + (dh_next * o) * (1 - tanh_c_next ** 2)
        dc_prev = ds * f

        di = ds * g
        df = ds * c_prev
        do = dh_next * tanh_c_next
        dg = ds * i

        di *= i * (1 - i)
        df *= f * (1 - f)
        do *= o * (1 - o)
        dg *= (1 - g ** 2)

        dA = np.hstack((df, dg, di, do))

        dWh = np.dot(h_prev.T, dA)
        dWx = np.dot(x.T, dA)
        db = dA.sum(axis=0)

        self.grads[0][...] = dWx
        self.grads[1][...] = dWh
        self.grads[2][...] = db

        dx = np.dot(dA, Wx.T)
        dh_prev = np.dot(dA, Wh.T)

        return dx, dh_prev, dc_prev
For back propagation, dA is obtained by computing df, dg, di, and do individually and concatenating them with hstack. After that, since what remains is the back propagation of a MatMul node, dWx, dWh, db, dx, and dh_prev follow directly.
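The key intermediate quantity is the gradient flowing into the memory cell, which the code calls ds; the rest follows from the forward formulas above:

$$
\begin{aligned}
ds &= dc_{next} + dh_{next} \odot o \odot \bigl(1 - \tanh^2(c_{next})\bigr)\\
dc_{prev} &= ds \odot f\\
df &= ds \odot c_{prev} \odot f \odot (1 - f), \qquad dg = ds \odot i \odot (1 - g^2)\\
di &= ds \odot g \odot i \odot (1 - i), \qquad do = dh_{next} \odot \tanh(c_{next}) \odot o \odot (1 - o)
\end{aligned}
$$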
5.TimeLSTM
class TimeLSTM:
    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.layers = None
        self.h, self.c = None, None
        self.dh = None
        self.stateful = stateful

    def forward(self, xs):
        Wx, Wh, b = self.params
        N, T, D = xs.shape
        H = Wh.shape[0]

        self.layers = []
        hs = np.empty((N, T, H), dtype='f')

        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')
        if not self.stateful or self.c is None:
            self.c = np.zeros((N, H), dtype='f')

        for t in range(T):
            layer = LSTM(*self.params)
            self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
            hs[:, t, :] = self.h

            self.layers.append(layer)

        return hs
It's basically the same as the TimeRNN layer. The **Time LSTM layer** is a network that connects T LSTM layers; the stateful argument controls whether the hidden state h is carried over between blocks, and there is also a cell c that stores the memory.
For forward propagation, first prepare a container hs of shape (N, T, H) for the output. Then, in the for loop, the data at time t is cut out with xs[:, t, :] and fed into an ordinary LSTM layer, its output is stored at the corresponding position hs[:, t, :] of the container, and the layer is appended to self.layers. A small shape check follows below.
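A minimal sketch (my own, assuming common/time_layers.py from the repository is importable) showing the shapes and the effect of stateful=True, which carries h and c over to the next call:

```python
import sys
sys.path.append('..')
import numpy as np
from common.time_layers import TimeLSTM

N, T, D, H = 2, 5, 8, 16
Wx = (np.random.randn(D, 4 * H) / np.sqrt(D)).astype('f')
Wh = (np.random.randn(H, 4 * H) / np.sqrt(H)).astype('f')
b = np.zeros(4 * H).astype('f')

layer = TimeLSTM(Wx, Wh, b, stateful=True)
xs = np.random.randn(N, T, D).astype('f')
hs = layer.forward(xs)    # (2, 5, 16): one hidden state per time step
hs2 = layer.forward(xs)   # h and c from the previous call are reused
layer.reset_state()       # drop the carried-over state
```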
    def backward(self, dhs):
        Wx, Wh, b = self.params
        N, T, H = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        dh, dc = 0, 0

        grads = [0, 0, 0]
        for t in reversed(range(T)):
            layer = self.layers[t]
            dx, dh, dc = layer.backward(dhs[:, t, :] + dh, dc)
            dxs[:, t, :] = dx
            for i, grad in enumerate(layer.grads):
                grads[i] += grad

        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        self.dh = dh
        return dxs

    def set_state(self, h, c=None):
        self.h, self.c = h, c

    def reset_state(self):
        self.h, self.c = None, None
In the forward pass of the Time LSTM, the output at each time step branches (upward to the next layer and forward to the next time step), so in back propagation the gradient from the layer above, $dh_t$, and the gradient from the future time step, $dh_{next}$, are summed before being fed in, together with the cell gradient $dc_{next}$.
First, create a container dxs for the gradients that flow downstream, call backward() on each LSTM layer in the reverse order of forward propagation, and store the resulting dx at the corresponding index of dxs. For the weights, the gradients from every time step are added together and the final sums overwrite self.grads.
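Since the same $W_x$, $W_h$, and $b$ are reused by all T LSTM layers, the total gradient is the sum of the per-time-step gradients, which is exactly what the grads[i] += grad loop computes:

$$
\frac{\partial L}{\partial W_x} = \sum_{t=0}^{T-1} \frac{\partial L}{\partial W_x^{(t)}}, \qquad
\frac{\partial L}{\partial W_h} = \sum_{t=0}^{T-1} \frac{\partial L}{\partial W_h^{(t)}}, \qquad
\frac{\partial L}{\partial b} = \sum_{t=0}^{T-1} \frac{\partial L}{\partial b^{(t)}}
$$

where $W_x^{(t)}$ denotes the shared weight as it is used at time step $t$.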
Now, let's run the improved model train_better_rnnlm.py. As expected, this is tough on CPU alone, so I installed CuPy on a Windows machine.
Run with time_size = 35, batch_size = 20, **wordvec_size = hidden_size = 650, max_epoch = 40**.
import sys
sys.path.append('..')
from common import config
#When executing on GPU, delete the comment out below (cupy required)
# ==============================================
config.GPU = True
# ==============================================
from common.optimizer import SGD
from common.trainer import RnnlmTrainer
from common.util import eval_perplexity, to_gpu
from dataset import ptb
from better_rnnlm import BetterRnnlm
#Hyperparameter settings
batch_size = 20
wordvec_size = 650
hidden_size = 650
time_size = 35
lr = 20.0
max_epoch = 40
max_grad = 0.25
dropout = 0.5
#Reading training data
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_val, _, _ = ptb.load_data('val')
corpus_test, _, _ = ptb.load_data('test')
if config.GPU:
    corpus = to_gpu(corpus)
    corpus_val = to_gpu(corpus_val)
    corpus_test = to_gpu(corpus_test)
vocab_size = len(word_to_id)
xs = corpus[:-1]
ts = corpus[1:]
model = BetterRnnlm(vocab_size, wordvec_size, hidden_size, dropout)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)
best_ppl = float('inf')
for epoch in range(max_epoch):
    trainer.fit(xs, ts, max_epoch=1, batch_size=batch_size,
                time_size=time_size, max_grad=max_grad)

    model.reset_state()
    ppl = eval_perplexity(model, corpus_val)
    print('valid perplexity: ', ppl)

    if best_ppl > ppl:
        best_ppl = ppl
        model.save_params()
    else:
        lr /= 4.0
        optimizer.lr = lr

    model.reset_state()
    print('-' * 50)
#Evaluation with test data
model.reset_state()
ppl_test = eval_perplexity(model, corpus_test)
print('test perplexity: ', ppl_test)
It completed in less than 3 hours on a Windows machine (GTX 1060). After 40 epochs, the evaluation on the test data improved to a perplexity of 76-79.
In this code, the perplexity on the validation data is calculated after every epoch, and the learning rate lr is lowered (divided by 4) only when that value gets worse. Let's take a look at BetterRnnlm.
7.BetterRnnlm
import sys
sys.path.append('..')
from common.time_layers import *
from common.np import * # import numpy as np
from common.base_model import BaseModel
class BetterRnnlm(BaseModel):
    def __init__(self, vocab_size=10000, wordvec_size=650,
                 hidden_size=650, dropout_ratio=0.5):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh1 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b1 = np.zeros(4 * H).astype('f')
        lstm_Wx2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_Wh2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b2 = np.zeros(4 * H).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.layers = [
            TimeEmbedding(embed_W),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
            TimeDropout(dropout_ratio),
            TimeAffine(embed_W.T, affine_b)  # weight tying!!
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layers = [self.layers[2], self.layers[4]]
        self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs, train_flg=False):
        for layer in self.drop_layers:
            layer.train_flg = train_flg

        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts, train_flg=True):
        score = self.predict(xs, train_flg)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        for layer in self.lstm_layers:
            layer.reset_state()
This is the layer structure of BetterRnnlm. It has three features: stacking two LSTM layers, using Dropout (the code inserts three TimeDropout layers), and sharing weights between the Time Embedding and Time Affine layers.
While the stacked LSTMs increase expressive power, TimeDropout layers (internally essentially the same as Dropout) are inserted to suppress overfitting. Since the weights of the Time Embedding layer have shape (V, D) and the weights of the Affine layer have shape (H, V), with D = H here, the transpose of the Time Embedding weights can be reused as the Affine weights. This reduces the number of learnable parameters, suppresses overfitting, and makes learning easier.
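A quick count of what weight tying saves with the settings used above ($V = 10000$, $D = H = 650$):

$$
V \times D = 10000 \times 650 = 6{,}500{,}000
$$

So tying the Embedding and Affine weights removes about 6.5 million parameters compared with keeping a separate affine_W of shape (H, V).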