I'm reading the masterpiece **"Deep Learning from Scratch 2"**. **GRU** is introduced in the appendix of this book, but it is never actually run. This time, as in Chapter 6, I use the **BetterRnnlm class** to have a **GRU** learn the word ordering of the **PTB dataset**, and measure how well it learns with perplexity.
The GRU and TimeGRU classes can be found in common/time_layers.py, but they cannot be used as they are in the BetterRnnlm class, because they are implemented with priority on comprehensibility. Also, the bias term is not taken into account.
Below is the computation graph of the GRU. Unlike the LSTM there is no memory cell, and only the hidden state h propagates in the time direction. There are two gates, the **reset gate** and the **update gate**.
The **reset gate** determines how much of the past hidden state is ignored. If $r$ is zero, $\hat{h}$ is determined from the input alone, ignoring the past hidden state.
The **update gate** doubles as the LSTM's forget gate and input gate. The part that acts as the forget gate is $(1-z) \odot h_{t-1}$; this calculation erases information that should be forgotten from the past hidden state.
The part that acts as the input gate is $z \odot \hat{h}$; this calculation weights the newly added information.
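Putting the gates together, the forward computation that the implementation below follows is (with $\sigma$ the sigmoid function):

$$
\begin{aligned}
z &= \sigma(x_t W_{xz} + h_{t-1} W_{hz} + b_z) \\
r &= \sigma(x_t W_{xr} + h_{t-1} W_{hr} + b_r) \\
\hat{h} &= \tanh\left(x_t W_{xh} + (r \odot h_{t-1}) W_{hh} + b_h\right) \\
h_t &= (1 - z) \odot h_{t-1} + z \odot \hat{h}
\end{aligned}
$$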
Now, let's sort out the weights and biases before implementing.
Wxz, Wxr, and Wxh are packed together into **Wx** (D × 3H); Whz, Whr, and Whh into **Wh** (H × 3H); and bz, br, and bh into **b** (3H).
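As a minimal sketch with made-up sizes (not from the book), the packed parameters and the slicing back into individual weights look like this:

```python
import numpy as np

# Toy sizes, just for illustration: D = input dim, H = hidden dim
D, H = 4, 3
Wx = np.random.randn(D, 3 * H)   # columns packed as [Wxz | Wxr | Wxh]
Wh = np.random.randn(H, 3 * H)   # columns packed as [Whz | Whr | Whh]
b = np.zeros(3 * H)              # packed as [bz | br | bh]

# The individual weights are recovered by slicing in widths of H
Wxz, Wxr, Wxh = Wx[:, :H], Wx[:, H:2 * H], Wx[:, 2 * H:]
print(Wx.shape, Wxz.shape)       # (4, 9) (4, 3)
```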
from common.np import *  # import numpy as np (or import cupy as np)
from common.layers import *
from common.functions import softmax, sigmoid


class GRU:
    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.cache = None

    def forward(self, x, h_prev):
        Wx, Wh, b = self.params
        H = Wh.shape[0]
        Wxz, Wxr, Wxh = Wx[:, :H], Wx[:, H:2 * H], Wx[:, 2 * H:]
        Whz, Whr, Whh = Wh[:, :H], Wh[:, H:2 * H], Wh[:, 2 * H:]
        bz, br, bh = b[:H], b[H:2 * H], b[2 * H:]

        z = sigmoid(np.dot(x, Wxz) + np.dot(h_prev, Whz) + bz)
        r = sigmoid(np.dot(x, Wxr) + np.dot(h_prev, Whr) + br)
        h_hat = np.tanh(np.dot(x, Wxh) + np.dot(r * h_prev, Whh) + bh)
        h_next = (1 - z) * h_prev + z * h_hat

        self.cache = (x, h_prev, z, r, h_hat)
        return h_next
The parameters are held in **self.params** and the gradients in **self.grads** so that they can be used as-is by the BetterRnnlm class. The individual weights and biases are recovered by slicing in widths of H.
Now for the slightly more complicated backpropagation. First comes the part that unpacks **self.params** to get each weight and restores the remaining values from **self.cache**.
    def backward(self, dh_next):
        Wx, Wh, b = self.params
        H = Wh.shape[0]
        Wxz, Wxr, Wxh = Wx[:, :H], Wx[:, H:2 * H], Wx[:, 2 * H:]
        Whz, Whr, Whh = Wh[:, :H], Wh[:, H:2 * H], Wh[:, 2 * H:]
        x, h_prev, z, r, h_hat = self.cache
From here, the backpropagation is implemented in four parts. First, the part that does not involve tanh or the two sigmoids.
        dh_hat = dh_next * z
        dh_prev = dh_next * (1 - z)
This is just a simple combination of + and × nodes. Next comes the part around tanh.
        # tanh
        dt = dh_hat * (1 - h_hat ** 2)
        dbh = np.sum(dt, axis=0)
        dWhh = np.dot((r * h_prev).T, dt)
        dhr = np.dot(dt, Whh.T)
        dWxh = np.dot(x.T, dt)
        dx = np.dot(dt, Wxh.T)
        dh_prev += r * dhr
Since dh_prev was already computed above, from here on new contributions are accumulated with dh_prev +=. Next is the area around z, the update gate.
        # update gate (z)
        dz = dh_next * h_hat - dh_next * h_prev
        dt = dz * z * (1 - z)
        dbz = np.sum(dt, axis=0)
        dWhz = np.dot(h_prev.T, dt)
        dh_prev += np.dot(dt, Whz.T)
        dWxz = np.dot(x.T, dt)
        dx += np.dot(dt, Wxz.T)
Since dx was already computed in the tanh part, from here on new contributions are accumulated with dx +=. Next is the area around r, the reset gate.
        # reset gate (r)
        dr = dhr * h_prev
        dt = dr * r * (1 - r)
        dbr = np.sum(dt, axis=0)
        dWhr = np.dot(h_prev.T, dt)
        dh_prev += np.dot(dt, Whr.T)
        dWxr = np.dot(x.T, dt)
        dx += np.dot(dt, Wxr.T)
Now that each gradient has been calculated, they are collected into grads.
        self.dWx = np.hstack((dWxz, dWxr, dWxh))
        self.dWh = np.hstack((dWhz, dWhr, dWhh))
        self.db = np.hstack((dbz, dbr, dbh))
        self.grads[0][...] = self.dWx
        self.grads[1][...] = self.dWh
        self.grads[2][...] = self.db
        return dx, dh_prev
At this point, the GRU implementation is complete.
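Before moving on, here is a quick sanity check, a minimal sketch with toy sizes (not from the book), assuming the GRU class above has been saved to time_layers_gru.py as described later:

```python
import numpy as np
from time_layers_gru import GRU  # assumes the class above has been saved to this file

# Toy sizes: batch N, input dim D, hidden dim H
N, D, H = 2, 5, 4
Wx = (np.random.randn(D, 3 * H) / np.sqrt(D)).astype('f')
Wh = (np.random.randn(H, 3 * H) / np.sqrt(H)).astype('f')
b = np.zeros(3 * H).astype('f')

gru = GRU(Wx, Wh, b)
h_next = gru.forward(np.random.randn(N, D).astype('f'),
                     np.zeros((N, H), dtype='f'))
dx, dh_prev = gru.backward(np.ones_like(h_next))
print(h_next.shape, dx.shape, dh_prev.shape)  # (2, 4) (2, 5) (2, 4)
```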
In the forward propagation of **TimeGRU**, one time step is cut out of the 3D data **xs** and fed to a **GRU** at each step, and the outputs from the **GRU** are collected back into the 3D array **hs**.
class TimeGRU:
    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.layers = None
        self.h, self.dh = None, None
        self.stateful = stateful

    def forward(self, xs):
        Wx, Wh, b = self.params
        H = Wh.shape[0]
        N, T, D = xs.shape
        self.layers = []
        hs = np.empty((N, T, H), dtype='f')

        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')

        for t in range(T):
            layer = GRU(*self.params)
            self.h = layer.forward(xs[:, t, :], self.h)
            hs[:, t, :] = self.h
            self.layers.append(layer)
        return hs
A container **hs** of shape (N, T, H) is prepared to store the output, and a zero matrix **self.h** of shape (N, H) is prepared when needed. Then one time step is cut out of the data **xs**, fed to a **GRU**, and the output **self.h** from the GRU is stored in **hs**. At the same time, the layer is appended for each of the T time steps (this is used in backward).
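For reference, a tiny sketch (toy numbers, not from the book) of the xs[:, t, :] slicing used above:

```python
import numpy as np

N, T, D = 2, 3, 4
xs = np.arange(N * T * D).reshape(N, T, D).astype('f')
x_t = xs[:, 1, :]   # the data for time step t = 1
print(x_t.shape)    # (2, 4)
```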
Now, the backpropagation of TimeGRU. During backpropagation, $dh_t + dh_{next}$ is fed into each GRU layer: the gradient from the layer above plus the gradient from one time step in the future.
    def backward(self, dhs):
        Wx, Wh, b = self.params
        N, T, H = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')

        dh = 0
        grads = [0, 0, 0]
        for t in reversed(range(T)):
            layer = self.layers[t]
            dx, dh = layer.backward(dhs[:, t, :] + dh)
            dxs[:, t, :] = dx

            for i, grad in enumerate(layer.grads):
                grads[i] += grad

        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        self.dh = dh
        return dxs

    def set_state(self, h):
        self.h = h

    def reset_state(self):
        self.h = None
A container **dxs** of shape (N, T, D) is prepared to store the backpropagation output, along with a list **grads** to temporarily accumulate the gradients.
The input to each layer is **one time step cut out of dhs plus the gradient dh from one time step in the future**; the GRU layers appended during forward are traversed in reverse order and backward is applied to each. The backward result **dx** is then stored in **dxs**.
In the expression dx, dh = layer.backward(dhs[:, t, :] + dh), the dh on the right-hand side plays the role of $dh_{next}$ and the dh on the left-hand side the role of $dh_{prev}$.
Then the weight gradients of each layer are summed, and the final result is stored in **self.grads**.
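As with the single GRU layer, a quick shape check (a minimal sketch with toy sizes, not from the book, assuming the TimeGRU class above) can confirm the interface:

```python
import numpy as np
from time_layers_gru import TimeGRU  # assumes the class above has been saved to this file

N, T, D, H = 2, 3, 5, 4
Wx = (np.random.randn(D, 3 * H) / np.sqrt(D)).astype('f')
Wh = (np.random.randn(H, 3 * H) / np.sqrt(H)).astype('f')
b = np.zeros(3 * H).astype('f')

time_gru = TimeGRU(Wx, Wh, b, stateful=True)
hs = time_gru.forward(np.random.randn(N, T, D).astype('f'))
dxs = time_gru.backward(np.ones_like(hs))
print(hs.shape, dxs.shape)  # (2, 3, 4) (2, 3, 5)
```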
Now that the implementation of GRU and TimeGRU is complete, create a folder called ch09 and save the code there with the file name time_layers_gru.py.
Next, modify better_rnnlm.py to build the network model.
import sys
sys.path.append('..')
from common.time_layers import TimeEmbedding, TimeAffine, TimeSoftmaxWithLoss, TimeDropout  # import only the layers we need
from time_layers_gru import *  # GRU and TimeGRU are imported from here
from common.np import *  # import numpy as np
from common.base_model import BaseModel


class BetterRnnlm(BaseModel):
    def __init__(self, vocab_size=10000, wordvec_size=650,
                 hidden_size=650, dropout_ratio=0.5):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        gru_Wx1 = (rn(D, 3 * H) / np.sqrt(D)).astype('f')
        gru_Wh1 = (rn(H, 3 * H) / np.sqrt(H)).astype('f')
        gru_b1 = np.zeros(3 * H).astype('f')
        gru_Wx2 = (rn(H, 3 * H) / np.sqrt(H)).astype('f')
        gru_Wh2 = (rn(H, 3 * H) / np.sqrt(H)).astype('f')
        gru_b2 = np.zeros(3 * H).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.layers = [
            TimeEmbedding(embed_W),
            TimeDropout(dropout_ratio),
            TimeGRU(gru_Wx1, gru_Wh1, gru_b1, stateful=True),
            TimeDropout(dropout_ratio),
            TimeGRU(gru_Wx2, gru_Wh2, gru_b2, stateful=True),
            TimeDropout(dropout_ratio),
            TimeAffine(embed_W.T, affine_b)  # weight tying with the embedding
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.gru_layers = [self.layers[2], self.layers[4]]
        self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs, train_flg=False):
        for layer in self.drop_layers:
            layer.train_flg = train_flg
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts, train_flg=True):
        score = self.predict(xs, train_flg)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        for layer in self.gru_layers:
            layer.reset_state()
At the beginning, only the required layers are imported from common/time_layers.py, and GRU is now imported from the time_layers_gru.py saved in the current directory earlier.
After that, change the LSTM parts of the code to GRU. The number of packed weight blocks is reduced from 4 to 3, so don't forget to fix the shapes accordingly, for example the (D, 3 * H) in gru_Wx1 = (rn(D, 3 * H) / np.sqrt(D)).astype('f').
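For comparison, a minimal sketch of that shape change (sizes follow the hyperparameters used here; the 4 * H line is how the Chapter 6 LSTM version initializes its weights):

```python
import numpy as np

D, H = 650, 650
rn = np.random.randn

lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype('f')  # Chapter 6 LSTM: 4 packed weight blocks
gru_Wx1 = (rn(D, 3 * H) / np.sqrt(D)).astype('f')   # GRU: 3 packed weight blocks (z, r, h)
print(lstm_Wx1.shape, gru_Wx1.shape)  # (650, 2600) (650, 1950)
```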
Save this code in the ch09 folder as better_rnnlm_gru.py.
Based on the training code in Chapter 6, change the from better_rnnlm import BetterRnnlm at the beginning to from better_rnnlm_gru import BetterRnnlm, and save it in the ch09 folder with the file name train_better_rnnlm.py.
When I ran it with the hyperparameter lr = 20, the perplexity fluctuated a lot in the early stages, so I changed it to lr = 10 and ran it again.
import sys
sys.path.append('..')
from common import config
# When running on GPU, uncomment the line below (cupy required)
# ==============================================
config.GPU = True
# ==============================================
from common.optimizer import SGD
from common.trainer import RnnlmTrainer
from common.util import eval_perplexity, to_gpu
from dataset import ptb
from better_rnnlm_gru import BetterRnnlm  # changed from better_rnnlm

# Hyperparameter settings
batch_size = 20
wordvec_size = 650
hidden_size = 650
time_size = 35
lr = 10
max_epoch = 40
max_grad = 0.25
dropout = 0.5

# Load the training data
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_val, _, _ = ptb.load_data('val')
corpus_test, _, _ = ptb.load_data('test')

if config.GPU:
    corpus = to_gpu(corpus)
    corpus_val = to_gpu(corpus_val)
    corpus_test = to_gpu(corpus_test)

vocab_size = len(word_to_id)
xs = corpus[:-1]
ts = corpus[1:]

model = BetterRnnlm(vocab_size, wordvec_size, hidden_size, dropout)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

best_ppl = float('inf')
for epoch in range(max_epoch):
    trainer.fit(xs, ts, max_epoch=1, batch_size=batch_size,
                time_size=time_size, max_grad=max_grad)

    model.reset_state()
    ppl = eval_perplexity(model, corpus_val)
    print('valid perplexity: ', ppl)

    if best_ppl > ppl:
        best_ppl = ppl
        model.save_params()
    else:
        lr /= 4.0
        optimizer.lr = lr

    model.reset_state()
    print('-' * 50)

# Evaluation on the test data
model.reset_state()
ppl_test = eval_perplexity(model, corpus_test)
print('test perplexity: ', ppl_test)
The test perplexity of the LSTM model in Chapter 6 was in the high 70s, while the GRU model stays in the low 80s. For a long corpus of over 900,000 words such as this dataset, the LSTM model with its memory cell appears to have the advantage.