I'm reading the excellent book **"Deep Learning from Zero 2"**. **GRU** is introduced in the appendix of this book, but there is no part that actually runs it. This time, as in Chapter 6, I use the **BetterRnnlm class** to have a **GRU** learn the word ordering of the **PTB dataset**, and measure how well it learns with perplexity.
The GRU in common/time_layers.py cannot be used as it is in the BetterRnnlm class because it is implemented with comprehensibility as the priority. Also, the bias term is not taken into account.
This is the computation graph of the GRU. Unlike the LSTM there is no memory cell, and only the hidden state h propagates in the time direction. There are two gates, the **reset gate** and the **update gate**.
The **reset gate** determines how much of the past hidden state to ignore. If r is zero, $\hat{h}$ is determined from the input alone, and the past hidden state is ignored.
The **update gate** plays the roles of both the LSTM's forget gate and input gate. The part that acts as the forget gate is $(1-z) \odot h_{t-1}$; this computation erases information that should be forgotten from the past hidden state.
The part that acts as the input gate is $z \odot \hat{h}$; this computation weights the newly added information.
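Putting the two gates and the candidate hidden state together, the forward computation implemented below can be written as follows ($\sigma$ is the sigmoid function and $\odot$ is the elementwise product):

$$z = \sigma(x W_{xz} + h_{t-1} W_{hz} + b_z)$$
$$r = \sigma(x W_{xr} + h_{t-1} W_{hr} + b_r)$$
$$\hat{h} = \tanh(x W_{xh} + (r \odot h_{t-1}) W_{hh} + b_h)$$
$$h_t = (1 - z) \odot h_{t-1} + z \odot \hat{h}$$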
Now, let's sort out the weights and biases before implementing.
Wxz, Wxr, and Wxh are combined into **Wx** (D × 3H), Whz, Whr, and Whh into **Wh** (H × 3H), and bz, br, and bh into **b** (3H).
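As a quick sanity check of this packing, here is a throwaway snippet (not part of time_layers_gru.py; the small sizes D and H are arbitrary):

```python
import numpy as np

D, H = 3, 4  # arbitrary small sizes
Wx = np.hstack((np.random.randn(D, H),   # Wxz
                np.random.randn(D, H),   # Wxr
                np.random.randn(D, H)))  # Wxh
print(Wx.shape)          # (3, 12), i.e. (D, 3H)
print(Wx[:, :H].shape)   # (3, 4) -- slicing the first H columns recovers Wxz
```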
from common.np import * # import numpy as np (or import cupy as np)
from common.layers import *
from common.functions import softmax, sigmoid
class GRU:
    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.cache = None

    def forward(self, x, h_prev):
        Wx, Wh, b = self.params
        H = Wh.shape[0]
        Wxz, Wxr, Wxh = Wx[:, :H], Wx[:, H:2 * H], Wx[:, 2 * H:]
        Whz, Whr, Whh = Wh[:, :H], Wh[:, H:2 * H], Wh[:, 2 * H:]
        bz, br, bh = b[:H], b[H:2 * H], b[2 * H:]

        z = sigmoid(np.dot(x, Wxz) + np.dot(h_prev, Whz) + bz)
        r = sigmoid(np.dot(x, Wxr) + np.dot(h_prev, Whr) + br)
        h_hat = np.tanh(np.dot(x, Wxh) + np.dot(r * h_prev, Whh) + bh)
        h_next = (1 - z) * h_prev + z * h_hat

        self.cache = (x, h_prev, z, r, h_hat)
        return h_next
The parameters are held in **self.params** and the gradients in **self.grads** so that they can be handled as-is by the BetterRnnlm class. The individual values can be sliced out in widths of H.
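To see the forward pass in action, a minimal sketch (it assumes the GRU class above is in scope and that numpy is used, i.e. config.GPU is off; the sizes N, D, H are arbitrary):

```python
import numpy as np

N, D, H = 2, 3, 4  # arbitrary batch size, input dim, hidden dim
Wx = np.random.randn(D, 3 * H).astype('f')
Wh = np.random.randn(H, 3 * H).astype('f')
b = np.zeros(3 * H).astype('f')

gru = GRU(Wx, Wh, b)
x = np.random.randn(N, D).astype('f')
h_prev = np.zeros((N, H), dtype='f')
h_next = gru.forward(x, h_prev)
print(h_next.shape)  # (2, 4) -- one hidden state vector per sample
```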
Now for the slightly more complicated backpropagation. First comes the part that decomposes **self.params** to get each weight, and restores the other values from **cache**.
    def backward(self, dh_next):
        Wx, Wh, b = self.params
        H = Wh.shape[0]
        Wxz, Wxr, Wxh = Wx[:, :H], Wx[:, H:2 * H], Wx[:, 2 * H:]
        Whz, Whr, Whh = Wh[:, :H], Wh[:, H:2 * H], Wh[:, 2 * H:]
        x, h_prev, z, r, h_hat = self.cache
From here, the backpropagation is implemented in four parts. First, the part that involves neither tanh nor the two sigmoids.

        dh_hat = dh_next * z
        dh_prev = dh_next * (1 - z)
It's a simple combination of + and ×. Next is around tanh.

        # tanh
        dt = dh_hat * (1 - h_hat ** 2)
        dbh = np.sum(dt, axis=0)
        dWhh = np.dot((r * h_prev).T, dt)
        dhr = np.dot(dt, Whh.T)
        dWxh = np.dot(x.T, dt)
        dx = np.dot(dt, Wxh.T)
        dh_prev += r * dhr
Since dh_prev was already computed above, from here on we accumulate into it with dh_prev +=. Next is around z, the update gate.

        # update gate (z)
        dz = dh_next * h_hat - dh_next * h_prev
        dt = dz * z * (1 - z)
        dbz = np.sum(dt, axis=0)
        dWhz = np.dot(h_prev.T, dt)
        dh_prev += np.dot(dt, Whz.T)
        dWxz = np.dot(x.T, dt)
        dx += np.dot(dt, Wxz.T)
Since dx was already computed in the tanh part, from here on we accumulate into it with dx +=. Next is around r, the reset gate.

        # reset gate (r)
        dr = dhr * h_prev
        dt = dr * r * (1 - r)
        dbr = np.sum(dt, axis=0)
        dWhr = np.dot(h_prev.T, dt)
        dh_prev += np.dot(dt, Whr.T)
        dWxr = np.dot(x.T, dt)
        dx += np.dot(dt, Wxr.T)
Now that each gradient has been calculated, we gather them into grads.
        self.dWx = np.hstack((dWxz, dWxr, dWxh))
        self.dWh = np.hstack((dWhz, dWhr, dWhh))
        self.db = np.hstack((dbz, dbr, dbh))
        self.grads[0][...] = self.dWx
        self.grads[1][...] = self.dWh
        self.grads[2][...] = self.db

        return dx, dh_prev
At this point, the GRU implementation is complete.
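As a rough sanity check of the backward pass, a minimal numerical-gradient sketch (again assuming the GRU class above is in scope and numpy is used; it only checks a single element of Wx):

```python
import numpy as np

np.random.seed(0)
N, D, H = 2, 3, 4
Wx = np.random.randn(D, 3 * H)
Wh = np.random.randn(H, 3 * H)
b = np.zeros(3 * H)
x = np.random.randn(N, D)
h_prev = np.random.randn(N, H)

gru = GRU(Wx, Wh, b)
h_next = gru.forward(x, h_prev)
gru.backward(np.ones_like(h_next))   # upstream gradient of all ones
analytic = gru.grads[0][0, 0]        # d(sum of h_next) / dWx[0, 0]

eps = 1e-5
Wx[0, 0] += eps
h_plus = GRU(Wx, Wh, b).forward(x, h_prev)
Wx[0, 0] -= 2 * eps
h_minus = GRU(Wx, Wh, b).forward(x, h_prev)
Wx[0, 0] += eps
numeric = (h_plus.sum() - h_minus.sum()) / (2 * eps)
print(analytic, numeric)             # the two values should roughly agree
```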

For the forward propagation of **TimeGRU**, the 3D data **xs** is sliced one time step at a time and fed into a **GRU**, and the outputs from the **GRU** are again gathered into the 3D data **hs**.
class TimeGRU:
    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.layers = None
        self.h, self.dh = None, None
        self.stateful = stateful

    def forward(self, xs):
        Wx, Wh, b = self.params
        H = Wh.shape[0]
        N, T, D = xs.shape
        self.layers = []
        hs = np.empty((N, T, H), dtype='f')

        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')

        for t in range(T):
            layer = GRU(*self.params)
            self.h = layer.forward(xs[:, t, :], self.h)
            hs[:, t, :] = self.h
            self.layers.append(layer)
        return hs
Prepare a box **hs** (N, T, H) to store the outputs. Also prepare a zero matrix **self.h** (N, H) if necessary. Then slice one time step out of the data **xs**, feed it into a **GRU**, and store the output **self.h** from the GRU in **hs**. At the same time, append the layer for each of the T time steps (these are used in backward).
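A minimal forward-pass sketch for TimeGRU (arbitrary sizes, numpy assumed):

```python
import numpy as np

N, T, D, H = 2, 5, 3, 4  # arbitrary batch, time steps, input dim, hidden dim
Wx = np.random.randn(D, 3 * H).astype('f')
Wh = np.random.randn(H, 3 * H).astype('f')
b = np.zeros(3 * H).astype('f')

time_gru = TimeGRU(Wx, Wh, b, stateful=True)
xs = np.random.randn(N, T, D).astype('f')
hs = time_gru.forward(xs)
print(hs.shape)          # (2, 5, 4) -- a hidden state for every sample and time step
print(time_gru.h.shape)  # (2, 4)    -- kept for the next call because stateful=True
```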

Now for the backpropagation of TimeGRU. During backpropagation, $dh_t + dh_{next}$ is fed into each GRU layer.
    def backward(self, dhs):
        Wx, Wh, b = self.params
        N, T, H = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        dh = 0
        grads = [0, 0, 0]
        for t in reversed(range(T)):
            layer = self.layers[t]
            dx, dh = layer.backward(dhs[:, t, :] + dh)
            dxs[:, t, :] = dx
            for i, grad in enumerate(layer.grads):
                grads[i] += grad

        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        self.dh = dh
        return dxs

    def set_state(self, h):
        self.h = h

    def reset_state(self):
        self.h = None
Prepare a box **dxs** (N, T, D) to store the backpropagation outputs. Also prepare a list **grads** to temporarily accumulate the gradients.
The slice of **dhs** at time t, plus the gradient **dh** coming from one step in the future, is given as the input; the GRU layers appended during forward are called in reverse order and backward is applied to each. The backward result **dx** is then stored in **dxs**.
In the expression dx, dh = layer.backward(dhs[:, t, :] + dh), dh on the right-hand side is the so-called $dh_{next}$, and dh on the left-hand side is the so-called $dh_{prev}$.
Then the weight gradients from each layer are added up, and the final result is stored in **self.grads**.
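Continuing the forward sketch above, a quick check of the backward pass:

```python
dhs = np.ones_like(hs)          # dummy upstream gradient
dxs = time_gru.backward(dhs)
print(dxs.shape)                # (2, 5, 3) -- gradient with respect to xs
print(time_gru.grads[0].shape)  # (3, 12)   -- dWx, accumulated over all T steps
print(time_gru.dh.shape)        # (2, 4)    -- gradient flowing out to the past
```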
Now that the implementation of GRU and TimeGRU is complete, create a folder called ch09 and save it with the file name time_layers_gru.py.
Next, modify better_rnnlm.py to generate the network model.
import sys
sys.path.append('..')
from common.time_layers import TimeEmbedding, TimeAffine, TimeSoftmaxWithLoss, TimeDropout  # only the layers we need
from time_layers_gru import *  # GRU and TimeGRU come from the file saved above
from common.np import *  # import numpy as np
from common.base_model import BaseModel


class BetterRnnlm(BaseModel):
    def __init__(self, vocab_size=10000, wordvec_size=650,
                 hidden_size=650, dropout_ratio=0.5):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        gru_Wx1 = (rn(D, 3 * H) / np.sqrt(D)).astype('f')
        gru_Wh1 = (rn(H, 3 * H) / np.sqrt(H)).astype('f')
        gru_b1 = np.zeros(3 * H).astype('f')
        gru_Wx2 = (rn(H, 3 * H) / np.sqrt(H)).astype('f')
        gru_Wh2 = (rn(H, 3 * H) / np.sqrt(H)).astype('f')
        gru_b2 = np.zeros(3 * H).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.layers = [
            TimeEmbedding(embed_W),
            TimeDropout(dropout_ratio),
            TimeGRU(gru_Wx1, gru_Wh1, gru_b1, stateful=True),
            TimeDropout(dropout_ratio),
            TimeGRU(gru_Wx2, gru_Wh2, gru_b2, stateful=True),
            TimeDropout(dropout_ratio),
            TimeAffine(embed_W.T, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.gru_layers = [self.layers[2], self.layers[4]]
        self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs, train_flg=False):
        for layer in self.drop_layers:
            layer.train_flg = train_flg
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts, train_flg=True):
        score = self.predict(xs, train_flg)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        for layer in self.gru_layers:
            layer.reset_state()
At the beginning, only the required layers are imported from common/time_layers.py, and GRU is changed so that it is imported from the time_layers_gru.py saved to the current directory earlier.
After that, change the LSTM parts of the code to GRU. Since the number of packed weight matrices goes from 4 to 3, don't forget to fix every place that depends on that count, for example the (D, 3 * H) in gru_Wx1 = (rn(D, 3 * H) / np.sqrt(D)).astype('f').
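For comparison, a sketch of the change in the weight initialization (the LSTM line is my recollection of how Chapter 6's better_rnnlm.py writes it, so treat it as illustrative):

```python
import numpy as np

D, H = 650, 650  # the sizes used in this article
rn = np.random.randn

# Chapter 6 (LSTM): four weight matrices packed side by side
lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
# this article (GRU): only three
gru_Wx1 = (rn(D, 3 * H) / np.sqrt(D)).astype('f')

print(lstm_Wx1.shape, gru_Wx1.shape)  # (650, 2600) (650, 1950)
```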
Save this code in the ch09 folder as better_rnnlm_gru.py.
Based on the training code in Chapter 6, change the from better_rnnlm import BetterRnnlm at the beginning to from better_rnnlm_gru import BetterRnnlm, and save it in the ch09 folder under the file name train_better_rnnlm.py.
When I ran it with the hyperparameter lr = 20, perplexity fluctuated a lot in the early stages, so I changed it to lr = 10 and ran it again.
import sys
sys.path.append('..')
from common import config
# When executing on a GPU, enable the line below (cupy required); comment it out for CPU
# ==============================================
config.GPU = True
# ==============================================
from common.optimizer import SGD
from common.trainer import RnnlmTrainer
from common.util import eval_perplexity, to_gpu
from dataset import ptb
from better_rnnlm_gru import BetterRnnlm  # changed
# hyperparameter settings
batch_size = 20
wordvec_size = 650
hidden_size = 650
time_size = 35
lr = 10
max_epoch = 40
max_grad = 0.25
dropout = 0.5

# load the training data
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_val, _, _ = ptb.load_data('val')
corpus_test, _, _ = ptb.load_data('test')

if config.GPU:
    corpus = to_gpu(corpus)
    corpus_val = to_gpu(corpus_val)
    corpus_test = to_gpu(corpus_test)

vocab_size = len(word_to_id)
xs = corpus[:-1]
ts = corpus[1:]

model = BetterRnnlm(vocab_size, wordvec_size, hidden_size, dropout)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)
best_ppl = float('inf')
for epoch in range(max_epoch):
    trainer.fit(xs, ts, max_epoch=1, batch_size=batch_size,
                time_size=time_size, max_grad=max_grad)

    model.reset_state()
    ppl = eval_perplexity(model, corpus_val)
    print('valid perplexity: ', ppl)

    if best_ppl > ppl:
        best_ppl = ppl
        model.save_params()
    else:
        lr /= 4.0
        optimizer.lr = lr

    model.reset_state()
    print('-' * 50)

# evaluation on the test data
model.reset_state()
ppl_test = eval_perplexity(model, corpus_test)
print('test perplexity: ', ppl_test)
The test perplexity of the LSTM model in Chapter 6 was in the high 70s, while the GRU model seems to stay in the low 80s. For a long corpus of over 900,000 words such as this dataset, the LSTM model with its memory cell seems to have the advantage.