I'm reading the masterpiece **"Deep Learning from Scratch 2"**. This post is my notes on Chapter 3. To run the code, download the complete code from GitHub and use Jupyter Notebook in the ch03 directory.
Let's run a simple word2vec CBOW model. Execute ch03/train.py.
import sys
sys.path.append('..')  # setting for importing files from the parent directory
from common.trainer import Trainer
from common.optimizer import Adam
from simple_cbow import SimpleCBOW
from common.util import preprocess, create_contexts_target, convert_one_hot

window_size = 1
hidden_size = 5
batch_size = 3
max_epoch = 1000

# Get the corpus and dictionaries
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

# Get contexts and target
contexts, target = create_contexts_target(corpus, window_size)

# One-hot representation
vocab_size = len(word_to_id)
contexts = convert_one_hot(contexts, vocab_size)
target = convert_one_hot(target, vocab_size)

# Build the network
model = SimpleCBOW(vocab_size, hidden_size)

# Train and plot the loss transition
optimizer = Adam()
trainer = Trainer(model, optimizer)
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# Display the word vectors
word_vecs = model.word_vecs
for word_id, word in id_to_word.items():
    print(word, word_vecs[word_id])
It's a word2vec with a vocabulary of only 7 words, but if the loss goes down smoothly and we obtain a 5-dimensional vector for each word, that's a success. Let's look at the code in order.
# Get the corpus and dictionaries
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
preprocess() is defined in common/util.py, so let's look at it there.
# -------------- from common/util.py ---------------
import numpy as np  # needed for np.array below

def preprocess(text):
    text = text.lower()             # uppercase to lowercase
    text = text.replace('.', ' .')  # put a space before the period
    words = text.split(' ')         # split into a list of words on whitespace

    word_to_id = {}
    id_to_word = {}
    for word in words:                 # take each word from the list
        if word not in word_to_id:     # if the word is not yet in word_to_id
            new_id = len(word_to_id)   # use the number of registered words as the new id
            word_to_id[word] = new_id  # register in word_to_id
            id_to_word[new_id] = word  # register in id_to_word

    corpus = np.array([word_to_id[w] for w in words])  # convert the word list to an array of ids
    return corpus, word_to_id, id_to_word
The text is broken down into words, dictionaries (word → id and id → word) are built, and those dictionaries are used to convert the word list into an array of ids. The results are:

corpus = [0 1 2 3 4 1 5 6]
word_to_id = {'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}
id_to_word = {0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}
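As a quick sanity check, the dictionaries let you go back and forth between ids and words (a small example of my own, not part of train.py):

```python
from common.util import preprocess

corpus, word_to_id, id_to_word = preprocess('You say goodbye and I say hello.')

print(word_to_id['say'])                        # 1
print(' '.join(id_to_word[i] for i in corpus))  # you say goodbye and i say hello .
```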
# Get contexts and target
contexts, target = create_contexts_target(corpus, window_size)
create_contexts_target() is defined in common/util.py, so let's look at it there.
# -------------- from common/util.py ---------------
def create_contexts_target(corpus, window_size=1):
    # target is corpus with window_size elements removed from both ends
    target = corpus[window_size:-window_size]
    contexts = []

    # contexts are the words within window_size before and after each target
    for idx in range(window_size, len(corpus) - window_size):  # idx = 1 to 6
        cs = []
        for t in range(-window_size, window_size + 1):  # t = -1, 0, 1
            if t == 0:
                continue  # skip the target word itself (t == 0)
            cs.append(corpus[idx + t])  # cs = [corpus[idx-1], corpus[idx+1]]
        contexts.append(cs)

    return np.array(contexts), np.array(target)
target is the corpus with window_size elements removed from each end. Then, with idx pointing at the position of each target in corpus and t selecting the positions before and after it, we collect the contexts. The results are:

contexts = [[0 2] [1 3] [2 4] [3 1] [4 5] [1 6]]
target = [1 2 3 4 1 5]
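To see what these pairs mean, you can map the ids back to words: each target word is predicted from the words on either side of it (a small illustrative snippet of my own, run before the one-hot conversion):

```python
for cs, t in zip(contexts, target):
    left, right = (id_to_word[i] for i in cs)
    print(f'[{left}, {right}] -> {id_to_word[t]}')
# [you, goodbye] -> say
# [say, and] -> goodbye
# [goodbye, i] -> and
# [and, say] -> i
# [i, hello] -> say
# [say, .] -> hello
```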
# One-hot representation
vocab_size = len(word_to_id)
contexts = convert_one_hot(contexts, vocab_size)
target = convert_one_hot(target, vocab_size)
convert_one_hot() is defined in common/util.py, so let's look at it there.
# -------------- from common/util.py ---------------
def convert_one_hot(corpus, vocab_size):
    N = corpus.shape[0]

    if corpus.ndim == 1:  # 1-dimensional case (for target)
        one_hot = np.zeros((N, vocab_size), dtype=np.int32)  # create a matrix of zeros
        for idx, word_id in enumerate(corpus):  # take each word_id from target in turn
            one_hot[idx, word_id] = 1

    elif corpus.ndim == 2:  # 2-dimensional case (for contexts)
        C = corpus.shape[1]
        one_hot = np.zeros((N, C, vocab_size), dtype=np.int32)  # create a matrix of zeros
        for idx_0, word_ids in enumerate(corpus):  # take each row (word_ids) from contexts in turn
            for idx_1, word_id in enumerate(word_ids):  # take each word_id from the row in turn
                one_hot[idx_0, idx_1, word_id] = 1

    return one_hot
For **target**, a matrix of zeros with shape (N, vocab_size) is created, and the specified element is set to 1 with one_hot[idx, word_id].
For **contexts**, which is two-dimensional, a matrix of zeros with shape (N, C, vocab_size) is created, and the specified element is set to 1 with one_hot[idx_0, idx_1, word_id].
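As a quick check, here is what the conversion produces for this corpus; the shapes follow directly from the code above (a small example of my own):

```python
vocab_size = len(word_to_id)                         # 7
target_oh = convert_one_hot(target, vocab_size)      # shape (6, 7)
contexts_oh = convert_one_hot(contexts, vocab_size)  # shape (6, 2, 7)

print(target_oh[0])    # [0 1 0 0 0 0 0]   -> 'say'
print(contexts_oh[0])  # [[1 0 0 0 0 0 0]  -> 'you'
                       #  [0 0 1 0 0 0 0]] -> 'goodbye'
```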
# Build the network
model = SimpleCBOW(vocab_size, hidden_size)
This is the network construction part. Let's walk through simple_cbow.py, which contains the class SimpleCBOW().
# -------------- from simple_cbow.py ---------------
import numpy as np
from common.layers import MatMul, SoftmaxWithLoss  # layer classes provided by the book's code

class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # Initialize the weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # Create the layers
        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # Collect all weights and gradients into lists
        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # Keep the distributed representation of the words in a member variable
        self.word_vecs = W_in
Since window_size = 1, there are two inputs. Each input is a one-hot vector of length 7 (the vocabulary size), the hidden layer has 5 units, and the output again has 7 units (the vocabulary size).
Based on the **distributional hypothesis**, "the meaning of a word is formed by the words around it", the network learns to solve the fill-in-the-blank task of predicting the word that sits between two context words, and $W_{in}$ becomes the distributed representation of the words.
Finally, the weight W_in is assigned to word_vecs. This is used to display the word vectors after training.
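For reference, the MatMul layer used here behaves like a fully connected layer without a bias. The following is a minimal sketch of such a layer, simplified rather than the exact code in the book's common/layers.py; the important point is the params/grads lists that SimpleCBOW collects:

```python
import numpy as np

class MatMul:
    """Fully connected layer without bias: out = x @ W (simplified sketch)."""
    def __init__(self, W):
        self.params = [W]                # parameter list (collected by SimpleCBOW)
        self.grads = [np.zeros_like(W)]  # gradient buffer with the same shape as W
        self.x = None

    def forward(self, x):
        W, = self.params
        self.x = x
        return np.dot(x, W)

    def backward(self, dout):
        W, = self.params
        dx = np.dot(dout, W.T)
        self.grads[0][...] = np.dot(self.x.T, dout)  # write in place so the shared grads list is updated
        return dx
```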
# -------------- from simple_cbow.py ---------------
    def forward(self, contexts, target):
        h0 = self.in_layer0.forward(contexts[:, 0])  # left context word
        h1 = self.in_layer1.forward(contexts[:, 1])  # right context word
        h = (h0 + h1) * 0.5                          # average the two hidden vectors
        score = self.out_layer.forward(h)            # scores over the vocabulary
        loss = self.loss_layer.forward(score, target)
        return loss
The weight $W_{in}$ is shared between layer0 and layer1. The signals from layer0 and layer1 are added and then divided by 2.
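A quick trace of the array shapes in forward() for this example (all 6 context/target pairs, vocab_size = 7, hidden_size = 5):

```python
# contexts                  : (6, 2, 7)  two one-hot context words per sample
# contexts[:, 0]            : (6, 7)     left context
# h0 = contexts[:, 0] @ W_in: (6, 5)     W_in is (7, 5)
# h  = (h0 + h1) * 0.5      : (6, 5)
# score = h @ W_out         : (6, 7)     W_out is (5, 7)
# loss                      : scalar     softmax + cross-entropy against target
```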
# -------------- from simple_cbow.py ---------------
    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)  # gradient through softmax + cross-entropy
        da = self.out_layer.backward(ds)     # gradient through W_out
        da *= 0.5                            # the forward pass averaged h0 and h1
        self.in_layer1.backward(da)
        self.in_layer0.backward(da)
        return None
This is the error backpropagation, and it simply mirrors the forward pass.
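The only step worth spelling out is `da *= 0.5`: because the forward pass averages the two hidden vectors, the gradient is split equally between the two input layers.

$$
h = \frac{1}{2}(h_0 + h_1)
\quad\Rightarrow\quad
\frac{\partial L}{\partial h_0} = \frac{\partial L}{\partial h_1} = \frac{1}{2}\,\frac{\partial L}{\partial h}
$$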
# Train and plot the loss transition
optimizer = Adam()
trainer = Trainer(model, optimizer)
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()
We instantiate the class Trainer() from common/trainer.py with the model we built above and the Adam optimizer. Then we train with fit() and display the loss transition graph with plot().
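Roughly speaking, Trainer.fit() runs a standard mini-batch training loop. The following is a simplified sketch under that assumption, not the exact code from common/trainer.py (which also handles things like evaluation intervals and progress printing):

```python
import numpy as np

def fit_sketch(model, optimizer, x, t, max_epoch, batch_size):
    """Simplified mini-batch training loop in the spirit of Trainer.fit()."""
    data_size = len(x)
    max_iters = data_size // batch_size
    loss_list = []

    for epoch in range(max_epoch):
        # shuffle the data each epoch
        idx = np.random.permutation(data_size)
        x, t = x[idx], t[idx]

        for i in range(max_iters):
            batch_x = x[i * batch_size:(i + 1) * batch_size]
            batch_t = t[i * batch_size:(i + 1) * batch_size]

            loss = model.forward(batch_x, batch_t)       # forward pass
            model.backward()                             # fill model.grads
            optimizer.update(model.params, model.grads)  # update the weights
            loss_list.append(loss)

    return loss_list
```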
# Display the word vectors
word_vecs = model.word_vecs  # get the weight W_in (the word vectors)
for word_id, word in id_to_word.items():  # get each id and word from id_to_word
    print(word, word_vecs[word_id])  # print the word and its vector
Finally, we take the learned word vectors from model.word_vecs and display each word together with its vector.
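Once training is done, you can also compare the learned vectors, for example with the cos_similarity helper from the book's common/util.py (a small addition of my own, not part of train.py):

```python
from common.util import cos_similarity  # cosine similarity helper used in ch02

v_you = word_vecs[word_to_id['you']]
v_i = word_vecs[word_to_id['i']]
print(cos_similarity(v_you, v_i))  # value in [-1, 1]; higher means more similar
```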