During the stay-at-home period I read ["Deep Learning from Scratch ② Natural Language Processing"](https://www.amazon.co.jp/%E3%82%BC%E3%83%AD%E3%81%8B%E3%82%89%E4%BD%9C%E3%82%8BDeep-Learning-%E2%80%95%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86%E7%B7%A8-%E6%96%8E%E8%97%A4-%E5%BA%B7%E6%AF%85/dp/4873118360). I managed to get to the end, but the book does not contain many application examples, so let's use the book's code to build a spam filter (a document classification model). This work was based on the Qiita article "Sentence classification model by RNN made from scratch".
Use the "SMS Spam Collection Dataset" in Kaggle.
--Document classification by LSTM
--Leveraging the code in Chapter 6 of the text
--The hidden state vector h from the last LSTM step is converted into two-class (ham/spam) scores by an Affine transformation and normalized with the Softmax function (a shape-only sketch follows below).
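As a rough orientation, here is a minimal, shape-only sketch of that pipeline. The sizes and the variable names (`es`, `h_last`, `W_a`, and so on) are made up for illustration; the real layers are defined in the Rnnlm class further below.
import numpy as np
N, T, D, H = 20, 150, 100, 100                 # batch size, padded length, word-vector size, hidden size
xs = np.random.randint(0, 1000, (N, T))        # token ids after padding
embed_W = np.random.randn(1000, D)             # embedding matrix (a vocabulary of 1000 words assumed)
es = embed_W[xs]                               # (N, T, D)  what TimeEmbedding produces
hs = np.random.randn(N, T, H)                  # (N, T, H)  stand-in for the TimeLSTM output
h_last = hs[:, -1, :]                          # (N, H)     hidden state of the last time step
W_a, b_a = np.random.randn(H, 2), np.zeros(2)  # Affine parameters for the two classes (ham/spam)
scores = h_last.dot(W_a) + b_a                 # (N, 2)     scores that Softmax then normalizes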
--Preparation for Google Colab
# coding: utf-8
from google.colab import drive
drive.mount('/content/drive')
--Module import
import sys
sys.path.append('drive/My Drive/Colab Notebooks/spam_filter')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
%matplotlib inline
--Read CSV file with pandas and display the first 5 lines
The first column is the label (ham or spam), the second column is the message, and the 3rd to 5th columns are blank.
It seems that many spam messages are like "Big hit! Contact xxx immediately".
df = pd.read_csv('drive/My Drive/Colab Notebooks/spam_filter/dataset/spam.csv',encoding='latin-1')
df.head()
--Delete the blank columns and display the data info
The total number of messages is 5572.
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)
df.info()
--Total number of ham and spam
There are about six times as many ham messages as spam messages.
sns.countplot(df.v1)
plt.xlabel('Label')
plt.title('Number of ham and spam messages')
--Label encoding with scikit-learn
--Tokenize messages with keras Tokenizer
X = df.v2
Y = df.v1
le = LabelEncoder()
Y = le.fit_transform(Y)
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X)
word_to_id = tok.word_index
X_ids = tok.texts_to_sequences(X)
X_ids_pad = sequence.pad_sequences(X_ids,maxlen=max_len)
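To see concretely what fit_on_texts, texts_to_sequences, and pad_sequences produce, here is a toy illustration; the sentences and the name `toy_tok` are made up and are not part of the pipeline above.
toy_tok = Tokenizer(num_words=10)
toy_tok.fit_on_texts(["free prize call now", "see you at lunch"])
print(toy_tok.word_index)                         # word -> integer id, most frequent words get the smallest ids
toy_ids = toy_tok.texts_to_sequences(["call me at lunch"])
print(toy_ids)                                    # "me" was never seen, so it is silently dropped (no oov_token set)
print(sequence.pad_sequences(toy_ids, maxlen=6))  # left-padded with zeros up to length 6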
--Display the number of words in each message as a histogram
The longest messages are around 100 words, and spam messages tend to be longer than ham.
message_len = [len(v) for v in X_ids]
df['message_len']=message_len
plt.figure(figsize=(12, 8))
df[df.v1=='ham'].message_len.plot(bins=35, kind='hist', color='blue',
label='Ham messages', alpha=0.6)
df[df.v1=='spam'].message_len.plot(kind='hist', color='red',
label='Spam messages', alpha=0.6)
plt.legend()
plt.xlabel("Message Length")
--Definition of sigmoid function, softmax function, cross_entropy_error function
No change from text.
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def softmax(x):
if x.ndim == 2:
x = x - x.max(axis=1, keepdims=True)
x = np.exp(x)
x /= x.sum(axis=1, keepdims=True)
elif x.ndim == 1:
x = x - np.max(x)
x = np.exp(x) / np.sum(np.exp(x))
return x
def cross_entropy_error(y, t):
if y.ndim == 1:
t = t.reshape(1, t.size)
y = y.reshape(1, y.size)
    #If the teacher data is a one-hot vector, convert it to the index of the correct label
if t.size == y.size:
t = t.argmax(axis=1)
batch_size = y.shape[0]
return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
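A tiny numerical check of these functions (the values and the names `y_demo`, `t_demo` are made up): for a batch of two samples whose correct classes are 0 and 1, the result is the average negative log-likelihood.
y_demo = softmax(np.array([[2.0, 1.0], [0.5, 3.0]]))  # predicted class probabilities
t_demo = np.array([0, 1])                             # correct class indices
print(cross_entropy_error(y_demo, t_demo))            # roughly 0.20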
--Definition of each layer of Affine, Softmax, SoftmaxWithLoss, Embedding
No change from text.
class Affine:
def __init__(self, W, b):
self.params = [W, b]
self.grads = [np.zeros_like(W), np.zeros_like(b)]
self.x = None
def forward(self, x):
W, b = self.params
out = np.dot(x, W) + b
self.x = x
return out
def backward(self, dout):
W, b = self.params
dx = np.dot(dout, W.T)
dW = np.dot(self.x.T, dout)
db = np.sum(dout, axis=0)
self.grads[0][...] = dW
self.grads[1][...] = db
return dx
class Softmax:
def __init__(self):
self.params, self.grads = [], []
self.out = None
def forward(self, x):
self.out = softmax(x)
return self.out
def backward(self, dout):
dx = self.out * dout
sumdx = np.sum(dx, axis=1, keepdims=True)
dx -= self.out * sumdx
return dx
class SoftmaxWithLoss:
def __init__(self):
self.params, self.grads = [], []
self.y = None #softmax output
self.t = None #Teacher label
def forward(self, x, t):
self.t = t
self.y = softmax(x)
        #If the teacher label is a one-hot vector, convert it to the index of the correct label
if self.t.size == self.y.size:
self.t = self.t.argmax(axis=1)
loss = cross_entropy_error(self.y, self.t)
return loss
def backward(self, dout=1):
batch_size = self.t.shape[0]
dx = self.y.copy()
dx[np.arange(batch_size), self.t] -= 1
dx *= dout
dx = dx / batch_size
return dx
class Embedding:
def __init__(self, W):
self.params = [W]
self.grads = [np.zeros_like(W)]
self.idx = None
def forward(self, idx):
W, = self.params
self.idx = idx
out = W[idx]
return out
def backward(self, dout):
dW, = self.grads
dW[...] = 0
np.add.at(dW, self.idx, dout)
return None
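Two small sanity checks of these layers (made-up values and names): the SoftmaxWithLoss gradient should equal (softmax(x) − one-hot(t)) / batch_size, and Embedding.backward relies on np.add.at so that repeated indices accumulate their gradients instead of overwriting each other.
# SoftmaxWithLoss: backward equals (softmax(x) - onehot(t)) / batch_size
swl = SoftmaxWithLoss()
x_demo = np.random.randn(4, 2)
t_demo = np.array([0, 1, 1, 0])
swl.forward(x_demo, t_demo)
expected = softmax(x_demo)
expected[np.arange(4), t_demo] -= 1
print(np.allclose(swl.backward(), expected / 4))  # True
# Embedding: gradients for duplicate indices must add up, which np.add.at guarantees
dW_demo = np.zeros((5, 3))
np.add.at(dW_demo, np.array([1, 1, 2]), np.ones((3, 3)))
print(dW_demo[1])  # [2. 2. 2.] - both occurrences of index 1 contributed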
--Definition of each layer of TimeEmbedding, LSTM, TimeLSTM
No change from text.
class TimeEmbedding:
def __init__(self, W):
self.params = [W]
self.grads = [np.zeros_like(W)]
self.layers = None
self.W = W
def forward(self, xs):
N, T = xs.shape
V, D = self.W.shape
out = np.empty((N, T, D), dtype='f')
self.layers = []
for t in range(T):
layer = Embedding(self.W)
out[:, t, :] = layer.forward(xs[:, t])
self.layers.append(layer)
return out
def backward(self, dout):
N, T, D = dout.shape
grad = 0
for t in range(T):
layer = self.layers[t]
layer.backward(dout[:, t, :])
grad += layer.grads[0]
self.grads[0][...] = grad
return None
class LSTM:
def __init__(self, Wx, Wh, b):
'''
Parameters
----------
        Wx: weight parameters for the input `x` (the four gates' weights are concatenated)
        Wh: weight parameters for the hidden state `h` (the four gates' weights are concatenated)
        b: biases (the four gates' biases are concatenated)
'''
self.params = [Wx, Wh, b]
self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
self.cache = None
def forward(self, x, h_prev, c_prev):
Wx, Wh, b = self.params
N, H = h_prev.shape
A = np.dot(x, Wx) + np.dot(h_prev, Wh) + b
f = A[:, :H]
g = A[:, H:2*H]
i = A[:, 2*H:3*H]
o = A[:, 3*H:]
f = sigmoid(f)
g = np.tanh(g)
i = sigmoid(i)
o = sigmoid(o)
c_next = f * c_prev + g * i
h_next = o * np.tanh(c_next)
self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
return h_next, c_next
def backward(self, dh_next, dc_next):
Wx, Wh, b = self.params
x, h_prev, c_prev, i, f, g, o, c_next = self.cache
tanh_c_next = np.tanh(c_next)
ds = dc_next + (dh_next * o) * (1 - tanh_c_next ** 2)
dc_prev = ds * f
di = ds * g
df = ds * c_prev
do = dh_next * tanh_c_next
dg = ds * i
di *= i * (1 - i)
df *= f * (1 - f)
do *= o * (1 - o)
dg *= (1 - g ** 2)
dA = np.hstack((df, dg, di, do))
dWh = np.dot(h_prev.T, dA)
dWx = np.dot(x.T, dA)
db = dA.sum(axis=0)
self.grads[0][...] = dWx
self.grads[1][...] = dWh
self.grads[2][...] = db
dx = np.dot(dA, Wx.T)
dh_prev = np.dot(dA, Wh.T)
return dx, dh_prev, dc_prev
class TimeLSTM:
def __init__(self, Wx, Wh, b, stateful=False):
self.params = [Wx, Wh, b]
self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
self.layers = None
self.h, self.c = None, None
self.dh = None
self.stateful = stateful
def forward(self, xs):
Wx, Wh, b = self.params
N, T, D = xs.shape
H = Wh.shape[0]
self.layers = []
hs = np.empty((N, T, H), dtype='f')
if not self.stateful or self.h is None:
self.h = np.zeros((N, H), dtype='f')
if not self.stateful or self.c is None:
self.c = np.zeros((N, H), dtype='f')
for t in range(T):
layer = LSTM(*self.params)
self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
hs[:, t, :] = self.h
self.layers.append(layer)
return hs
def backward(self, dhs):
Wx, Wh, b = self.params
N, T, H = dhs.shape
D = Wx.shape[0]
dxs = np.empty((N, T, D), dtype='f')
dh, dc = 0, 0
grads = [0, 0, 0]
for t in reversed(range(T)):
layer = self.layers[t]
dx, dh, dc = layer.backward(dhs[:, t, :] + dh, dc)
dxs[:, t, :] = dx
for i, grad in enumerate(layer.grads):
grads[i] += grad
for i, grad in enumerate(grads):
self.grads[i][...] = grad
self.dh = dh
return dxs
def set_state(self, h, c=None):
self.h, self.c = h, c
def reset_state(self):
self.h, self.c = None, None
--Definition of Rnnlm class
The hidden state vector h output at the final time step is converted into two-class scores by an Affine transformation and normalized with the Softmax function.
class Rnnlm():
def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100, out_size=2):
V, D, H, O = vocab_size, wordvec_size, hidden_size, out_size
rn = np.random.randn
#Weight initialization
embed_W = (rn(V, D) / 100).astype('f')
lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
lstm_b = np.zeros(4 * H).astype('f')
affine_W = (rn(H, O) / np.sqrt(H)).astype('f')
affine_b = np.zeros(O).astype('f')
#Layer generation
self.embed_layer = TimeEmbedding(embed_W)
self.lstm_layer = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
self.affine_layer = Affine(affine_W, affine_b)
self.loss_layer = SoftmaxWithLoss()
self.softmax_layer = Softmax()
#List all weights and gradients
self.params = self.embed_layer.params + self.lstm_layer.params + self.affine_layer.params
self.grads = self.embed_layer.grads + self.lstm_layer.grads + self.affine_layer.grads
def predict(self, xs):
self.reset_state()
xs = self.embed_layer.forward(xs)
hs = self.lstm_layer.forward(xs)
        xs = self.affine_layer.forward(hs[:,-1,:]) #Affine transformation of the hidden state at the last time step
score = self.softmax_layer.forward(xs)
return score
def forward(self, xs, t):
xs = self.embed_layer.forward(xs)
hs = self.lstm_layer.forward(xs)
        x = self.affine_layer.forward(hs[:,-1,:]) #Affine transformation of the hidden state at the last time step
loss = self.loss_layer.forward(x, t)
self.hs = hs
return loss
def backward(self, dout=1):
dout = self.loss_layer.backward(dout)
dhs = np.zeros_like(self.hs)
        dhs[:,-1,:] = self.affine_layer.backward(dout) #Propagate the Affine gradient back into the hidden state at the last time step
dout = self.lstm_layer.backward(dhs)
dout = self.embed_layer.backward(dout)
return dout
def reset_state(self):
self.lstm_layer.reset_state()
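As a sanity check (not part of the original article), a minimal smoke test with made-up sizes and names confirms that forward, backward, and predict run and return the expected shapes.
demo_model = Rnnlm(vocab_size=50, wordvec_size=16, hidden_size=16, out_size=2)
xs_demo = np.random.randint(1, 50, (3, 5))  # 3 "messages" of 5 token ids each
t_demo = np.array([0, 1, 0])                # ham/spam labels
print(demo_model.forward(xs_demo, t_demo))  # loss, roughly log(2) for an untrained model
demo_model.backward()
print(demo_model.predict(xs_demo).shape)    # (3, 2) class probabilities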
--Definition of SGD as Optimizer
No change from text
class SGD:
'''
Stochastic Gradient Descent
'''
def __init__(self, lr=0.01):
self.lr = lr
def update(self, params, grads):
for i in range(len(params)):
params[i] -= self.lr * grads[i]
--Separate the data into training data (85%) and test data (15%)
X_train,X_test,Y_train,Y_test = train_test_split(X_ids_pad,Y,test_size=0.15)
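Since ham outnumbers spam roughly six to one, a stratified split keeps that ratio identical in both subsets; the article uses a plain random split, so the following is only an optional variant (the seed 42 is arbitrary).
X_train, X_test, Y_train, Y_test = train_test_split(
    X_ids_pad, Y, test_size=0.15, stratify=Y, random_state=42)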
--Settings such as hyperparameters
#Hyperparameter settings
vocab_size = len(word_to_id)+1
batch_size = 20
wordvec_size = 100
hidden_size = 100
out_size = 2 #Binary problem of ham and spam
lr = 1.0
max_epoch = 10
data_size = len(X_train)
#Variables used during learning
max_iters = data_size // batch_size
#Need to convert to Numpy array
x = np.array(X_train)
t = np.array(Y_train)
--Learning
--Process 20 messages at a time as a mini-batch
--The Truncated BPTT from the text is not applied.
total_loss = 0
loss_count = 0
loss_list = []
#Model generation
model = Rnnlm(vocab_size, wordvec_size, hidden_size, out_size)
optimizer = SGD(lr)
for epoch in range(max_epoch):
for iter in range(max_iters):
#Get a mini batch
batch_x = x[iter*batch_size:(iter+1)*batch_size]
batch_t = t[iter*batch_size:(iter+1)*batch_size]
#Find the gradient and update the parameters
loss = model.forward(batch_x, batch_t)
model.backward()
optimizer.update(model.params, model.grads)
total_loss += loss
loss_count += 1
avg_loss = total_loss / loss_count
print("| epoch %d | loss %.5f" % (epoch+1, avg_loss))
loss_list.append(float(avg_loss))
total_loss, loss_count = 0,0
x = np.arange(len(loss_list))
plt.plot(x, loss_list, label='train')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.show()
--Inference of test data
result = model.predict(X_test)
Y_pred = result.argmax(axis=1)
--Accuracy
98%!
Not bad compared to other Kaggle notebooks.
# calculate accuracy of class predictions
print('acc=',metrics.accuracy_score(Y_test, Y_pred))
--Confusion matrix
# print the confusion matrix
print(metrics.confusion_matrix(Y_test, Y_pred))
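Because the classes are imbalanced, per-class precision and recall are worth checking alongside accuracy; scikit-learn's classification_report prints them directly (an extra step, not in the original article).
# per-class precision / recall / F1 (label 0 = ham, 1 = spam after LabelEncoder)
print(metrics.classification_report(Y_test, Y_pred, target_names=list(le.classes_)))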
Building this tool through trial and error helped me deepen my understanding of the text.
If you have read Deep Learning from Scratch ②, I recommend using its sample programs to build some kind of application of your own.
--Judging hand-written SMS messages
The first is an invitation to go and watch a baseball game together.
The second is a hand-written spam message.
Perhaps surprisingly, the model classifies both correctly.
texts_add = ["I'd like to watch baseball game with you. I'm wating for your answer.",
"Do you want to meet new sex partners every night? Feel free to call 09077xx0721."
]
X_ids_add = tok.texts_to_sequences(texts_add)
X_ids_pad_add = sequence.pad_sequences(X_ids_add,maxlen=max_len)
result = model.predict(X_ids_pad_add)
Y_pred = result.argmax(axis=1)
print(Y_pred)
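LabelEncoder assigns labels alphabetically (ham = 0, spam = 1), so the integer predictions can be mapped back to label names; if both messages are classified correctly, this prints ['ham' 'spam'].
print(le.inverse_transform(Y_pred))  # map integer predictions back to their label names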