1.First of all

During the stay home period ["Deep learning from scratch ② Natural language processing"](https://www.amazon.co.jp/%E3%82%BC%E3%83%AD%E3%81%8B% E3% 82% 89% E4% BD% 9C% E3% 82% 8BDeep-Learning-% E2% 80% 95% E8% 87% AA% E7% 84% B6% E8% A8% 80% E8% AA% 9E % E5% 87% A6% E7% 90% 86% E7% B7% A8-% E6% 96% 8E% E8% 97% A4-% E5% BA% B7% E6% AF% 85 / dp / 4873118360) read. I managed to get to the end, but there aren't many application examples in this text. So, let's use the text code to create a spam filter (document classification model). This study was based on Qiita's article Sentence classification model by RNN made from scratch.

2. Data

Use the "SMS Spam Collection Dataset" in Kaggle.

3. Model overview

--Document classification by LSTM --Leveraging the code in Chapter 6 of the text --The hidden state vector h from the last LSTM is binarized by Affine transformation and normalized by the Softmax function.

4. Implementation

--Preparation for Google Colab

# coding: utf-8
from google.colab import drive
drive.mount('/content/drive')

--Module import

import sys
sys.path.append('drive/My Drive/Colab Notebooks/spam_filter')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
%matplotlib inline

--Read CSV file with pandas and display the first 5 lines
The first column is the label (ham or spam), the second column is the message, and the 3rd to 5th columns are blank rows.
It seems that many spam messages are like "Big hit! Contact xxx immediately".

df = pd.read_csv('drive/My Drive/Colab Notebooks/spam_filter/dataset/spam.csv',encoding='latin-1')
df.head()

出力1.png

--Delete blank lines and display information
The total number of messages is 5772

df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)
df.info()

出力2.png

--Total number of ham and spam
ham is about 6 times more than spam

sns.countplot(df.v1)
plt.xlabel('Label')
plt.title('Number of ham and spam messages')

頻度.png

--Label encoding with scikit-learn --Tokenize messages with keras Tokenizer

X = df.v2
Y = df.v1
le = LabelEncoder()
Y = le.fit_transform(Y)

max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X)

word_to_id = tok.word_index
X_ids = tok.texts_to_sequences(X)
X_ids_pad = sequence.pad_sequences(X_ids,maxlen=max_len)

出力tok.png

--Display the number of words in each message as a histogram
The maximum is about 100 words, and Spam is often longer.

message_len = [len(v) for v in X_ids]
df['message_len']=message_len

plt.figure(figsize=(12, 8))

df[df.v1=='ham'].message_len.plot(bins=35, kind='hist', color='blue', 
                                       label='Ham messages', alpha=0.6)
df[df.v1=='spam'].message_len.plot(kind='hist', color='red', 
                                       label='Spam messages', alpha=0.6)
plt.legend()
plt.xlabel("Message Length")

ヒストグラム.png

Model implementation

--Definition of sigmoid function, softmax function, cross_entropy_error function
No change from text.

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def softmax(x):
    if x.ndim == 2:
        x = x - x.max(axis=1, keepdims=True)
        x = np.exp(x)
        x /= x.sum(axis=1, keepdims=True)
    elif x.ndim == 1:
        x = x - np.max(x)
        x = np.exp(x) / np.sum(np.exp(x))

    return x


def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    #Teacher data is one-hot-In case of vector, convert to index of correct label
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]

    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

--Definition of each layer of Affine, Softmax, SoftmaxWithLoss, Embedding
No change from text.

class Affine:
    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None

    def forward(self, x):
        W, b = self.params
        out = np.dot(x, W) + b
        self.x = x
        return out

    def backward(self, dout):
        W, b = self.params
        dx = np.dot(dout, W.T)
        dW = np.dot(self.x.T, dout)
        db = np.sum(dout, axis=0)

        self.grads[0][...] = dW
        self.grads[1][...] = db
        return dx


class Softmax:
    def __init__(self):
        self.params, self.grads = [], []
        self.out = None

    def forward(self, x):
        self.out = softmax(x)
        return self.out

    def backward(self, dout):
        dx = self.out * dout
        sumdx = np.sum(dx, axis=1, keepdims=True)
        dx -= self.out * sumdx
        return dx


class SoftmaxWithLoss:
    def __init__(self):
        self.params, self.grads = [], []
        self.y = None  #softmax output
        self.t = None  #Teacher label

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)

        #Teacher label is one-For hot vector, convert to correct index
        if self.t.size == self.y.size:
            self.t = self.t.argmax(axis=1)

        loss = cross_entropy_error(self.y, self.t)
        return loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]

        dx = self.y.copy()
        dx[np.arange(batch_size), self.t] -= 1
        dx *= dout
        dx = dx / batch_size

        return dx

class Embedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        W, = self.params
        self.idx = idx
        out = W[idx]
        return out

    def backward(self, dout):
        dW, = self.grads
        dW[...] = 0
        np.add.at(dW, self.idx, dout)
        return None

--Definition of each layer of TimeEmbedding, LSTM, TimeLSTM
No change from text.

class TimeEmbedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.layers = None
        self.W = W

    def forward(self, xs):
        N, T = xs.shape
        V, D = self.W.shape

        out = np.empty((N, T, D), dtype='f')
        self.layers = []

        for t in range(T):
            layer = Embedding(self.W)
            out[:, t, :] = layer.forward(xs[:, t])
            self.layers.append(layer)

        return out

    def backward(self, dout):
        N, T, D = dout.shape

        grad = 0
        for t in range(T):
            layer = self.layers[t]
            layer.backward(dout[:, t, :])
            grad += layer.grads[0]

        self.grads[0][...] = grad
        return None


class LSTM:
    def __init__(self, Wx, Wh, b):
        '''

        Parameters
        ----------
        Wx:input`x`Weight parameter for (combines 4 weights)
        Wh:Hidden state`h`Weight parameter for (summarize the weights for 4)
        b:Bias (summarize 4 biases)
        '''
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.cache = None

    def forward(self, x, h_prev, c_prev):
        Wx, Wh, b = self.params
        N, H = h_prev.shape

        A = np.dot(x, Wx) + np.dot(h_prev, Wh) + b

        f = A[:, :H]
        g = A[:, H:2*H]
        i = A[:, 2*H:3*H]
        o = A[:, 3*H:]

        f = sigmoid(f)
        g = np.tanh(g)
        i = sigmoid(i)
        o = sigmoid(o)

        c_next = f * c_prev + g * i
        h_next = o * np.tanh(c_next)

        self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
        return h_next, c_next

    def backward(self, dh_next, dc_next):
        Wx, Wh, b = self.params
        x, h_prev, c_prev, i, f, g, o, c_next = self.cache

        tanh_c_next = np.tanh(c_next)

        ds = dc_next + (dh_next * o) * (1 - tanh_c_next ** 2)

        dc_prev = ds * f

        di = ds * g
        df = ds * c_prev
        do = dh_next * tanh_c_next
        dg = ds * i

        di *= i * (1 - i)
        df *= f * (1 - f)
        do *= o * (1 - o)
        dg *= (1 - g ** 2)

        dA = np.hstack((df, dg, di, do))

        dWh = np.dot(h_prev.T, dA)
        dWx = np.dot(x.T, dA)
        db = dA.sum(axis=0)

        self.grads[0][...] = dWx
        self.grads[1][...] = dWh
        self.grads[2][...] = db

        dx = np.dot(dA, Wx.T)
        dh_prev = np.dot(dA, Wh.T)

        return dx, dh_prev, dc_prev


class TimeLSTM:
    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.layers = None

        self.h, self.c = None, None
        self.dh = None
        self.stateful = stateful

    def forward(self, xs):
        Wx, Wh, b = self.params
        N, T, D = xs.shape
        H = Wh.shape[0]

        self.layers = []
        hs = np.empty((N, T, H), dtype='f')

        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')
        if not self.stateful or self.c is None:
            self.c = np.zeros((N, H), dtype='f')

        for t in range(T):
            layer = LSTM(*self.params)
            self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
            hs[:, t, :] = self.h

            self.layers.append(layer)

        return hs

    def backward(self, dhs):
        Wx, Wh, b = self.params
        N, T, H = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        dh, dc = 0, 0

        grads = [0, 0, 0]
        for t in reversed(range(T)):
            layer = self.layers[t]
            dx, dh, dc = layer.backward(dhs[:, t, :] + dh, dc)
            dxs[:, t, :] = dx
            for i, grad in enumerate(layer.grads):
                grads[i] += grad

        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        self.dh = dh
        return dxs

    def set_state(self, h, c=None):
        self.h, self.c = h, c

    def reset_state(self):
        self.h, self.c = None, None

--Definition of Rnnlm class
The hidden state vector h that appears at the end is Affine-transformed, binarized, and normalized by the Softmax function.

class Rnnlm():
    def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100, out_size=2):
        V, D, H, O = vocab_size, wordvec_size, hidden_size, out_size
        rn = np.random.randn

        #Weight initialization
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H, O) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(O).astype('f')

        #Layer generation
        self.embed_layer = TimeEmbedding(embed_W)
        self.lstm_layer = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine_layer = Affine(affine_W, affine_b)
        self.loss_layer = SoftmaxWithLoss()
        self.softmax_layer = Softmax()

        #List all weights and gradients
        self.params = self.embed_layer.params + self.lstm_layer.params + self.affine_layer.params
        self.grads = self.embed_layer.grads + self.lstm_layer.grads + self.affine_layer.grads

    def predict(self, xs):
        self.reset_state()
        xs = self.embed_layer.forward(xs)
        hs = self.lstm_layer.forward(xs)
        xs = self.affine_layer.forward(hs[:,-1,:]) #Affine transformation of the last hidden layer
        score = self.softmax_layer.forward(xs)
        return score

    def forward(self, xs, t):
        xs = self.embed_layer.forward(xs)
        hs = self.lstm_layer.forward(xs)
        x = self.affine_layer.forward(hs[:,-1,:]) #Affine transformation of the last hidden layer
        loss = self.loss_layer.forward(x, t)
        self.hs = hs

        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        dhs = np.zeros_like(self.hs)
        dhs[:,-1,:] = self.affine_layer.backward(dout) #Set error back propagation of Affine transformation in the last hidden layer

        dout = self.lstm_layer.backward(dhs)
        dout = self.embed_layer.backward(dout)
        return dout

    def reset_state(self):
        self.lstm_layer.reset_state()

--Definition of SGD as Optimizer
No change from text

class SGD:
    '''
Stochastic Gradient Descent
    '''
    def __init__(self, lr=0.01):
        self.lr = lr
        
    def update(self, params, grads):
        for i in range(len(params)):
            params[i] -= self.lr * grads[i]

Learn from here

--Separate data into training data (85%) and exam data (15%)

X_train,X_test,Y_train,Y_test = train_test_split(X_ids_pad,Y,test_size=0.15)

--Settings such as hyperparameters

#Hyperparameter settings
vocab_size = len(word_to_id)+1
batch_size = 20
wordvec_size = 100
hidden_size = 100
out_size = 2 #Binary problem of ham and spam
lr = 1.0
max_epoch = 10
data_size = len(X_train)

#Variables used during learning
max_iters = data_size // batch_size

#Need to convert to Numpy array
x = np.array(X_train)
t = np.array(Y_train)

--Learning --Process 20 messages at a time in a mini-batch --The Truncated BPTT in the text is not applied.

total_loss = 0
loss_count = 0
loss_list = []

#Model generation
model = Rnnlm(vocab_size, wordvec_size, hidden_size, out_size)
optimizer = SGD(lr)

for epoch in range(max_epoch):
    for iter in range(max_iters):
        #Get a mini batch
        batch_x  = x[iter*batch_size:(iter+1)*batch_size]
        batch_t =  t[iter*batch_size:(iter+1)*batch_size]

        #Find the gradient and update the parameters
        loss = model.forward(batch_x, batch_t)
        model.backward()
        optimizer.update(model.params, model.grads)
        total_loss += loss
        loss_count += 1

    avg_loss = total_loss / loss_count
    print("| epoch %d | loss %.5f" % (epoch+1, avg_loss))
    loss_list.append(float(avg_loss))
    total_loss, loss_count = 0,0
  
x = np.arange(len(loss_list))
plt.plot(x, loss_list, label='train')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.show()

出力3.png 学習.png

--Inference of test data

result = model.predict(X_test)
Y_pred = result.argmax(axis=1)

--Correct answer rate
98%!
Not bad compared to other Kaggle notebooks.

# calculate accuracy of class predictions
print('acc=',metrics.accuracy_score(Y_test, Y_pred))

出力4.png

--Confusion matrix

# print the confusion matrix
print(metrics.confusion_matrix(Y_test, Y_pred))

出力5.png

5. Summary

This time, I was able to deepen my understanding of the text by trial and error for creating this tool.

If you read Deep Learning ②, which is also made from scratch, we recommend that you use the sample program to create some kind of app.

bonus

Judgment by self-made sms
The first is to invite you to go see a baseball game together.
The second is self-made Spam (you don't have to translate it).
Surprisingly? I can judge it properly.

texts_add = ["I'd like to watch baseball game with you. I'm wating for your answer.",
    "Do you want to meet new sex partners every night? Feel free to call 09077xx0721."
    ]
X_ids_add = tok.texts_to_sequences(texts_add)
X_ids_pad_add = sequence.pad_sequences(X_ids_add,maxlen=max_len)

result = model.predict(X_ids_pad_add)
Y_pred = result.argmax(axis=1)
print(Y_pred)

出力おまけ.png

Application of Deep Learning 2 made from scratch Spam filter