Adam Paper Summary and Code

Recently, I've been away from machine learning tasks (I've been writing Rails for a long time ... I'm about to return to the world of machine learning ...) I haven't read Adam's paper yet, so I read it and implemented it appropriately.

Adam Paper Summary

motivation I want to create a model that is easy to implement, has good computational efficiency, is memory-saving, is not easily affected by scale, and is adaptive to large-scale data / parameters.

Origin of Adam's name

Adaptive moment estimation

Advantages of Adam

algorithm

Experiment

Reference material

Reference code

I made a comparison with Adam, SGDNesterov, and lasso logistic regression. The data is https://www.kaggle.com/c/data-science-london-scikit-learn/data Was used. I'm using this data because I wanted to see if the performance would be good even with a small amount of data. Since the test data is not labeled, the training data is used separately for training and testing at 8: 2. It is not iterated in Adam, SGD Nesterov.

Sorry for the dirty code, but I will paste it below. Adam

python


# coding: utf-8
import numpy as np
import math
from itertools import izip
from sklearn.metrics import accuracy_score, recall_score


class Adam:
    def __init__(self, feat_dim, loss_type='log', alpha=0.001, beta1=0.9, beta2=0.999, epsilon=10**(-8)):
        self.weight = np.zeros(feat_dim)  # features weight
        self.loss_type = loss_type  # type of loss function
        self.feat_dim = feat_dim  # number of dimension
        self.x = np.zeros(feat_dim)  # feature
        self.m = np.zeros(feat_dim)  # 1st moment vector
        self.v = np.zeros(feat_dim)  # 2nd moment vector
        self.alpha = alpha  # step size
        self.beta1 = beta1  # Exponential decay rates for moment estimates
        self.beta2 = beta2  # Exponential decay rates for moment estimates
        self.epsilon = epsilon
        self.t = 1  # timestep

    def fit(self, data_fname, label_fname):
        with open(data_fname, 'r') as f_data, open(label_fname, 'r') as f_label:
            for data, label in izip(f_data, f_label):
                self.features = np.array(data.rstrip().split(','), dtype=np.float64)
                y = int(-1) if int(label.rstrip())<=0 else int(1)  # posi=1, nega=-Unified to 1
                # update weight
                self.update(self.predict(self.features), y)
                self.t += 1
        return self.weight

    def predict(self, features): #margin
        return np.dot(self.weight, features)

    def calc_loss(self,m): # m=py=wxy
        if self.loss_type == 'hinge':
            return max(0,1-m)
        elif self.loss_type == 'log':
            # if m<=-700: m=-700
            return math.log(1+math.exp(-m))

    # gradient of loss function
    def calc_dloss(self,m): # m=py=wxy
        if self.loss_type == 'hinge':
            res = -1.0 if (1-m)>0 else 0.0 #loss if loss does not exceed 0=0.Otherwise-With the derivative of m-Become 1
            return res
        elif self.loss_type == 'log':
            if m < 0.0:
                return float(-1.0) / (math.exp(m) + 1.0) # yx-e^(-m)/(1+e^(-m))*yx
            else:
                ez = float( math.exp(-m) )
                return -ez / (ez + 1.0) # -yx+1/(1+e^(-m))*yx

    def update(self, pred, y):
        grad = y*self.calc_dloss(y*pred)*self.features  # gradient
        self.m = self.beta1*self.m + (1 - self.beta1)*grad  # update biased first moment estimate
        self.v = self.beta2*self.v + (1 - self.beta2)*grad**2  # update biased second raw moment estimate
        mhat = self.m/(1-self.beta1**self.t)  # compute bias-corrected first moment estimate
        vhat = self.v/(1-self.beta2**self.t)  # compute bias-corrected second raw moment estimate
        self.alpha *= np.sqrt(1-self.beta2**self.t)/(1-self.beta1**self.t)  # update stepsize
        self.weight -= self.alpha * mhat/(np.sqrt(vhat) + self.epsilon)  # update weight

if __name__=='__main__':
    data_fname = 'train800.csv'
    label_fname = 'trainLabels800.csv'
    test_data_fname = 'test200.csv'
    test_label_fname = 'testLabels200.csv'

    adam = Adam(40, loss_type='hinge')
    adam.fit(data_fname, label_fname)
    y_true = []
    y_pred = []
    with open(test_data_fname, 'r') as f_data, open(test_label_fname, 'r') as f_label:
        for data, label in izip(f_data, f_label):
            pred_label = adam.predict(np.array(data.rstrip().split(','), dtype=np.float64))
            y_true.append(int(label))
            y_pred.append( 1 if pred_label>0 else 0)
    print 'accuracy:', accuracy_score(y_true, y_pred)
    print 'recall:', recall_score(y_true, y_pred)

SGDNesterov

python


# coding: utf-8
import numpy as np
import math
from itertools import izip
from sklearn.metrics import accuracy_score, recall_score


class SgdNesterov:
    def __init__(self, feat_dim, loss_type='log', mu=0.9, learning_rate=0.5):
        self.weight = np.zeros(feat_dim)  # features weight
        self.loss_type = loss_type  # type of loss function
        self.feat_dim = feat_dim
        self.x = np.zeros(feat_dim)
        self.mu = mu  # momentum
        self.t = 1  # update times
        self.v = np.zeros(feat_dim)
        self.learning_rate = learning_rate

    def fit(self, data_fname, label_fname):
        with open(data_fname, 'r') as f_data, open(label_fname, 'r') as f_label:
            for data, label in izip(f_data, f_label):
                self.features = np.array(data.rstrip().split(','), dtype=np.float64)
                y = int(-1) if int(label.rstrip())<=0 else int(1)  # posi=1, nega=-Unify to 1
                # update weight
                self.update(y)
                self.t += 1
        return self.weight

    def predict(self, features): #margin
        return np.dot(self.weight, features)

    def calc_loss(self,m): # m=py=wxy
        if self.loss_type == 'hinge':
            return max(0,1-m)
        elif self.loss_type == 'log':
            if m<=-700: m=-700
            return math.log(1+math.exp(-m))

    # gradient of loss function
    def calc_dloss(self,m): # m=py=wxy
        if self.loss_type == 'hinge':
            res = -1.0 if (1-m)>0 else 0.0 #loss if loss does not exceed 0=0.Otherwise-With the derivative of m-Become 1
            return res
        elif self.loss_type == 'log':
            if m < 0.0:
                return float(-1.0) / (math.exp(m) + 1.0) # yx-e^(-m)/(1+e^(-m))*yx
            else:
                ez = float( math.exp(-m) )
                return -ez / (ez + 1.0) # -yx+1/(1+e^(-m))*yx

    def update(self, y):
        w_ahead = self.weight + self.mu * self.v
        pred = np.dot(w_ahead, self.features)
        grad = y*self.calc_dloss(y*pred)*self.features # gradient
        self.v = self.mu * self.v - self.learning_rate * grad # velocity update stays the same
        # update weight
        self.weight += self.v


if __name__=='__main__':
    data_fname = 'train800.csv'
    label_fname = 'trainLabels800.csv'
    test_data_fname = 'test200.csv'
    test_label_fname = 'testLabels200.csv'

    sgd_n = SgdNesterov(40, loss_type='hinge')
    sgd_n.fit(data_fname, label_fname)
    y_true = []
    y_pred = []
    with open(test_data_fname, 'r') as f_data, open(test_label_fname, 'r') as f_label:
        for data, label in izip(f_data, f_label):
            pred_label = sgd_n.predict(np.array(data.rstrip().split(','), dtype=np.float64))
            y_true.append(int(label))
            y_pred.append( 1 if pred_label>0 else 0)
    print 'accuracy:', accuracy_score(y_true, y_pred)
    print 'recall:', recall_score(y_true, y_pred)

lasso logistic regression

python


import numpy as np
from itertools import izip
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score


def get_data(data_fname, label_fname):
    result_data = []
    result_labels = []
    with open(data_fname, 'r') as f_data, open(label_fname, 'r') as f_label:
        for data, label in izip(f_data, f_label):
            result_data.append(data.rstrip().split(','))
            result_labels.append(int(label.rstrip()))
    return np.array(result_data, dtype=np.float64), result_labels

if __name__=='__main__':
     data_fname = 'train800.csv'
    label_fname = 'trainLabels800.csv'
    test_data_fname = 'test200.csv'
    test_label_fname = 'testLabels200.csv'

    data, labels = get_data(data_fname, label_fname)
    test_data, test_labels = get_data(test_data_fname, test_label_fname)

    lr = LogisticRegression()
    model = lr.fit(data, labels)
    y_pred = model.predict(test_data)
    print 'accuracy:', model.score(test_data, test_labels)
    print 'recall:', recall_score(test_labels, y_pred)

Experimental result

Adam (alpha automatic update)
accuracy: 0.685
recall: 0.777
Adam (fixed alpha)
accuracy: 0.815
recall: 0.809
SGDNesterov
accuracy: 0.755
recall: 0.755
lasso logistic regression
accuracy: 0.830
recall: 0.851

I thought that there was little data and it would be an obstacle to optimizing alpha, so I fixed alpha, and the accuracy improved. (I think that the optimization of alpha is for learning well with a deep model. I feel that it is correct to remove this time.) The vhat looks like this:

[  1.01440993   1.03180357   0.95435572   0.9297218   21.07682674
   0.94186528   4.65151802   5.00409033   0.99502491   1.04799237
   1.03563918   1.01860187  24.53366684   0.99717628   4.56930882
   0.99764606   0.95268578   1.00007278   4.94184457   0.96486898
   0.9665374    0.89604119   5.77110996  18.18369869   1.06281087
   0.98975868   1.01176115   1.06529464   5.55623853   5.52265492
   1.00727474   1.00094686   5.23052382   1.0256952    4.53388121
   1.0003947    5.4024963    0.98662918   4.86086664   4.4993808 ]
 [  0.70211545   0.70753131   0.68225521   0.65766954  14.23198314
   0.66457665   3.00986265   3.73453379   0.70920046   0.71507415
   0.7611441    0.71763729  12.45908405   0.71818535   2.44396968
   0.72608443   0.62573733   0.697053     3.06402831   0.64277643
   0.68346131   0.59957144   3.99612146  11.69024055   0.75532095
   0.68612789   0.69620363   0.75933189   3.41557243   4.05831119
   0.7255359    0.72140109   3.55049677   0.73630123   2.77828369
   0.69178571   3.82801224   0.68480352   3.70976494   2.96358695]

It may be necessary to change the handling of automatic update of alpha for each data, but I found that Adam seems to be strong (small average feeling) even with small data. Adam itself is already implemented in chainer etc., so I think that you should try compatibility with various data using it.

We apologize for the inconvenience, but we would appreciate it if you could point out any mistakes.

Recommended Posts

Adam Paper Summary and Code
[Memo] Test code summary
Code reduction-Pipeline and Function Transformer-
Quicksort details and code examples
Summary and common errors about cron
[Code] Module and Python version output
Python parallel / parallel processing sample code summary
Anomaly detection introduction and method summary
Summary of Python indexes and slices
[Python] Summary of conversion between character strings and numerical values (ascii code)