The previous article is here
In the previous articles we put together a list of activation functions and a list of gradient descent methods. In this article we implement the functions `get_act` and `get_opt` that look up and instantiate those classes by name, in activators.py and [optimizers.py](https://qiita.com/kuroitu/items/36a58b37690d570dc618#%E5%AE%9F%E8%A3%85%E3%82%B3%E3%83%BC%E3%83%89%E4%BE%8B).
We also introduce and implement the loss functions in common use.
- [Activation function lookup](#activation-function-lookup)
- [Gradient descent method lookup](#gradient-descent-method-lookup)
- [Loss functions](#loss-functions)
  - [Squared error](#squared-error)
  - [Binary cross entropy](#binary-cross-entropy)
  - [Multi-class cross entropy](#multi-class-cross-entropy)
  - [Loss function lookup](#loss-function-lookup)
- Conclusion
First, here is the full code for the activation functions.
activators.py
import numpy as np
class Activator():
def __init__(self, *args,**kwds):
pass
def forward(self, *args,**kwds):
raise Exception("Not Implemented")
def backward(self, *args,**kwds):
raise Exception("Not Implemented")
def update(self, *args,**kwds):
pass
class step(Activator):
def forward(self, x, *args,**kwds):
return np.where(x > 0, 1, 0)
def backward(self, x, *args,**kwds):
return np.zeros_like(x)
class identity(Activator):
def forward(self, x, *args,**kwds):
return x
def backward(self, x, *args,**kwds):
return np.ones_like(x)
class bentIdentity(Activator):
def forward(self, x, *args,**kwds):
return 0.5*(np.sqrt(x**2 + 1) - 1) + x
def backward(self, x, *args,**kwds):
return 0.5*x/np.sqrt(x**2 + 1) + 1
class hardShrink(Activator):
def __init__(self, lambda_=0.5, *args,**kwds):
self.lambda_ = lambda_
super().__init__(*args,**kwds)
def forward(self, x, *args,**kwds):
return np.where((-self.lambda_ <= x) & (x <= self.lambda_),
0, x)
def backward(self, x, *args,**kwds):
return np.where((-self.lambda_ <= x) & (x <= self.lambda_),
0, 1)
class softShrink(Activator):
def __init__(self, lambda_=0.5, *args,**kwds):
self.lambda_ = lambda_
super().__init__(*args,**kwds)
def forward(self, x, *args,**kwds):
return np.where(x < -self.lambda_, x + self.lambda_,
np.where(x > self.lambda_, x - self.lambda_, 0))
def backward(self, x, *args,**kwds):
return np.where((-self.lambda_ <= x) & (x <= self.lambda_),
0, 1)
class threshold(Activator):
def __init__(self, threshold, value, *args,**kwds):
self.threshold = threshold
self.value = value
super().__init__(*args,**kwds)
def forward(self, x, *args,**kwds):
return np.where(x > self.threshold, x, self.value)
def backward(self, x, *args,**kwds):
return np.where(x > self.threshold, 1, 0)
class sigmoid(Activator):
def forward(self, x, *args,**kwds):
return 1/(1 + np.exp(-x))
def backward(self, x, y, *args,**kwds):
return y*(1 - y)
class hardSigmoid(Activator):
def forward(self, x, *args,**kwds):
return np.clip(0.2*x + 0.5, 0, 1)
def backward(self, x, *args,**kwds):
return np.where((x > 2.5) | (x < -2.5), 0, 0.2)
class logSigmoid(Activator):
def forward(self, x, *args,**kwds):
return -np.log(1 + np.exp(-x))
def backward(self, x, *args,**kwds):
return 1/(1 + np.exp(x))
class act_tanh(Activator):
def forward(self, x, *args,**kwds):
return np.tanh(x)
def backward(self, x, *args,**kwds):
return 1 - np.tanh(x)**2
class hardtanh(Activator):
def forward(self, x, *args,**kwds):
return np.clip(x, -1, 1)
def backward(self, x, *args,**kwds):
return np.where((-1 <= x) & (x <= 1), 1, 0)
class tanhShrink(Activator):
def forward(self, x, *args,**kwds):
return x - np.tanh(x)
def backward(self, x, *args,**kwds):
return np.tanh(x)**2
class ReLU(Activator):
def forward(self, x, *args,**kwds):
return np.maximum(0, x)
def backward(self, x, *args,**kwds):
return np.where(x > 0, 1, 0)
class ReLU6(Activator):
def forward(self, x, *args,**kwds):
return np.clip(x, 0, 6)
def backward(self, x, *args,**kwds):
return np.where((0 < x) & (x < 6), 1, 0)
class leakyReLU(Activator):
def __init__(self, alpha=1e-2, *args,**kwds):
self.alpha = alpha
super().__init__(*args,**kwds)
def forward(self, x, *args,**kwds):
return np.maximum(self.alpha * x, x)
def backward(self, x, *args,**kwds):
return np.where(x < 0, self.alpha, 1)
class ELU(Activator):
def __init__(self, alpha=1., *args,**kwds):
self.alpha = alpha
super().__init__(*args,**kwds)
def forward(self, x, *args,**kwds):
return np.where(x >= 0, x, self.alpha*(np.exp(x) - 1))
def backward(self, x, *args,**kwds):
return np.where(x >= 0, 1, self.alpha*np.exp(x))
class SELU(Activator):
def __init__(self, lambda_=1.0507, alpha=1.67326, *args,**kwds):
self.lambda_ = lambda_
self.alpha = alpha
super().__init__(*args,**kwds)
def forward(self, x, *args,**kwds):
return np.where(x >= 0,
self.lambda_*x,
self.lambda_*self.alpha*(np.exp(x) - 1))
def backward(self, x, *args,**kwds):
return np.where(x >= 0,
self.lambda_,
self.lambda_*self.alpha*np.exp(x))
class CELU(Activator):
def __init__(self, alpha=1., *args,**kwds):
self.alpha = alpha
super().__init__(*args,**kwds)
def forward(self, x, *args,**kwds):
return np.where(x >= 0,
x,
self.alpha*(np.exp(x/self.alpha) - 1))
def backward(self, x, *args,**kwds):
return np.where(x >= 0, 1, np.exp(x/self.alpha))
class softmax(Activator):
def forward(self, x, *args,**kwds):
return np.exp(x)/np.sum(np.exp(x))
def backward(self, x, *args,**kwds):
return np.exp(x)*(np.sum(np.exp(x))
- np.exp(x))/np.sum(np.exp(x))**2
class softmin(Activator):
def forward(self, x, *args,**kwds):
return np.exp(-x)/np.sum(np.exp(-x))
def backward(self, x, *args,**kwds):
return -(np.exp(x)*(np.sum(np.exp(-x)) - np.exp(x))
/np.sum(np.exp(-x))**2)
class logSoftmax(Activator):
def forward(self, x, *args,**kwds):
return np.log(np.exp(x)/np.sum(np.exp(x)))
def backward(self, x, *args,**kwds):
y = np.sum(np.exp(x))
return (y - np.exp(x))/y
class softplus(Activator):
def forward(self, x, *args,**kwds):
return np.logaddexp(x, 0)
def backward(self, x, *args,**kwds):
return 1/(1 + np.exp(-x))
class softsign(Activator):
def forward(self, x, *args,**kwds):
return x/(1 + np.abs(x))
def backward(self, x, *args,**kwds):
return 1/(1 + np.abs(x)) ** 2
class Swish(Activator):
def __init__(self, beta=1, *args,**kwds):
self.beta = beta
super().__init__(*args,**kwds)
def forward(self, x, *args,**kwds):
return x/(1 + np.exp(-self.beta*x))
def backward(self, x, y, *args,**kwds):
return self.beta*y + (1 - self.beta*y)/(1 + np.exp(-self.beta*x))
def d2y(self, x, *args,**kwds):
return (-0.25*self.beta*(self.beta*x*np.tanh(0.5*self.beta*x) - 2)
*(1 - np.tanh(0.5*self.beta*x)**2))
class Mish(Activator):
def forward(self, x, *args,**kwds):
return x*np.tanh(np.logaddexp(x, 0))
def backward(self, x, *args,**kwds):
omega = (4*(x + 1) + 4*np.exp(2*x)
+ np.exp(3*x) + (4*x + 6)*np.exp(x))
delta = 2*np.exp(x) + np.exp(2*x) + 2
return np.exp(x)*omega/delta**2
def d2y(self, x, *args,**kwds):
omega = (2*(x + 2)
+ np.exp(x)*(np.exp(x)*(-2*np.exp(x)*(x - 1) - 3*x + 6)
+ 2*(x + 4)))
delta = np.exp(x)*(np.exp(x) + 2) + 2
return 4*np.exp(x)*omega/delta**3
class tanhExp(Activator):
def forward(self, x, *args,**kwds):
return x*np.tanh(np.exp(x))
def backward(self, x, *args,**kwds):
tanh_exp = np.tanh(np.exp(x))
return tanh_exp - x*np.exp(x)*(tanh_exp**2 - 1)
def d2y(self, x, *args,**kwds):
tanh_exp = np.tanh(np.exp(x))
return (np.exp(x)*(-x + 2*np.exp(x)*x*tanh_exp - 2)
*(tanh_exp**2 - 1))
class maxout(Activator):
def __init__(self, n_prev, n, k, wb_width=5e-2, *args,**kwds):
self.n_prev = n_prev
self.n = n
self.k = k
        self.w = wb_width*np.random.rand(n_prev, n*k)
self.b = wb_width*np.random.rand(n*k)
super().__init__(*args,**kwds)
def forward(self, x, *args,**kwds):
self.x = x.copy()
self.z = np.dot(self.w.T, x) + self.b
self.z = self.z.reshape(self.n, self.k)
self.y = np.max(self.z, axis=1)
return self.y
    def backward(self, g, *args,**kwds):
        # Only the unit that attained the max in each group receives gradient.
        dz = np.zeros((self.n, self.k))
        dz[np.arange(self.n), np.argmax(self.z, axis=1)] = g
        dz = dz.reshape(self.n*self.k)
        self.dw = np.outer(self.x, dz)
        self.db = dz
        return np.dot(self.w, dz)
get_act.py
_act_dic = {"step": step,
"identity": identity,
"bent-identity": bentIdentity,
"hard-shrink": hardShrink,
"soft-shrink": softShrink,
"threshold": threshold,
"sigmoid": sigmoid,
"hard-sigmoid": hardSigmoid,
"log-sigmoid": logSigmoid,
"tanh": act_tanh,
"tanh-shrink": tanhShrink,
"hard-tanh":hardtanh,
"ReLU": ReLU,
"ReLU6": ReLU6,
"leaky-ReLU": leakyReLU,
"ELU": ELU,
"SELU": SELU,
"CELU": CELU,
"softmax": softmax,
"softmin": softmin,
"log-softmax": logSoftmax,
"softplus": softplus,
"softsign": softsign,
"Swish": Swish,
"Mish": Mish,
"tanhExp": tanhExp,
}
def get_act(name, *args,**kwds):
if name in _act_dic.keys():
activator = _act_dic[name](*args,**kwds)
else:
raise ValueError(name, ": Unknown activator")
return activator
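As a quick sanity check, `get_act` can be used like this (a minimal sketch, assuming the listing above is saved as activators.py):

```python
import numpy as np
from activators import get_act  # assumes the listing above is saved as activators.py

act = get_act("sigmoid")
x = np.array([-1.0, 0.0, 1.0])
y = act.forward(x)         # array([0.26894142, 0.5, 0.73105858])
dy = act.backward(x, y)    # sigmoid's backward uses the output y: y*(1 - y)

# Parameterized activators take their arguments through get_act:
lrelu = get_act("leaky-ReLU", alpha=0.1)
```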
Next, the gradient descent methods get the same treatment: a dictionary of the classes plus a getter function.
optimizers.py
import numpy as np
class Optimizer():
"""
A superclass inherited by the optimization method.
"""
def __init__(self, *args,**kwds):
pass
def update(self, *args,**kwds):
pass
class SGD(Optimizer):
def __init__(self, eta=1e-2, *args,**kwds):
super().__init__(*args,**kwds)
self.eta = eta
def update(self, grad_w, grad_b, *args,**kwds):
dw = -self.eta*grad_w
db = -self.eta*grad_b
return dw, db
class MSGD(Optimizer):
def __init__(self, eta=1e-2, mu=0.9, *args,**kwds):
super().__init__(*args,**kwds)
self.eta = eta
self.mu = mu
#Hold the value of the previous step
self.dw = 0
self.db = 0
def update(self, grad_w, grad_b, *args,**kwds):
dw = self.mu*self.dw - (1-self.mu)*self.eta*grad_w
db = self.mu*self.db - (1-self.mu)*self.eta*grad_b
        # Assigning a view instead of a copy is fine here
        # because these values are never modified afterwards.
self.dw = dw
self.db = db
return dw, db
class NAG(Optimizer):
def __init__(self, eta=1e-2, mu=0.9, *args,**kwds):
super().__init__(*args,**kwds)
self.eta = eta
self.mu = mu
#Holds the value of the previous step
self.dw = 0
self.db = 0
def update(self, grad_w, grad_b, w=0, b=0, dfw=None, dfb=None,
nargs=2, *args,**kwds):
if nargs == 1:
grad_w = dfw(w + self.mu*self.dw)
grad_b = 0
elif nargs == 2:
grad_w = dfw(w + self.mu*self.dw, b + self.mu*self.db)
grad_b = dfb(w + self.mu*self.dw, b + self.mu*self.db)
dw = self.mu*self.dw - (1-self.mu)*self.eta*grad_w
db = self.mu*self.db - (1-self.mu)*self.eta*grad_b
        # Assigning a view instead of a copy is fine here
        # because these values are never modified afterwards.
self.dw = dw
self.db = db
return dw, db
class AdaGrad(Optimizer):
def __init__(self, eta=1e-3, *args,**kwds):
super().__init__(*args,**kwds)
self.eta = eta
#Hold the value of the previous step
self.gw = 0
self.gb = 0
def update(self, grad_w, grad_b, *args,**kwds):
self.gw += grad_w*grad_w
self.gb += grad_b*grad_b
dw = -self.eta*grad_w/np.sqrt(self.gw)
db = -self.eta*grad_b/np.sqrt(self.gb)
return dw, db
class RMSprop(Optimizer):
def __init__(self, eta=1e-2, rho=0.99, eps=1e-8, *args,**kwds):
super().__init__(*args,**kwds)
self.eta = eta
self.rho = rho
self.eps = eps
#Hold the value of the previous step
self.vw = 0
self.vb = 0
def update(self, grad_w, grad_b, *args,**kwds):
self.vw += (1-self.rho)*(grad_w**2 - self.vw)
self.vb += (1-self.rho)*(grad_b**2 - self.vb)
dw = -self.eta*grad_w/np.sqrt(self.vw+self.eps)
db = -self.eta*grad_b/np.sqrt(self.vb+self.eps)
return dw, db
class AdaDelta(Optimizer):
def __init__(self, rho=0.95, eps=1e-6, *args,**kwds):
super().__init__(*args,**kwds)
self.rho = rho
self.eps = eps
#Hold the value of the previous step
self.vw = 0
self.vb = 0
self.uw = 0
self.ub = 0
def update(self, grad_w, grad_b, *args,**kwds):
self.vw += (1-self.rho)*(grad_w**2 - self.vw)
self.vb += (1-self.rho)*(grad_b**2 - self.vb)
dw = -grad_w*np.sqrt(self.uw+self.eps)/np.sqrt(self.vw+self.eps)
db = -grad_b*np.sqrt(self.ub+self.eps)/np.sqrt(self.vb+self.eps)
self.uw += (1-self.rho)*(dw**2 - self.uw)
self.ub += (1-self.rho)*(db**2 - self.ub)
return dw, db
class Adam(Optimizer):
def __init__(self, alpha=1e-3, beta1=0.9, beta2=0.999, eps=1e-8,
*args,**kwds):
super().__init__(*args,**kwds)
self.alpha = alpha
self.beta1 = beta1
self.beta2 = beta2
self.eps = eps
#Hold the value of the previous step
self.mw = 0
self.mb = 0
self.vw = 0
self.vb = 0
def update(self, grad_w, grad_b, t=1, *args,**kwds):
self.mw += (1-self.beta1)*(grad_w - self.mw)
self.mb += (1-self.beta1)*(grad_b - self.mb)
self.vw += (1-self.beta2)*(grad_w**2 - self.vw)
self.vb += (1-self.beta2)*(grad_b**2 - self.vb)
alpha_t = self.alpha*np.sqrt(1-self.beta2**t)/(1-self.beta1**t)
dw = -alpha_t*self.mw/(np.sqrt(self.vw+self.eps))
db = -alpha_t*self.mb/(np.sqrt(self.vb+self.eps))
return dw, db
class RMSpropGraves(Optimizer):
def __init__(self, eta=1e-4, rho=0.95, eps=1e-4, *args,**kwds):
super().__init__(*args,**kwds)
self.eta = eta
self.rho = rho
self.eps = eps
#Hold the value of the previous step
self.mw = 0
self.mb = 0
self.vw = 0
self.vb = 0
def update(self,grad_w, grad_b, *args,**kwds):
self.mw += (1-self.rho)*(grad_w - self.mw)
self.mb += (1-self.rho)*(grad_b - self.mb)
self.vw += (1-self.rho)*(grad_w**2 - self.vw)
self.vb += (1-self.rho)*(grad_b**2 - self.vb)
dw = -self.eta*grad_w/np.sqrt(self.vw - self.mw**2 + self.eps)
db = -self.eta*grad_b/np.sqrt(self.vb - self.mb**2 + self.eps)
return dw, db
class SMORMS3(Optimizer):
def __init__(self, eta=1e-3, eps=1e-8, *args,**kwds):
super().__init__(*args,**kwds)
self.eta = eta
self.eps = eps
#Hold the value of the previous step
self.zetaw = 0
self.zetab = 0
self.sw = 1
self.sb = 1
self.mw = 0
self.mb = 0
self.vw = 0
self.vb = 0
def update(self, grad_w, grad_b, *args,**kwds):
rhow = 1/(1+self.sw)
rhob = 1/(1+self.sb)
self.mw += (1-rhow)*(grad_w - self.mw)
self.mb += (1-rhob)*(grad_b - self.mb)
self.vw += (1-rhow)*(grad_w**2 - self.vw)
self.vb += (1-rhob)*(grad_b**2 - self.vb)
self.zetaw = self.mw**2 / (self.vw + self.eps)
        self.zetab = self.mb**2 / (self.vb + self.eps)
dw = -grad_w*(np.minimum(self.eta, self.zetaw)
/np.sqrt(self.vw + self.eps))
db = -grad_b*(np.minimum(self.eta, self.zetab)
/np.sqrt(self.vb + self.eps))
self.sw = 1 + (1 - self.zetaw)*self.sw
self.sb = 1 + (1 - self.zetab)*self.sb
return dw, db
class AdaMax(Optimizer):
def __init__(self, alpha=2e-3, beta1=0.9, beta2=0.999,
*args,**kwds):
super().__init__(*args,**kwds)
self.alpha = alpha
self.beta1 = beta1
self.beta2 = beta2
#Hold the value of the previous step
self.mw = 0
self.mb = 0
self.uw = 0
self.ub = 0
def update(self, grad_w, grad_b, t=1, *args,**kwds):
self.mw += (1-self.beta1)*(grad_w - self.mw)
self.mb += (1-self.beta1)*(grad_b - self.mb)
self.uw = np.maximum(self.beta2*self.uw, np.abs(grad_w))
self.ub = np.maximum(self.beta2*self.ub, np.abs(grad_b))
alpha_t = self.alpha/(1 - self.beta1**t)
dw = -alpha_t*self.mw/self.uw
db = -alpha_t*self.mb/self.ub
return dw, db
class Nadam(Optimizer):
def __init__(self, alpha=2e-3, mu=0.975, nu=0.999, eps=1e-8,
*args,**kwds):
super().__init__(*args,**kwds)
self.alpha = alpha
self.mu = mu
self.nu = nu
self.eps = eps
#Hold the value of the previous step
self.mw = 0
self.mb = 0
self.vw = 0
self.vb = 0
def update(self, grad_w, grad_b, t=1, *args,**kwds):
self.mw += (1-self.mu)*(grad_w - self.mw)
self.mb += (1-self.mu)*(grad_b - self.mb)
self.vw += (1-self.nu)*(grad_w**2 - self.vw)
self.vb += (1-self.nu)*(grad_b**2 - self.vb)
mhatw = (self.mu*self.mw/(1-self.mu**(t+1))
+ (1-self.mu)*grad_w/(1-self.mu**t))
mhatb = (self.mu*self.mb/(1-self.mu**(t+1))
+ (1-self.mu)*grad_b/(1-self.mu**t))
vhatw = self.nu*self.vw/(1-self.nu**t)
vhatb = self.nu*self.vb/(1-self.nu**t)
dw = -self.alpha*mhatw/np.sqrt(vhatw + self.eps)
db = -self.alpha*mhatb/np.sqrt(vhatb + self.eps)
return dw, db
class Eve(Optimizer):
def __init__(self, alpha=1e-3, beta1=0.9, beta2=0.999, beta3=0.999,
c=10, eps=1e-8, fstar=0, *args,**kwds):
super().__init__(*args,**kwds)
self.alpha = alpha
self.beta1 = beta1
self.beta2 = beta2
self.beta3 = beta3
self.c = c
self.eps = eps
#Hold the value of the previous step
self.mw = 0
self.mb = 0
self.vw = 0
self.vb = 0
self.f = 0
self.fstar = fstar
self.dtilde_w = 0
self.dtilde_b = 0
def update(self, grad_w, grad_b, t=1, f=1, *args,**kwds):
self.mw += (1-self.beta1)*(grad_w - self.mw)
self.mb += (1-self.beta1)*(grad_b - self.mb)
self.vw += (1-self.beta2)*(grad_w**2 - self.vw)
self.vb += (1-self.beta2)*(grad_b**2 - self.vb)
mhatw = self.mw/(1 - self.beta1**t)
mhatb = self.mb/(1 - self.beta1**t)
vhatw = self.vw/(1 - self.beta2**t)
vhatb = self.vb/(1 - self.beta2**t)
if t > 1:
d_w = (np.abs(f-self.fstar)
/(np.minimum(f, self.f) - self.fstar))
d_b = (np.abs(f-self.fstar)
/(np.minimum(f, self.f) - self.fstar))
dhat_w = np.clip(d_w, 1/self.c, self.c)
dhat_b = np.clip(d_b, 1/self.c, self.c)
self.dtilde_w += (1 - self.beta3)*(dhat_w - self.dtilde_w)
self.dtilde_b += (1 - self.beta3)*(dhat_b - self.dtilde_b)
else:
self.dtilde_w = 1
self.dtilde_b = 1
self.f = f
dw = -(self.alpha*mhatw
/(self.dtilde_w*(np.sqrt(vhatw) + self.eps)))
db = -(self.alpha*mhatb
/(self.dtilde_b*(np.sqrt(vhatb) + self.eps)))
return dw, db
class SantaE(Optimizer):
def __init__(self, eta=1e-2, sigma=0.95, lambda_=1e-8,
anne_func=lambda t, n: t**n, anne_rate=0.5,
burnin=100, C=5, N=16,
*args,**kwds):
"""
Args:
eta: Learning rate
            sigma: Corresponds, in other methods, to
                'rho' in RMSprop, AdaDelta, RMSpropGraves,
                'rhow' or 'rhob' in SMORMS3,
                'beta2' in Adam and Eve,
                'nu' in Nadam.
                Used to compute 'v'.
            lambda_: Called 'eps' (ε) in other methods.
            anne_func: Annealing function.
                Used to compute 'beta' at each timestep.
                Default is 'timestep'**'annealing rate'.
                The returned value should tend to infinity
                as 't' increases.
            anne_rate: Annealing rate.
                Used to compute 'beta' at each timestep.
                Passed as the second argument of 'anne_func'.
            burnin: Timestep at which to switch from exploration
                to refinement. This should be specified by users.
            C: Used to compute the initial 'alpha'.
            N: Minibatch size.
"""
super().__init__(*args,**kwds)
self.eta = eta
self.sigma = sigma
self.lambda_ = lambda_
self.anne_func = anne_func
self.anne_rate = anne_rate
self.burnin = burnin
self.N = N
# Keep one step before and Initialize.
self.alpha_w = np.sqrt(eta)*C
self.alpha_b = np.sqrt(eta)*C
self.vw = 0
self.vb = 0
self.gw = 0
self.gb = 0
def update(self, grad_w, grad_b, t=1, *args,**kwds):
        try:
            shape_w = grad_w.shape
        except AttributeError:
            # Scalar gradients have no 'shape' attribute.
            shape_w = (1, )
        try:
            shape_b = grad_b.shape
        except AttributeError:
            shape_b = (1, )
if t == 1:
# Initialize uw, ub.
self.uw = np.sqrt(self.eta)*np.random.randn(*shape_w)
self.ub = np.sqrt(self.eta)*np.random.randn(*shape_b)
self.vw = (self.sigma*self.vw
+ grad_w*grad_w * (1 - self.sigma) / self.N**2)
self.vb = (self.sigma*self.vb
+ grad_b*grad_b * (1 - self.sigma) / self.N**2)
gw = 1/np.sqrt(self.lambda_ + np.sqrt(self.vw))
gb = 1/np.sqrt(self.lambda_ + np.sqrt(self.vb))
beta = self.anne_func(t, self.anne_rate)
if t < self.burnin:
# Exploration.
self.alpha_w += self.uw*self.uw - self.eta/beta
self.alpha_b += self.ub*self.ub - self.eta/beta
uw = (self.eta/beta * (1 - self.gw/gw)/self.uw
+ np.sqrt(2*self.eta/beta * self.gw)
* np.random.randn(*shape_w))
ub = (self.eta/beta * (1 - self.gb/gb)/self.ub
+ np.sqrt(2*self.eta/beta * self.gb)
* np.random.randn(*shape_b))
else:
# Refinement.
uw = 0
ub = 0
uw += (1 - self.alpha_w)*self.uw - self.eta*gw*grad_w
ub += (1 - self.alpha_b)*self.ub - self.eta*gb*grad_b
# Update values.
self.uw = uw
self.ub = ub
self.gw = gw
self.gb = gb
dw = gw*uw
db = gb*ub
return dw, db
class SantaSSS(Optimizer):
def __init__(self, eta=1e-2, sigma=0.95, lambda_=1e-8,
anne_func=lambda t, n: t**n, anne_rate=0.5,
burnin=100, C=5, N=16,
*args,**kwds):
"""
Args:
eta: Learning rate
            sigma: Corresponds, in other methods, to
                'rho' in RMSprop, AdaDelta, RMSpropGraves,
                'rhow' or 'rhob' in SMORMS3,
                'beta2' in Adam and Eve,
                'nu' in Nadam.
                Used to compute 'v'.
            lambda_: Called 'eps' (ε) in other methods.
            anne_func: Annealing function.
                Used to compute 'beta' at each timestep.
                Default is 'timestep'**'annealing rate'.
                The returned value should tend to infinity
                as 't' increases.
            anne_rate: Annealing rate.
                Used to compute 'beta' at each timestep.
                Passed as the second argument of 'anne_func'.
            burnin: Timestep at which to switch from exploration
                to refinement. This should be specified by users.
            C: Used to compute the initial 'alpha'.
            N: Minibatch size.
"""
super().__init__(*args,**kwds)
self.eta = eta
self.sigma = sigma
self.lambda_ = lambda_
self.anne_func = anne_func
self.anne_rate = anne_rate
self.burnin = burnin
self.N = N
# Keep one step before and Initialize.
self.alpha_w = np.sqrt(eta)*C
self.alpha_b = np.sqrt(eta)*C
self.vw = 0
self.vb = 0
self.gw = 0
self.gb = 0
def update(self, grad_w, grad_b, t=1, *args,**kwds):
        try:
            shape_w = grad_w.shape
        except AttributeError:
            # Scalar gradients have no 'shape' attribute.
            shape_w = (1, )
        try:
            shape_b = grad_b.shape
        except AttributeError:
            shape_b = (1, )
if t == 1:
# Initialize uw, ub.
self.uw = np.sqrt(self.eta)*np.random.randn(*shape_w)
self.ub = np.sqrt(self.eta)*np.random.randn(*shape_b)
self.vw = (self.sigma*self.vw
+ grad_w*grad_w * (1 - self.sigma) / self.N**2)
self.vb = (self.sigma*self.vb
+ grad_b*grad_b * (1 - self.sigma) / self.N**2)
gw = 1/np.sqrt(self.lambda_ + np.sqrt(self.vw))
gb = 1/np.sqrt(self.lambda_ + np.sqrt(self.vb))
dw = 0.5*gw*self.uw
db = 0.5*gb*self.ub
beta = self.anne_func(t, self.anne_rate)
if t < self.burnin:
# Exploration.
self.alpha_w += (self.uw*self.uw - self.eta/beta)*0.5
self.alpha_b += (self.ub*self.ub - self.eta/beta)*0.5
uw = np.exp(-0.5*self.alpha_w)*self.uw
ub = np.exp(-0.5*self.alpha_b)*self.ub
uw += (-gw*grad_w*self.eta
+ np.sqrt(2*self.gw*self.eta/beta)
* np.random.randn(*shape_w)
+ self.eta/beta*(1-self.gw/gw)/self.uw)
ub += (-gb*grad_b*self.eta
+ np.sqrt(2*self.gb*self.eta/beta)
* np.random.randn(*shape_b)
+ self.eta/beta*(1-self.gb/gb)/self.ub)
uw *= np.exp(-0.5*self.alpha_w)
ub *= np.exp(-0.5*self.alpha_b)
self.alpha_w += (uw*uw - self.eta/beta)*0.5
self.alpha_b += (ub*ub - self.eta/beta)*0.5
else:
# Refinement.
uw = np.exp(-0.5*self.alpha_w)*self.uw
ub = np.exp(-0.5*self.alpha_b)*self.ub
uw -= gw*grad_w*self.eta
ub -= gb*grad_b*self.eta
uw *= np.exp(-0.5*self.alpha_w)
ub *= np.exp(-0.5*self.alpha_b)
# Update values.
self.uw = uw
self.ub = ub
self.gw = gw
self.gb = gb
dw = gw*uw*0.5
db = gb*ub*0.5
return dw, db
class AMSGrad(Optimizer):
def __init__(self, alpha=1e-3, beta1=0.9, beta2=0.999, eps=1e-8,
*args,**kwds):
super().__init__(*args,**kwds)
self.alpha = alpha
self.beta1 = beta1
self.beta2 = beta2
self.eps = eps
#Hold the value of the previous step
self.mw = 0
self.mb = 0
self.vw = 0
self.vb = 0
self.vhatw = 0
self.vhatb = 0
def update(self, grad_w, grad_b, t=1, *args,**kwds):
self.mw += (1-self.beta1)*(grad_w - self.mw)
self.mb += (1-self.beta1)*(grad_b - self.mb)
self.vw += (1-self.beta2)*(grad_w**2 - self.vw)
self.vb += (1-self.beta2)*(grad_b**2 - self.vb)
self.vhatw = np.maximum(self.vhatw, self.vw)
self.vhatb = np.maximum(self.vhatb, self.vb)
alpha_t = self.alpha / np.sqrt(t)
dw = - alpha_t * self.mw/np.sqrt(self.vhatw + self.eps)
db = - alpha_t * self.mb/np.sqrt(self.vhatb + self.eps)
return dw, db
class AdaBound(Optimizer):
def __init__(self, alpha=1e-3, eta=1e-1, beta1=0.9, beta2=0.999,
eps=1e-8, *args,**kwds):
super().__init__(*args,**kwds)
self.alpha = alpha
self.eta = eta
self.beta1 = beta1
self.beta2 = beta2
self.eps = eps
#Hold the value of the previous step
self.mw = 0
self.mb = 0
self.vw = 0
self.vb = 0
def update(self, grad_w, grad_b, t=1, *args,**kwds):
self.mw += (1-self.beta1)*(grad_w - self.mw)
self.mb += (1-self.beta1)*(grad_b - self.mb)
self.vw += (1-self.beta2)*(grad_w**2 - self.vw)
self.vb += (1-self.beta2)*(grad_b**2 - self.vb)
etal = self.eta*(1 - 1/((1-self.beta2)*t + 1))
etau = self.eta*(1 + 1/((1-self.beta2)*t + self.eps))
etahatw_t = np.clip(self.alpha/np.sqrt(self.vw), etal, etau)
etahatb_t = np.clip(self.alpha/np.sqrt(self.vb), etal, etau)
etaw_t = etahatw_t/np.sqrt(t)
etab_t = etahatb_t/np.sqrt(t)
dw = - etaw_t*self.mw
db = - etab_t*self.mb
return dw, db
class AMSBound(Optimizer):
def __init__(self, alpha=1e-3, eta=1e-1, beta1=0.9, beta2=0.999,
eps=1e-8, *args,**kwds):
super().__init__(*args,**kwds)
self.alpha = alpha
self.eta = eta
self.beta1 = beta1
self.beta2 = beta2
self.eps = eps
#Hold the value of the previous step
self.mw = 0
self.mb = 0
self.vw = 0
self.vb = 0
self.vhatw = 0
self.vhatb = 0
def update(self, grad_w, grad_b, t=1, *args,**kwds):
self.mw += (1-self.beta1)*(grad_w - self.mw)
self.mb += (1-self.beta1)*(grad_b - self.mb)
self.vw += (1-self.beta2)*(grad_w**2 - self.vw)
self.vb += (1-self.beta2)*(grad_b**2 - self.vb)
self.vhatw = np.maximum(self.vhatw, self.vw)
self.vhatb = np.maximum(self.vhatb, self.vb)
etal = self.eta*(1 - 1/((1-self.beta2)*t + 1))
etau = self.eta*(1 + 1/((1-self.beta2)*t + self.eps))
etahatw_t = np.clip(self.alpha/np.sqrt(self.vhatw), etal, etau)
etahatb_t = np.clip(self.alpha/np.sqrt(self.vhatb), etal, etau)
etaw_t = etahatw_t/np.sqrt(t)
etab_t = etahatb_t/np.sqrt(t)
dw = - etaw_t*self.mw
db = - etab_t*self.mb
return dw, db
get_opt.py
_opt_dic = {
"SDG": SGD,
"MSGD": MSGD,
"NAG": NAG,
"AdaGrad": AdaGrad,
"RMSprop": RMSprop,
"AdaDelta": AdaDelta,
"Adam": Adam,
"RMSpropGraves": RMSpropGraves,
"SMORMS3": SMORMS3,
"AdaMax": AdaMax,
"Nadam": Nadam,
"Eve": Eve,
"SantaE": SantaE,
"SantaSSS": SantaSSS,
"AMSGrad": AMSGrad,
"AdaBound": AdaBound,
"AMSBound": AMSBound,
}
def get_opt(name, *args,**kwds):
if name in _opt_dic.keys():
optimizer = _opt_dic[name](*args,**kwds)
else:
raise ValueError(name, ": Unknown optimizer")
return optimizer
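`get_opt` works the same way (a minimal sketch, assuming the listing above is saved as optimizers.py):

```python
import numpy as np
from optimizers import get_opt  # assumes the listing above is saved as optimizers.py

opt = get_opt("Adam")
w = np.zeros(3)
b = 0.0
grad_w = np.array([0.1, -0.2, 0.3])       # gradients from backpropagation
grad_b = 0.05
dw, db = opt.update(grad_w, grad_b, t=1)  # deltas come back already negated,
w += dw                                   # so updates are simple additions
b += db
```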
This completes the lookup functions.
First of all, what is a loss function? Broadly speaking, the purpose of deep learning is to approximate some objective function with a neural network. This holds even for problems like image recognition where the objective function is not explicit: in handwritten digit recognition, for example, the network takes an image as input, performs some processing, and should output the digit (more precisely, a one-hot vector). Of course, we almost never know this objective function exactly; even for the digit recognition humans perform effortlessly, it is unclear what kind of processing underlies it. As it stands, then, there is no index to guide learning: we cannot tell how to learn or whether the current learning policy is correct. It would be like studying without a goal. And as long as we do not know the objective function exactly, we cannot measure the deviation from it directly.

This is where the concept of the **loss function** comes in. In short, the learning index is not "how close we are to the objective function" but "how far we are from it". Taking supervised learning as an example, we measure how far the output of the objective function (the correct value) is from the output of the approximating function (the predicted value), and try to drive that difference toward zero. The predicted value is computed by **forward propagation**; **backpropagation** transmits the error between the correct value and the predicted value back to each parameter; and the **learning rule** updates the parameters based on the gradients flowing through backpropagation. The **loss function** is the function that determines the error fed into backpropagation.
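To make this cycle concrete, here is a minimal self-contained sketch: a hypothetical one-parameter "network" $y = wx$ trained with the squared-error loss introduced below (none of this code is from the article's library):

```python
# Learning y = w*x so that it maps x = 2 to t = 6 (i.e. w -> 3).
w = 0.0                   # parameter to learn
x, t = 2.0, 6.0           # input and correct value
eta = 0.1                 # learning rate
for _ in range(50):
    y = w*x               # forward propagation -> predicted value
    grad = (y - t)*x      # backpropagation: dL/dw for L = 0.5*(y - t)**2
    w -= eta*grad         # learning rule: plain gradient descent step
print(w)                  # ~= 3.0
```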
That's the general idea of the loss function. Let's look at concrete examples.
First, the **squared error**, used in regression problems:
\mathcal{L}(y) = \cfrac{1}{2} (y - t)^2
Here $t$ is the correct value and $y$ is the predicted value (the expression applies element-wise). The reason for the factor of $\frac{1}{2}$ is that the loss must be differentiated for backpropagation, and the coefficient cancels there:
\cfrac{\partial \mathcal{L}}{\partial y} = \cfrac{1}{2} \times 2(y - t) = y - t
This keeps the backpropagation calculation simple. By the way, since the squared error above is the per-sample quantity used for backpropagation, the following **mean squared error** is used to judge whether learning has converged:
E = \cfrac{1}{N}\sum_{i=1}^{N}{\mathcal{L}(y_i)} = \cfrac{1}{N}\sum_{i=1}^{N}{\cfrac{1}{2}(y_i - t_i)^2}
$N$ is the number of data points. When this value almost stops changing, learning is considered to have converged (though that alone does not guarantee sufficient accuracy).
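As a quick numeric illustration (a sketch; the numbers are made up):

```python
import numpy as np

y = np.array([0.8, 0.1, 0.4])   # predicted values
t = np.array([1.0, 0.0, 0.5])   # correct values
L = 0.5*(y - t)**2               # per-element squared error: [0.02, 0.005, 0.005]
E = np.mean(L)                   # mean squared error: 0.01
```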
The implementation is as follows. Each loss is implemented as a class with `forward` and `backward` methods, so that it can be treated like a layer.
errors.py
class SquareError(Error):
def forward(self, y, t, *args,**kwds):
self.y = y
self.t = t
self.error = 0.5 * (y - t)**2
return self.error
def backward(self, *args,**kwds):
        return self.y - self.t
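A quick check that `backward` returns $y - t$ (assuming the `Error` base class from the full errors.py listing further down):

```python
import numpy as np
from errors import SquareError  # assumes the full errors.py listed below

err = SquareError()
y = np.array([0.8, 0.1])
t = np.array([1.0, 0.0])
err.forward(y, t)   # array([0.02 , 0.005])
err.backward()      # array([-0.2,  0.1]) == y - t
```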
Next is **binary cross entropy**, the loss function suited to an output layer whose activation is the sigmoid function; in other words, the loss function used for binary classification problems.
\mathcal{L}(y) = - t \log y - (1 - t) \log(1 - y)
Its derivative is
\cfrac{\partial \mathcal{L}}{\partial y} = \cfrac{y - t}{y(1 - y)}
and the derivative of the sigmoid function appears in the denominator. Therefore the gradient propagated through the output layer becomes
\underbrace{\cfrac{y - t}{y(1 - y)}}_{\textrm{derivative of the loss}} \times \underbrace{y(1 - y)}_{\textrm{derivative of the sigmoid}} = y - t
which is a simple form.
errors.py
class BinaryCrossEntropy(Error):
def forward(self, y, t, *args,**kwds):
self.y = y
self.t = t
self.error = - t*np.log(y) - (1 - t)*np.log(1 - y)
return self.error
def backward(self, *args,**kwds):
return (self.y - self.t) / (self.y*(1 - self.y))
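We can check this simplification numerically (a small sketch with made-up numbers):

```python
import numpy as np

u = np.array([-0.5, 1.2])           # pre-activation inputs to the output layer
t = np.array([0.0, 1.0])
y = 1/(1 + np.exp(-u))              # sigmoid output
dL = (y - t)/(y*(1 - y))            # derivative of the loss
dsig = y*(1 - y)                    # derivative of the sigmoid
print(np.allclose(dL*dsig, y - t))  # True
```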
Next is the **cross entropy error**, used in **multi-class classification problems** when the softmax function is the activation of the output layer.
\mathcal{L}(y) = - t \log y
It can be seen as the general form of binary cross entropy. Its derivative is
\cfrac{\partial \mathcal{L}}{\partial y_i} = -\cfrac{t_i}{y_i}
However, when the softmax function is used as the activation of the output layer, the partial derivative with respect to the input $x_i$ works out as follows:
\begin{align}
\left( \cfrac{\partial \mathcal{L}}{\partial y} \times \cfrac{\partial y}{\partial x} \right)_i &= \sum_{j=1}^{n}{\cfrac{\partial \mathcal{L}}{\partial y_j} \times \cfrac{\partial y_j}{\partial x_i}} \\
&= \sum_{j=1}^{n}{\left\{
\begin{array}{ccc}
-\cfrac{t_i}{y_i} \times y_i (1 - y_i) &= t_i y_i - t_i & (j = i) \\
-\cfrac{t_j}{y_j} \times (-y_i y_j) &= t_j y_i & (j \ne i)
\end{array}
\right.} \\
&= \underbrace{t_i y_i - t_i}_{j = i} + \underbrace{y_i \sum_{j=1, j \ne i}^{n}{t_j}}_{j \ne i} \\
&= y_i \sum_{j=1}^{n}{t_j} - t_i \\
&= y_i - t_i \quad \left( \because t \textrm{ is a one-hot vector, so } \sum_{j=1}^{n}{t_j} = 1 \right)
\end{align}
which is again a beautifully simple form. Expressed as a computational graph (figure omitted), the derivation looks quite convoluted, but the result is this simple.
errors.py
class CrossEntropy(Error):
def forward(self, y, t, *args,**kwds):
self.y = y
self.t = t
self.error = - t*np.log(y)
return self.error
def backward(self, *args,**kwds):
return - self.t/self.y
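The whole chain can also be verified numerically (a sketch, assuming a single-sample softmax as implemented in activators.py above):

```python
import numpy as np

x = np.array([0.3, -1.0, 2.0])     # inputs to the output layer
t = np.array([0.0, 0.0, 1.0])      # one-hot correct vector
y = np.exp(x)/np.sum(np.exp(x))    # softmax output
dL = -t/y                          # derivative of the cross entropy w.r.t. y
J = np.diag(y) - np.outer(y, y)    # softmax Jacobian: dy_j/dx_i
print(np.allclose(J @ dL, y - t))  # True: the chain rule gives y - t
```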
We give the loss functions the same dictionary treatment. Here is the full errors.py:
errors.py
import numpy as np
class Error():
def __init__(self, *args,**kwds):
self.error = 0
def forward(self, *args,**kwds):
pass
def backward(self, *args,**kwds):
pass
def total_error(self, *args,**kwds):
return np.sum(self.error)/self.error.size
class SquareError(Error):
def forward(self, y, t, *args,**kwds):
self.y = y
self.t = t
self.error = 0.5 * (y - t)**2
return self.error
def backward(self, *args,**kwds):
return self.y - self.t
class BinaryCrossEntropy(Error):
def forward(self, y, t, *args,**kwds):
self.y = y
self.t = t
self.error = - t*np.log(y) - (1 - t)*np.log(1 - y)
return self.error
def backward(self, *args,**kwds):
return (self.y - self.t) / (self.y*(1 - self.y))
class CrossEntropy(Error):
def forward(self, y, t, *args,**kwds):
self.y = y
self.t = t
self.error = - t*np.log(y)
return self.error
def backward(self, *args,**kwds):
return - self.t/self.y
get_err.py
_err_dic = {"Square": SquareError,
"Binary": BinaryCrossEntropy,
"Cross": CrossEntropy,
}
def get_err(name, *args,**kwds):
if name in _err_dic.keys():
errfunc = _err_dic[name](*args,**kwds)
else:
raise ValueError(name, ": Unknown error function")
return errfunc
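Putting the three lookup functions together, here is a toy single-neuron binary classifier (a sketch; it assumes `get_act`, `get_opt`, and `get_err` were appended to activators.py, optimizers.py, and errors.py respectively):

```python
import numpy as np
from activators import get_act   # assumes get_act was appended to activators.py
from optimizers import get_opt   # assumes get_opt was appended to optimizers.py
from errors import get_err       # assumes get_err was appended to errors.py

act = get_act("sigmoid")
opt = get_opt("Adam")
err = get_err("Binary")

# Single neuron y = sigmoid(w*x + b) learning to separate x around ~0.15.
w, b = 0.0, 0.0
x = np.array([0.5, -1.0, 1.5, -0.2])
t = np.array([1.0, 0.0, 1.0, 0.0])
for step in range(1, 201):
    u = w*x + b
    y = act.forward(u)
    err.forward(y, t)
    grad_u = err.backward()*act.backward(u, y)   # simplifies to y - t
    dw, db = opt.update(np.sum(grad_u*x), np.sum(grad_u), t=step)
    w += dw
    b += db
print(err.total_error())   # mean binary cross entropy after training
```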
Someday I may also summarize the loss functions used elsewhere in machine learning... though before that, perhaps I need to study those other machine learning techniques first.
- Introduction to Deep Learning ~ Basics ~
- Introduction to Deep Learning ~ Coding Preparation ~
- Introduction to Deep Learning ~ Forward Propagation ~
- Introduction to Deep Learning ~ Backpropagation ~
- Introduction to Deep Learning ~ Learning Rules ~
- Introduction to Deep Learning ~ Lookup Functions and Loss Functions ~
- List of activation functions (2020)
- Gradient descent method list (2020)
- See and understand! Comparison of optimization methods (2020)
- Thorough understanding of im2col
- Complete understanding of numpy.pad function