Experiment with various optimization algorithms with a neural network

The implementation is in `dnn/optimizers.py` of **this GitHub repository**.

Introduction

Using MNIST as a benchmark, I tried out the following optimization algorithms with a simple neural network and compared their accuracy on the evaluation data.

Implementation

The implementation uses Theano, and I have uploaded the source code to my GitHub repository.

For the optimization methods, I referred to the following resources:

- https://gist.github.com/SnippyHolloW/67effa81dd1cd5a488b4
- https://gist.github.com/skaae/ae7225263ca8806868cb
- http://chainer.readthedocs.org/en/stable/reference/optimizers.html?highlight=optimizers
- http://qiita.com/skitaoka/items/e6afbe238cd69c899b2a

In the code below, `params` (or `self.params`) holds the weights and biases of the entire network. Since training is done with stochastic gradient descent, we need the gradient of the loss value `loss` with respect to `params`; `gparams` (or `self.gparams`) corresponds to that, and it is computed with `T.grad(loss, param)`. The `Optimizer` class covers everything up to computing these gradients, and SGD, Momentum SGD, and the other methods are implemented by inheriting from it.

optimizers.py


from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T


class Optimizer(object):
	def __init__(self, params=None):
		if params is None:
			raise NotImplementedError()
		self.params = params

	def updates(self, loss=None):
		if loss is None:
			raise NotImplementedError()

		# Maps each shared variable to its update expression; subclasses fill this in.
		self.updates = OrderedDict()
		# Gradient of the loss with respect to every parameter.
		self.gparams = [T.grad(loss, param) for param in self.params]

By the way, the `self.updates` dictionary here is what is used to update the weights and the other parameters.

SGD

optimizers.py


class SGD(Optimizer):
	def __init__(self, learning_rate=0.01, params=None):
		super(SGD, self).__init__(params=params)
		self.learning_rate = learning_rate

	def updates(self, loss=None):
		super(SGD, self).updates(loss=loss)

		for param, gparam in zip(self.params, self.gparams):
			# Plain gradient descent step.
			self.updates[param] = param - self.learning_rate * gparam

		return self.updates
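
For reference, here is a minimal self-contained sketch of how the returned updates dictionary can be wired into a training function. The tiny softmax model and the names `x`, `y`, `W`, `b` are placeholders of my own, not part of the post's network:

```python
import numpy as np
import theano
import theano.tensor as T

# Placeholder model: a softmax classifier on 784-dimensional inputs (e.g. MNIST).
x = T.matrix('x')
y = T.ivector('y')
W = theano.shared(np.zeros((784, 10), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(10, dtype=theano.config.floatX), name='b')

p_y = T.nnet.softmax(T.dot(x, W) + b)
loss = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])  # negative log-likelihood

# The optimizer maps every parameter to its update expression,
# and theano.function applies those updates on each call.
optimizer = SGD(learning_rate=0.01, params=[W, b])
train = theano.function(inputs=[x, y], outputs=loss,
                        updates=optimizer.updates(loss=loss))

# train(batch_x, batch_y) now performs one SGD step and returns the loss.
```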

Momentum SGD

optimizers.py


class MomentumSGD(Optimizer):
	def __init__(self, learning_rate=0.01, momentum=0.9, params=None):
		super(MomentumSGD, self).__init__(params=params)
		self.learning_rate = learning_rate
		self.momentum = momentum
		self.vs = [build_shared_zeros(t.shape.eval(), 'v') for t in self.params]

	def updates(self, loss=None):
		super(MomentumSGD, self).updates(loss=loss)

		for v, param, gparam in zip(self.vs, self.params, self.gparams):
			_v = v * self.momentum
			_v = _v - self.learning_rate * gparam
			self.updates[param] = param + _v
			self.updates[v] = _v

		return self.updates
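
In update-rule form, the code above is the classical momentum update, with learning rate $\eta$, momentum $\mu$, and gradient $g$:

```math
v \leftarrow \mu v - \eta g, \qquad \theta \leftarrow \theta + v
```

The helper `build_shared_zeros` used here (and in the optimizers below) is not included in the snippets. Assuming it simply creates a zero-initialized shared variable of a given shape, a minimal sketch would be:

```python
import numpy as np
import theano


def build_shared_zeros(shape, name):
	# Assumed helper (not shown in the post): a zero-initialized shared
	# variable used to hold persistent optimizer state such as velocities.
	return theano.shared(
		value=np.zeros(shape, dtype=theano.config.floatX),
		name=name,
		borrow=True,
	)
```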

AdaGrad

optimizers.py


class AdaGrad(Optimizer):
	def __init__(self, learning_rate=0.01, eps=1e-6, params=None):
		super(AdaGrad, self).__init__(params=params)

		self.learning_rate = learning_rate
		self.eps = eps
		self.accugrads = [build_shared_zeros(t.shape.eval(),'accugrad') for t in self.params]

	def updates(self, loss=None):
		super(AdaGrad, self).updates(loss=loss)

		for accugrad, param, gparam\
		in zip(self.accugrads, self.params, self.gparams):
			agrad = accugrad + gparam * gparam
			dx = - (self.learning_rate / T.sqrt(agrad + self.eps)) * gparam
			self.updates[param] = param + dx
			self.updates[accugrad] = agrad

		return self.updates
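
In update-rule form, the code above accumulates the squared gradients in `accugrad` and scales each parameter's step accordingly (gradient $g$, learning rate $\eta$):

```math
r \leftarrow r + g^{2}, \qquad \theta \leftarrow \theta - \frac{\eta}{\sqrt{r + \varepsilon}}\, g
```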

RMSprop

optimizers.py


class RMSprop(Optimizer):
	def __init__(self, learning_rate=0.001, alpha=0.99, eps=1e-8, params=None):
		super(RMSprop, self).__init__(params=params)

		self.learning_rate = learning_rate
		self.alpha = alpha
		self.eps = eps

		self.mss = [build_shared_zeros(t.shape.eval(),'ms') for t in self.params]

	def updates(self, loss=None):
		super(RMSprop, self).updates(loss=loss)

		for ms, param, gparam in zip(self.mss, self.params, self.gparams):
			_ms = ms*self.alpha
			_ms += (1 - self.alpha) * gparam * gparam
			self.updates[ms] = _ms
			self.updates[param] = param - self.learning_rate * gparam / T.sqrt(_ms + self.eps)

		return self.updates
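
RMSprop replaces AdaGrad's running sum with an exponential moving average of the squared gradients, exactly as in the code above:

```math
r \leftarrow \alpha r + (1 - \alpha)\, g^{2}, \qquad \theta \leftarrow \theta - \frac{\eta}{\sqrt{r + \varepsilon}}\, g
```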

AdaDelta

optimizers.py


class AdaDelta(Optimizer):
	def __init__(self, rho=0.95, eps=1e-6, params=None):
		super(AdaDelta, self).__init__(params=params)

		self.rho = rho
		self.eps = eps
		self.accugrads = [build_shared_zeros(t.shape.eval(),'accugrad') for t in self.params]
		self.accudeltas = [build_shared_zeros(t.shape.eval(),'accudelta') for t in self.params]

	def updates(self, loss=None):
		super(AdaDelta, self).updates(loss=loss)

		for accugrad, accudelta, param, gparam\
		in zip(self.accugrads, self.accudeltas, self.params, self.gparams):
			agrad = self.rho * accugrad + (1 - self.rho) * gparam * gparam
			dx = - T.sqrt((accudelta + self.eps)/(agrad + self.eps)) * gparam
			self.updates[accudelta] = (self.rho*accudelta + (1 - self.rho) * dx * dx)
			self.updates[param] = param + dx
			self.updates[accugrad] = agrad

		return self.updates
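
As implemented above, AdaDelta keeps running averages of both the squared gradients (`accugrads`) and the squared updates (`accudeltas`), so no global learning rate is needed:

```math
\begin{aligned}
&r \leftarrow \rho r + (1 - \rho)\, g^{2}, \qquad
\Delta\theta = -\sqrt{\frac{s + \varepsilon}{r + \varepsilon}}\; g, \\
&s \leftarrow \rho s + (1 - \rho)\, (\Delta\theta)^{2}, \qquad
\theta \leftarrow \theta + \Delta\theta
\end{aligned}
```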

Adam

optimizers.py


class Adam(Optimizer):
	def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8, gamma=1-1e-8, params=None):
		super(Adam, self).__init__(params=params)

		self.alpha = alpha
		self.b1 = beta1
		self.b2 = beta2
		self.gamma = gamma
		self.t = theano.shared(np.float32(1))
		self.eps = eps

		self.ms = [build_shared_zeros(t.shape.eval(), 'm') for t in self.params]
		self.vs = [build_shared_zeros(t.shape.eval(), 'v') for t in self.params]

	def updates(self, loss=None):
		super(Adam, self).updates(loss=loss)
		self.b1_t = self.b1 * self.gamma ** (self.t - 1)

		for m, v, param, gparam \
		in zip(self.ms, self.vs, self.params, self.gparams):
			_m = self.b1_t * m + (1 - self.b1_t) * gparam
			_v = self.b2 * v + (1 - self.b2) * gparam ** 2

			m_hat = _m / (1 - self.b1 ** self.t)
			v_hat = _v / (1 - self.b2 ** self.t)

			self.updates[param] = param - self.alpha*m_hat / (T.sqrt(v_hat) + self.eps)
			self.updates[m] = _m
			self.updates[v] = _v
		self.updates[self.t] = self.t + 1.0

		return self.updates
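
The code above decays $\beta_1$ by a factor $\gamma$ per step and applies the usual bias correction; written out, one update is:

```math
\begin{aligned}
&\beta_{1,t} = \beta_1 \gamma^{\,t-1}, \qquad
m \leftarrow \beta_{1,t}\, m + (1 - \beta_{1,t})\, g, \qquad
v \leftarrow \beta_2 v + (1 - \beta_2)\, g^{2}, \\
&\hat{m} = \frac{m}{1 - \beta_1^{\,t}}, \qquad
\hat{v} = \frac{v}{1 - \beta_2^{\,t}}, \qquad
\theta \leftarrow \theta - \alpha\, \frac{\hat{m}}{\sqrt{\hat{v}} + \varepsilon}, \qquad
t \leftarrow t + 1
\end{aligned}
```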

Experiment

Using MNIST, I trained for 20 epochs and averaged the results over 30 random seeds. Please refer to my GitHub repository for the detailed settings of the learning rates and the network architecture.

Results

The curves are bunched together near the top of the plot and hard to tell apart, so let's zoom in.

In the zoomed view, SGD has dropped out of the visible range.

Conclusion

I should have recorded the loss values as well...

I plan to add convolutional neural networks, stacked denoising autoencoders, and more to this GitHub repository in the future.

I would appreciate it if you could point out any strange points.
