**Note: The improved version, "Try MNIST with VAT (Virtual Adversarial Training) in Keras", seems to be a better implementation. Please refer to that instead.**
I recently learned about a training method called VAT (Virtual Adversarial Training), but I couldn't find a Keras implementation, so I tried writing one.
To put it simply, VAT adds an extra term to the loss function: take the output Y for a normal input X, and the output Y' for the input (X + d), where d is a small noise vector chosen so that the two outputs differ as much as possible, and add the KL-divergence(Y, Y') between them to the loss used for training.
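Written as a formula (my own paraphrase of the paper's objective, not notation from this post):

```math
L = \ell\bigl(y_{\mathrm{true}},\, p(y \mid x)\bigr)
  + \mathrm{KL}\bigl(p(y \mid x)\,\|\,p(y \mid x + r_{\mathrm{vadv}})\bigr),
\qquad
r_{\mathrm{vadv}} = \underset{\|r\|_2 \le \epsilon}{\arg\max}\;
  \mathrm{KL}\bigl(p(y \mid x)\,\|\,p(y \mid x + r)\bigr)
```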
If that explanation doesn't make sense, I suggest taking a look at the original paper for more details.
In terms of its role in training, VAT is said to be close to regularization, and it may work as an alternative to adding Dropout or noise. Tuning parameters like the Dropout rate is a hassle, so I'd be glad if VAT could be used instead.
In Keras, it's a bit tricky to use the input X inside the loss and regularization functions, but reference implementations exist in Chainer and Theano, so porting one was manageable.
It turned out something like this.
The key points are:

- `y_true` is passed with `X_train` attached (concatenated), so the loss function can see the input (a minimal sketch of this trick follows below).
- A `Container` is used so that the layer computation can be reused on the perturbed input.
- `K.stop_gradient()` must be applied after `K.gradients()`; otherwise the loss becomes NaN as training proceeds (I got stuck on this...).
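As a minimal sketch of the first trick (my own illustration with dummy data; the array names here are hypothetical): a Keras loss function only receives `(y_true, y_pred)`, so the input has to be smuggled in through the target array and split back apart inside the loss.

```python
import numpy as np

nb_classes = 10
X = np.random.rand(5, 28, 28, 1).astype('float32')  # dummy inputs
y = np.eye(nb_classes, dtype='float32')[np.random.randint(0, nb_classes, 5)]  # dummy one-hot labels

# what gets passed to model.fit() as the "targets":
yX = np.concatenate((y, X.reshape((X.shape[0], -1))), axis=1)  # shape (5, 10 + 784)

# inside the loss function, the two parts can be recovered:
y_true = yX[:, :nb_classes]
X_back = yX[:, nb_classes:].reshape((-1, 28, 28, 1))
assert np.allclose(X_back, X)
```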
keras_mnist_vat.py
```python
# coding: utf8
"""
* VAT: https://arxiv.org/abs/1507.00677

# Referenced code
Original: https://github.com/fchollet/keras/blob/master/examples/mnist_cnn.py
VAT: https://github.com/musyoku/vat/blob/master/vat.py

# Result example
use_dropout=False, use_vat=False: score=0.211949993095, accuracy=0.9877
use_dropout=True, use_vat=False: score=0.238920686956, accuracy=0.9853
use_dropout=False, use_vat=True: score=0.180048364889, accuracy=0.9916
use_dropout=True, use_vat=True: score=0.245401585515, accuracy=0.9901
"""
import numpy as np
from keras.engine.topology import Input, Container
from keras.engine.training import Model

np.random.seed(1337)  # for reproducibility

from keras.datasets import mnist
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K

SAMPLE_SIZE = 0  # if non-zero, use only this many samples (handy for quick tests)
batch_size = 128
nb_classes = 10
nb_epoch = 12

# input image dimensions
img_rows, img_cols = 28, 28
# number of convolutional filters to use
nb_filters = 32
# size of pooling area for max pooling
pool_size = (2, 2)
# convolution kernel size
kernel_size = (3, 3)


def main(data, use_dropout, use_vat):
    # the data, shuffled and split between train and test sets
    (X_train, y_train), (X_test, y_test) = data

    if K.image_dim_ordering() == 'th':
        X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
        X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
        X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255.
    X_test /= 255.

    # convert class vectors to binary class matrices
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_test = np_utils.to_categorical(y_test, nb_classes)

    if SAMPLE_SIZE:
        X_train = X_train[:SAMPLE_SIZE]
        y_train = y_train[:SAMPLE_SIZE]
        X_test = X_test[:SAMPLE_SIZE]
        y_test = y_test[:SAMPLE_SIZE]

    my_model = MyModel(input_shape, use_dropout).build()
    my_model.training(X_train, y_train, X_test, y_test, use_vat=use_vat)
    score = my_model.model.evaluate(X_test, y_test, verbose=0)
    print("use_dropout=%s, use_vat=%s: score=%s, accuracy=%s" % (use_dropout, use_vat, score[0], score[1]))


class MyModel:
    model = None
    core_layers = None

    def __init__(self, input_shape, use_dropout=True):
        self.input_shape = input_shape
        self.use_dropout = use_dropout

    def build(self):
        input_layer = Input(self.input_shape)
        output_layer = self.core_data_flow(input_layer)
        self.model = Model(input_layer, output_layer)
        return self

    def core_data_flow(self, input_layer):
        x = Convolution2D(nb_filters, kernel_size[0], kernel_size[1], border_mode='valid')(input_layer)
        x = Activation('relu')(x)
        x = Convolution2D(nb_filters, kernel_size[0], kernel_size[1])(x)
        x = Activation('relu')(x)
        x = MaxPooling2D(pool_size=pool_size)(x)
        if self.use_dropout:
            x = Dropout(0.25)(x)
        x = Flatten()(x)
        x = Dense(128, activation="relu")(x)
        if self.use_dropout:
            x = Dropout(0.5)(x)
        x = Dense(nb_classes, activation='softmax')(x)
        # wrap the layers in a Container so the same computation can be reused on perturbed inputs
        self.core_layers = Container(input_layer, x)
        return x

    def training(self, X_train, y_train, X_test, y_test, use_vat=False):
        orig_loss_func = loss_func = K.categorical_crossentropy
        if use_vat:
            # use a non-standard loss function that takes y_true with X_train concatenated to it
            loss_func = self.loss_with_vat_loss(loss_func)
            self.model.compile(loss=loss_func, optimizer='adadelta', metrics=['accuracy'])
            # for both train and test, build the targets by concatenating y and flattened X horizontally
            yX_train = np.concatenate((y_train, X_train.reshape((X_train.shape[0], -1))), axis=1)
            yX_test = np.concatenate((y_test, X_test.reshape((X_test.shape[0], -1))), axis=1)
            # train as usual
            self.model.fit(X_train, yX_train, batch_size=batch_size, nb_epoch=nb_epoch,
                           verbose=1, validation_data=(X_test, yX_test))
            # since we compiled with a non-standard loss function, recompile with the
            # normal one, otherwise evaluate() will fail
            self.model.compile(loss=orig_loss_func, optimizer='adadelta', metrics=['accuracy'])
        else:
            self.model.compile(loss=loss_func, optimizer='adadelta', metrics=['accuracy'])
            self.model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
                           verbose=1, validation_data=(X_test, y_test))

    def loss_with_vat_loss(self, original_loss_func, eps=1, xi=10, ip=1):
        def with_vat_loss(yX_train, y_pred):
            # split the "targets" back into the true labels and the input
            nb_output_classes = y_pred.shape[1]
            y_true = yX_train[:, :nb_output_classes]

            # VAT
            X_train = yX_train[:, nb_output_classes:].reshape((-1, ) + self.input_shape)
            d = K.random_normal(X_train.shape)  # random initial perturbation direction

            # power iteration: estimate the direction that changes the output the most
            for _ in range(ip):
                y = self.core_layers(X_train + self.normalize_vector(d) * xi)
                kld = K.sum(self.kld(y_pred, y))
                d = K.stop_gradient(K.gradients(kld, [d])[0])  # stop_gradient is important!!

            y_perturbation = self.core_layers(X_train + self.normalize_vector(d) * eps)
            kld = self.kld(y_pred, y_perturbation)
            return original_loss_func(y_pred, y_true) + kld
        return with_vat_loss

    @staticmethod
    def normalize_vector(x):
        # scale x so each sample's flattened vector has (approximately) unit L2 norm
        z = K.sum(K.batch_flatten(K.square(x)), axis=1)
        while K.ndim(z) < K.ndim(x):
            z = K.expand_dims(z, dim=-1)
        return x / (K.sqrt(z) + K.epsilon())

    @staticmethod
    def kld(p, q):
        # KL divergence KL(p || q), summed over classes per sample
        v = p * (K.log(p + K.epsilon()) - K.log(q + K.epsilon()))
        return K.sum(K.batch_flatten(v), axis=1, keepdims=True)


data = mnist.load_data()
main(data, use_dropout=False, use_vat=False)
main(data, use_dropout=True, use_vat=False)
main(data, use_dropout=False, use_vat=True)
main(data, use_dropout=True, use_vat=True)
```
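As a quick sanity check of the `kld` helper (my own numpy test, not part of the original script), the KL divergence of a distribution with itself should be approximately zero:

```python
import numpy as np

def kld_np(p, q, eps=1e-7):
    # numpy equivalent of MyModel.kld
    v = p * (np.log(p + eps) - np.log(q + eps))
    return v.reshape((v.shape[0], -1)).sum(axis=1, keepdims=True)

p = np.array([[0.7, 0.2, 0.1]])
q = np.array([[0.5, 0.3, 0.2]])
print(kld_np(p, p))  # ~0
print(kld_np(p, q))  # > 0: the distributions differ
```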
I experimented with four patterns: with/without Dropout and with/without VAT. The per-epoch times below were measured on a GeForce GTX 1080, running with the environment variables `KERAS_BACKEND=theano` and `THEANO_FLAGS=device=gpu,floatX=float32,lib.cnmem=1`.
| Dropout | VAT | Accuracy | 1 epoch time |
|---|---|---|---|
| not used | not used | 98.77% | 8 seconds |
| used | not used | 98.53% | 8 seconds |
| not used | used | 99.16% | 18 seconds |
| used | used | 99.01% | 19 seconds |
The results look reasonably good, so I think the implementation is sound. Since VAT runs the computation twice per batch, the execution time also roughly doubles.
VAT can also be used for unsupervised and semi-supervised learning, since its perturbation term requires no labels, so it's a training method with a wide range of applications. I'd like to use it in various ways.
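For example, the perturbation term depends only on the inputs and the model's own predictions, so it could in principle be computed on unlabeled data. A rough sketch (my own, untested; `vat_only_loss` is a hypothetical helper reusing `MyModel`'s methods and `K` from the script above):

```python
def vat_only_loss(core_layers, X, eps=1, xi=10, ip=1):
    # same perturbation loss as with_vat_loss, but without the supervised term:
    # no labels appear anywhere, only X and the model's own predictions
    y_pred = core_layers(X)
    d = K.random_normal(X.shape)
    for _ in range(ip):
        y = core_layers(X + MyModel.normalize_vector(d) * xi)
        d = K.stop_gradient(K.gradients(K.sum(MyModel.kld(y_pred, y)), [d])[0])
    y_perturbed = core_layers(X + MyModel.normalize_vector(d) * eps)
    return MyModel.kld(y_pred, y_perturbed)
```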