A single model often cannot capture all the characteristics of the data, so a mixture model is used when you want to combine several models. This time, we will implement a mixture of linear regression models.
Written probabilistically, an ordinary linear regression model with input $x$ and output $t$ is
p(t|x,{\bf w},\beta) = \mathcal{N}(t|{\bf w}^\top\phi(x),\beta^{-1})
where $\phi(\cdot)$ is the feature vector, ${\bf w}$ is the weight vector, and $\beta$ is the precision parameter. In the mixture of linear regression models, we introduce mixing coefficients $\pi_k$ (with $\sum_k\pi_k=1$) and sum $K$ linear regression models:
p(t|x,{\bf W},\beta,\pi) = \sum_{k=1}^K\pi_k\mathcal{N}(t|{\bf w}_k^\top\phi(x),\beta^{-1})
As usual, given training data $\{x_n,t_n\}_{n=1}^N$ (written ${\bf t},{\bf x}$ below), we estimate the parameters ${\bf W},\pi,\beta$ by maximum likelihood. The log-likelihood function is
\ln p({\bf t}|{\bf x},{\bf W},\pi,\beta) = \sum_{n=1}^N\ln\left(\sum_{k=1}^K\pi_k\mathcal{N}(t_n|{\bf w}_k^\top\phi(x_n),\beta^{-1})\right)
Now we introduce latent variables $z_{nk}$ that indicate which model each data point was generated from. Each $z_{nk}$ takes the value 0 or 1: $z_{nk}=1$ means that the $n$-th data point was generated by the $k$-th model, and $z_{nj}=0$ for every other $j\ne k$. With these latent variables we can also write down the complete-data log-likelihood for the parameters.
This allows the EM algorithm to be used for maximum likelihood estimation of the mixture of linear regression models. In the E step, we compute $\mathbb{E}[z_{nk}]$ (the responsibility) for each data point and each model; in the M step, we maximize, with respect to the parameters, the expectation of the complete-data log-likelihood under the posterior distribution of the latent variables $z$.
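Concretely, the quantities computed in the two steps are the following (PRML eqs. (14.37), (14.38), (14.42) and (14.44), which the code below refers to; $\Phi$ is the design matrix and $R_k=\mathrm{diag}(\gamma_{1k},\dots,\gamma_{Nk})$):

\gamma_{nk} = \mathbb{E}[z_{nk}] = \frac{\pi_k\mathcal{N}(t_n|{\bf w}_k^\top\phi(x_n),\beta^{-1})}{\sum_{j=1}^K\pi_j\mathcal{N}(t_n|{\bf w}_j^\top\phi(x_n),\beta^{-1})}

\pi_k = \frac{1}{N}\sum_{n=1}^N\gamma_{nk},\qquad {\bf w}_k = \left(\Phi^\top R_k\Phi\right)^{-1}\Phi^\top R_k{\bf t},\qquad \frac{1}{\beta} = \frac{1}{N}\sum_{n=1}^N\sum_{k=1}^K\gamma_{nk}\left(t_n-{\bf w}_k^\top\phi(x_n)\right)^2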
import

This time, in addition to the usual numpy and matplotlib, we also import itertools from the Python standard library (it is not strictly necessary). functools.reduce is used when building the polynomial features.
import itertools
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
Next, a class that creates the design matrix, in which each row is the feature vector of one input.
class PolynomialFeatures(object):

    def __init__(self, degree=2):
        assert isinstance(degree, int)
        self.degree = degree

    def transform(self, x):
        # design matrix: each row is the polynomial feature vector of one input
        if x.ndim == 1:
            x = x[:, None]
        x_t = x.transpose()
        features = [np.ones(len(x))]
        for degree in range(1, self.degree + 1):
            for items in itertools.combinations_with_replacement(x_t, degree):
                features.append(reduce(lambda x, y: x * y, items))
        return np.asarray(features).transpose()
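As a quick check (a small example, not part of the original script), transforming a few points with degree=1 produces a design matrix whose first column is the bias term:

x = np.array([0.0, 0.5, 1.0])
print(PolynomialFeatures(degree=1).transform(x))
# [[1.  0. ]
#  [1.  0.5]
#  [1.  1. ]]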
The following is the main part: the code for the mixture of linear regression models.
class ConditionalMixtureModel(object):

    def __init__(self, n_models=2):
        # number of component models
        self.n_models = n_models

    # maximum likelihood estimation with the EM algorithm
    def fit(self, X, t, n_iter=100):
        # initialize the parameters to be estimated (weights, mixing coefficients, variance)
        coef = np.linalg.pinv(X).dot(t)
        self.coef = np.asarray(
            [coef + np.random.normal(size=len(coef))
             for _ in range(self.n_models)]).T
        self.var = np.mean(np.square(X.dot(coef) - t))
        self.weight = np.ones(self.n_models) / self.n_models
        # repeat the EM steps a specified number of times
        for i in range(n_iter):
            # E step: compute the responsibilities E[z_nk] for each data point and model
            resp = self._expectation(X, t)
            # M step: maximize with respect to the parameters
            self._maximization(X, t, resp)

    # unnormalized Gaussian (the normalization constant cancels in the responsibilities)
    def _gauss(self, x, y):
        return np.exp(-(x - y) ** 2 / (2 * self.var))

    # E step: compute the responsibilities gamma_nk = E[z_nk]
    def _expectation(self, X, t):
        # PRML eq. (14.37)
        responsibility = self.weight * self._gauss(X.dot(self.coef), t[:, None])
        responsibility /= np.sum(responsibility, axis=1, keepdims=True)
        return responsibility

    # M step: maximize with respect to the parameters
    def _maximization(self, X, t, resp):
        # PRML eq. (14.38): mixing coefficients
        self.weight = np.mean(resp, axis=0)
        for m in range(self.n_models):
            # PRML eq. (14.42): weighted least-squares solution for each model
            self.coef[:, m] = np.linalg.inv((X.T * resp[:, m]).dot(X)).dot(X.T * resp[:, m]).dot(t)
        # PRML eq. (14.44): variance
        self.var = np.mean(
            np.sum(resp * np.square(t[:, None] - X.dot(self.coef)), axis=1))

    def predict(self, X):
        return X.dot(self.coef)
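Once a model has been fitted (as in main() below), the responsibilities can also be used to see which line each training point is assigned to; this is a small usage sketch that reuses the internal _expectation method:

resp = model._expectation(X_train, y_train)  # responsibilities, shape (N, n_models)
labels = resp.argmax(axis=1)                 # hard assignment of each point to a model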
def create_toy_data(sample_size=20, std=0.1):
    x = np.linspace(-1, 1, sample_size)
    y = (x > 0.).astype(float) * 2 - 1  # sign of x
    y = x * y                           # y = |x|: two line segments with different slopes
    y += np.random.normal(scale=std, size=sample_size)
    return x, y
def main():
    # create training data
    x_train, y_train = create_toy_data()
    # define the feature vector (degree 1)
    feature = PolynomialFeatures(degree=1)
    # convert to the design matrix
    X_train = feature.transform(x_train)
    # prepare the conditional mixture model (2 models)
    model = ConditionalMixtureModel(n_models=2)
    # maximum likelihood estimation of the parameters
    model.fit(X_train, y_train)
    # plot the results
    x = np.linspace(-1, 1, 100)
    X = feature.transform(x)
    Y = model.predict(X)
    plt.scatter(x_train, y_train, s=50, facecolor="none", edgecolor="g")
    plt.plot(x, Y[:, 0], c="b")
    plt.plot(x, Y[:, 1], c="r")
    plt.show()
Putting the above together, the full code is:

import itertools
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np


class PolynomialFeatures(object):

    def __init__(self, degree=2):
        assert isinstance(degree, int)
        self.degree = degree

    def transform(self, x):
        if x.ndim == 1:
            x = x[:, None]
        x_t = x.transpose()
        features = [np.ones(len(x))]
        for degree in range(1, self.degree + 1):
            for items in itertools.combinations_with_replacement(x_t, degree):
                features.append(reduce(lambda x, y: x * y, items))
        return np.asarray(features).transpose()


class ConditionalMixtureModel(object):

    def __init__(self, n_models=2):
        self.n_models = n_models

    def fit(self, X, t, n_iter=100):
        coef = np.linalg.pinv(X).dot(t)
        self.coef = np.asarray(
            [coef + np.random.normal(size=len(coef))
             for _ in range(self.n_models)]).T
        self.var = np.mean(np.square(X.dot(coef) - t))
        self.weight = np.ones(self.n_models) / self.n_models
        for i in range(n_iter):
            resp = self._expectation(X, t)
            self._maximization(X, t, resp)

    def _gauss(self, x, y):
        return np.exp(-(x - y) ** 2 / (2 * self.var))

    def _expectation(self, X, t):
        responsibility = self.weight * self._gauss(X.dot(self.coef), t[:, None])
        responsibility /= np.sum(responsibility, axis=1, keepdims=True)
        return responsibility

    def _maximization(self, X, t, resp):
        self.weight = np.mean(resp, axis=0)
        for m in range(self.n_models):
            self.coef[:, m] = np.linalg.inv((X.T * resp[:, m]).dot(X)).dot(X.T * resp[:, m]).dot(t)
        self.var = np.mean(
            np.sum(resp * np.square(t[:, None] - X.dot(self.coef)), axis=1))

    def predict(self, X):
        return X.dot(self.coef)


def create_toy_data(sample_size=20, std=0.1):
    x = np.linspace(-1, 1, sample_size)
    y = (x > 0.).astype(float) * 2 - 1
    y = x * y
    y += np.random.normal(scale=std, size=sample_size)
    return x, y


def main():
    x_train, y_train = create_toy_data()
    feature = PolynomialFeatures(degree=1)
    X_train = feature.transform(x_train)
    model = ConditionalMixtureModel(n_models=2)
    model.fit(X_train, y_train)
    x = np.linspace(-1, 1, 100)
    X = feature.transform(x)
    Y = model.predict(X)
    plt.scatter(x_train, y_train, s=50, facecolor="none", edgecolor="g")
    plt.plot(x, Y[:, 0], c="b")
    plt.plot(x, Y[:, 1], c="r")
    plt.show()


if __name__ == '__main__':
    main()
(Result plot: a non-linear function represented by combining multiple linear models)
When the conditional mixture model implemented here is used for prediction, each line extends even into regions where it has no data points, as the result above shows. The mixture of experts model, a further development of the conditional mixture model, addresses this by making the mixing coefficients $\pi$ functions of the input $x$.
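For example, the mixing coefficients could come from a gating function such as a softmax over linear functions of the features. The sketch below only illustrates that idea; it is not a full mixture-of-experts implementation, and V is a hypothetical gating parameter matrix:

def gate(X, V):
    # pi_k(x) = softmax of the gating activations v_k^T phi(x)
    a = X.dot(V)                        # (N, K) gating activations
    a -= a.max(axis=1, keepdims=True)   # subtract the max for numerical stability
    p = np.exp(a)
    return p / p.sum(axis=1, keepdims=True)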