When I implemented maximum likelihood estimation for the Student's t distribution, I mentioned that I would also apply this distribution to linear regression. This post is that follow-up.
Assuming that $\{x_n, t_n\}_{n=1}^N$ is the training data and ${\bf \phi}(\cdot)$ is the feature vector, linear regression estimates the parameter ${\bf w}^*$ that minimizes the squared error:
{\bf w}^* = \arg \min_{\bf w}\sum_{n=1}^N(t_n - {\bf w}^\top{\bf \phi}(x_n))^2
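Stacking the feature vectors ${\bf\phi}(x_n)^\top$ as the rows of a design matrix $\Phi$, this least-squares problem has the well-known closed-form solution via the Moore-Penrose pseudo-inverse $\Phi^+$ (the two expressions coincide when $\Phi$ has full column rank):

{\bf w}^* = (\Phi^\top\Phi)^{-1}\Phi^\top{\bf t} = \Phi^+{\bf t}

This is what `np.linalg.pinv` computes in the `LinearRegressor` class below.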
Reinterpreted probabilistically, this is equivalent to estimating the parameters that maximize the following Gaussian likelihood:
\prod_{n=1}^N\mathcal{N}(t_n|{\bf w}^\top{\bf\phi}(x_n), \sigma^2)
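To see this equivalence, take the negative logarithm of the likelihood:

-\ln\prod_{n=1}^N\mathcal{N}(t_n|{\bf w}^\top{\bf\phi}(x_n),\sigma^2) = \frac{1}{2\sigma^2}\sum_{n=1}^N(t_n - {\bf w}^\top{\bf\phi}(x_n))^2 + \frac{N}{2}\ln(2\pi\sigma^2)

The second term is constant in ${\bf w}$, so maximizing the Gaussian likelihood with respect to ${\bf w}$ is exactly the least-squares problem above.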
The Gaussian distribution is often used in this role, but it has the drawback of not being very robust against noise: a single outlier can pull the fit far away from the bulk of the data. This time, we instead estimate the parameters using the Student's t distribution in the likelihood function:
\prod_{n=1}^N{\rm St}(t_n|{\bf w}^\top{\bf\phi}(x_n),\lambda,\nu)
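Here ${\rm St}(\cdot|\mu,\lambda,\nu)$ denotes the Student's t density with mean $\mu$, precision $\lambda$, and degrees of freedom $\nu$ (PRML eq. 2.159):

{\rm St}(x|\mu,\lambda,\nu)=\frac{\Gamma(\nu/2+1/2)}{\Gamma(\nu/2)}\left(\frac{\lambda}{\pi\nu}\right)^{1/2}\left[1+\frac{\lambda(x-\mu)^2}{\nu}\right]^{-\nu/2-1/2}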
The shape of the Student's t distribution is illustrated in PRML Figure 2.15. As $\nu \to \infty$ (green curve), it takes the same shape as the Gaussian distribution. Compared to the green curve, the blue and red curves with smaller $\nu$ have heavier tails, which makes them more robust against outliers.
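Unlike the Gaussian case, there is no closed-form maximum likelihood solution for the Student's t likelihood. However, the t distribution can be written as an infinite mixture of Gaussians whose precision is scaled by a Gamma-distributed latent variable $\eta$ (PRML eq. 2.158):

{\rm St}(x|\mu,\lambda,\nu)=\int_0^\infty\mathcal{N}\left(x|\mu,(\eta\lambda)^{-1}\right){\rm Gam}\left(\eta|\nu/2,\nu/2\right){\rm d}\eta

Introducing one latent $\eta_n$ per data point yields an EM algorithm: the E-step computes the posterior expectations $\mathbb{E}[\eta_n]$ and $\mathbb{E}[\ln\eta_n]$ (the `_expect` method in the script below), and the M-step updates ${\bf w}$, $\lambda$, and $\nu$ given those expectations (the `_maximize` method), with $\nu$ updated by a gradient step since it has no closed-form update.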
students_t_regression.py
import functools
import itertools

import matplotlib.pyplot as plt
import numpy as np
import scipy.special as sp


class PolynomialFeatures(object):

    def __init__(self, degree=2):
        assert isinstance(degree, int)
        self.degree = degree

    def transform(self, x):
        # build polynomial features up to self.degree, including a bias term
        if x.ndim == 1:
            x = x[:, None]
        x_t = x.transpose()
        features = [np.ones(len(x))]
        for degree in range(1, self.degree + 1):
            for items in itertools.combinations_with_replacement(x_t, degree):
                features.append(functools.reduce(lambda x, y: x * y, items))
        return np.asarray(features).transpose()


class LinearRegressor(object):

    def fit(self, X, t):
        # least-squares solution via the Moore-Penrose pseudo-inverse
        self.w = np.linalg.pinv(X) @ t

    def predict(self, X):
        return X @ self.w


class RobustRegressor(LinearRegressor):

    def __init__(self, precision=1., dof=1.):
        self.precision = precision
        self.dof = dof

    def fit(self, X, t, learning_rate=0.01):
        # initialize w with the ordinary least-squares solution
        super().fit(X, t)
        params = np.hstack((self.w.ravel(), self.precision, self.dof))
        while True:
            # alternate E-step and M-step until the parameters converge
            E_eta, E_ln_eta = self._expect(X, t)
            self._maximize(X, t, E_eta, E_ln_eta, learning_rate)
            params_new = np.hstack((self.w.ravel(), self.precision, self.dof))
            if np.allclose(params, params_new):
                break
            params = np.copy(params_new)

    def _expect(self, X, t):
        # E-step: posterior expectations of the latent precision scale eta
        sq_diff = (t - X @ self.w) ** 2
        E_eta = (self.dof + 1) / (self.dof + self.precision * sq_diff)
        E_ln_eta = (
            sp.digamma(0.5 * (self.dof + 1))
            - np.log(0.5 * (self.dof + self.precision * sq_diff)))
        return E_eta, E_ln_eta

    def _maximize(self, X, t, E_eta, E_ln_eta, learning_rate):
        # M-step: weighted least squares for w, closed form for the
        # precision, and one gradient-ascent step for the degrees of freedom
        sq_diff = (t - X @ self.w) ** 2
        self.w = np.linalg.inv(X.T @ (E_eta[:, None] * X)) @ X.T @ (E_eta * t)
        self.precision = 1 / np.mean(E_eta * sq_diff)
        N = len(t)
        self.dof += learning_rate * (
            N * np.log(0.5 * self.dof) + N
            - N * sp.digamma(0.5 * self.dof)
            + np.sum(E_ln_eta - E_eta))


def create_toy_data(n):
    # noisy samples from a random line, with one large outlier injected
    x = np.linspace(-1, 1, n)
    y = np.random.randint(-5, 6) * x + np.random.normal(scale=0.1, size=n)
    y[np.random.randint(len(y))] += np.random.uniform(-10, 10)
    return x, y


def main():
    x_train, y_train = create_toy_data(10)
    feature = PolynomialFeatures(degree=1)
    X_train = feature.transform(x_train)

    linear_regressor = LinearRegressor()
    robust_regressor = RobustRegressor()
    linear_regressor.fit(X_train, y_train)
    robust_regressor.fit(X_train, y_train)

    x = np.linspace(-1, 1, 100)
    X = feature.transform(x)
    y_linear = linear_regressor.predict(X)
    y_robust = robust_regressor.predict(X)

    plt.scatter(
        x_train, y_train,
        facecolors="none", edgecolors="g", label="training data")
    plt.plot(x, y_linear, c="b", label="Gaussian")
    plt.plot(x, y_robust, c="r", label="Student's t")
    plt.legend(loc="best")
    plt.show()


if __name__ == '__main__':
    main()
The blue line, fitted with a Gaussian likelihood, is pulled toward the outlier, whereas the fit with the Student's t likelihood (red) stubbornly stays close to the underlying line even in the presence of outliers.
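As a quick numerical check (a minimal sketch reusing the classes above, appended to the same script; the fixed random seed is an assumption added only for reproducibility), the fitted weights can also be compared directly:

import numpy as np

np.random.seed(0)  # arbitrary seed for reproducibility (assumption)

x_train, y_train = create_toy_data(10)
X_train = PolynomialFeatures(degree=1).transform(x_train)

linear = LinearRegressor()
robust = RobustRegressor()
linear.fit(X_train, y_train)
robust.fit(X_train, y_train)

# the Gaussian fit is dragged toward the single outlier,
# while the Student's t fit should stay near the underlying line
print("Gaussian    w =", linear.w)
print("Student's t w =", robust.w)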