The following is a robust-estimation version.
-Online Linear Regression in Python - Qiita

I will use Tukey's biweight estimation method, with some modifications. See the article above for how the coefficients are estimated online.
--Reference: Tukey's biweight | Imaging Solution
\bar{x}_n = (1-\alpha) \bar{x}_{n-1} + \alpha x_n
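This exponential moving average is applied to each of the running means the fit needs ($\bar{x}$, $\bar{y}$, $\overline{xy}$, $\overline{x^2}$), and the coefficients follow from them exactly as in the script below:

a = \frac{\overline{xy} - \bar{x}\,\bar{y}}{\overline{x^2} - \bar{x}^2}, \qquad b = \bar{y} - a\,\bar{x}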
-$ W $ is the error tolerance
--Compute the error $ d $ against the already estimated $ a $ and $ b $, and reduce the weight as the error grows
d = y - (ax + b) \\
w(d) = \left\{ \begin{array}{ll}
\left[ 1 - \left( \frac{d}{W} \right)^2 \right]^2 & \left( |d| \le W \right) \\
0 & \left( W < |d| \right)
\end{array} \right.
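As a quick sanity check, here is a minimal sketch of this weight function at a few residuals, using the same W = 0.1 as the script below; the printed values are only illustrative:

```python
W = 0.1

def biweight(d):
    # Tukey's biweight: full weight at d = 0, zero once |d| reaches W.
    return (1 - (d / W) ** 2) ** 2 if abs(d / W) < 1 else 0

print(biweight(0.0))   # 1.0    -> point on the line, full weight
print(biweight(0.05))  # 0.5625 -> residual at half the tolerance, reduced weight
print(biweight(0.2))   # 0      -> clear outlier, ignored entirely
```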
Inject an outlier with about 10 times the usual deviation (0.5 vs. 0.05), roughly once every 10 samples:
```python
outliers = [0,0,0,0,0,0,0,0,0,1]
...
y = x + 0.05 * sp.random.normal() + outliers[sp.random.randint(10)] * 0.5 * sp.random.normal()
```
For the first 30 samples, update as usual (the line is not yet estimated reliably); after that, scale the learning rate by the weight:
```python
W = 0.1
def biweight(d):
    return (1 - (d / W) ** 2) ** 2 if abs(d / W) < 1 else 0
...
if count[0] < 30:
    alpha = 0.1
else:
    d = y - (a * x + b)
    alpha = biweight(d) * 0.1
```
The full script follows. (It was a quick hack, so the code is messy.)
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import matplotlib.animation as animation
import scipy as sp

sp.seterr(divide='ignore', invalid='ignore')

def mean(old, new, alpha):
    # Exponential moving average; the first observation initializes the mean.
    return new if sp.isnan(old) else (1.0 - alpha) * old + alpha * new

W = 0.1

def biweight(d):
    # Tukey's biweight: full weight at d = 0, zero once |d| reaches the tolerance W.
    return (1 - (d / W) ** 2) ** 2 if abs(d / W) < 1 else 0

def plot(fig):
    a = sp.array([sp.nan])
    b = sp.array([sp.nan])
    xyhist = sp.ones([100, 2]) * sp.nan
    # Running means for the plain estimator (red line).
    mean_x0 = sp.array([sp.nan])
    mean_y0 = sp.array([sp.nan])
    mean_x20 = sp.array([sp.nan])
    mean_xy0 = sp.array([sp.nan])
    # Running means for the biweight estimator (blue line).
    mean_x = sp.array([sp.nan])
    mean_y = sp.array([sp.nan])
    mean_x2 = sp.array([sp.nan])
    mean_xy = sp.array([sp.nan])

    ax = fig.gca()
    ax.hold(True)
    ax.grid(True)
    ax.set_xlim([0, 1.0])
    ax.set_ylim([0, 1.0])
    xyscat = ax.scatter([], [], c='black', s=10, alpha=0.4)
    approx0 = ax.add_line(plt.Line2D([], [], color='r'))
    approx = ax.add_line(plt.Line2D([], [], color='b'))
    outliers = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    count = [0]

    def inner(i):
        # Noisy sample on y = x, with a large outlier roughly once in 10 samples.
        x = sp.random.rand()
        y = x + 0.05 * sp.random.normal() + outliers[sp.random.randint(10)] * 0.5 * sp.random.normal()
        count[0] += 1
        if count[0] < 30:
            # Warm-up: use the plain learning rate until the line is roughly estimated.
            alpha = 0.1
        else:
            # Down-weight samples that deviate from the current estimate.
            d = y - (a * x + b)
            alpha = biweight(d) * 0.1

        # Keep the last 100 points for the scatter plot.
        xyhist[:-1, :] = xyhist[1:, :]
        xyhist[-1, 0] = x
        xyhist[-1, 1] = y

        # Plain online means (fixed rate) vs. biweight-modulated means.
        mean_x0[:] = mean(mean_x0, x, 0.1)
        mean_y0[:] = mean(mean_y0, y, 0.1)
        mean_xy0[:] = mean(mean_xy0, x * y, 0.1)
        mean_x20[:] = mean(mean_x20, x ** 2, 0.1)
        mean_x[:] = mean(mean_x, x, alpha)
        mean_y[:] = mean(mean_y, y, alpha)
        mean_xy[:] = mean(mean_xy, x * y, alpha)
        mean_x2[:] = mean(mean_x2, x ** 2, alpha)

        # Recover the regression coefficients from the running means.
        a0 = (mean_xy0 - mean_x0 * mean_y0) / (mean_x20 - mean_x0 ** 2)
        b0 = mean_y0 - a0 * mean_x0
        a[:] = (mean_xy - mean_x * mean_y) / (mean_x2 - mean_x ** 2)
        b[:] = mean_y - a * mean_x

        ax.title.set_text('y = %.3fx %+.3f' % (a, b))
        xyscat.set_offsets(xyhist)
        approx.set_data([0, 1], [b, a * 1 + b])
        approx0.set_data([0, 1], [b0, a0 * 1 + b0])
        plt.draw()

    return inner

fig = plt.figure()
ani = animation.FuncAnimation(fig, plot(fig), interval=100, frames=300)
ani.save('result2.gif', writer='imagemagick')
```
The biweight estimate (blue) seems to be somewhat more stable than the plain online fit (red).
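One practical note, not from the original article: the script writes the GIF through ImageMagick, so if that is not installed, a reasonably recent matplotlib should be able to save it with Pillow instead:

```python
# Assumption: matplotlib's 'pillow' writer is available (not the article's original setup).
ani.save('result2.gif', writer='pillow')
```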