You may sometimes see a graph and wonder whether you can really draw a regression line through it. For example:
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams["font.family"] = "Times New Roman"  # Font for the whole figure
plt.rcParams["xtick.direction"] = "in"           # Draw x-axis ticks inward
plt.rcParams["ytick.direction"] = "in"           # Draw y-axis ticks inward
plt.rcParams["xtick.minor.visible"] = True       # Show x-axis minor ticks
plt.rcParams["ytick.minor.visible"] = True       # Show y-axis minor ticks
plt.rcParams["xtick.major.width"] = 1.5          # Width of x-axis major tick lines
plt.rcParams["ytick.major.width"] = 1.5          # Width of y-axis major tick lines
plt.rcParams["xtick.minor.width"] = 1.0          # Width of x-axis minor tick lines
plt.rcParams["ytick.minor.width"] = 1.0          # Width of y-axis minor tick lines
plt.rcParams["xtick.major.size"] = 10            # Length of x-axis major ticks
plt.rcParams["ytick.major.size"] = 10            # Length of y-axis major ticks
plt.rcParams["xtick.minor.size"] = 5             # Length of x-axis minor ticks
plt.rcParams["ytick.minor.size"] = 5             # Length of y-axis minor ticks
plt.rcParams["font.size"] = 14                   # Font size
plt.rcParams["axes.linewidth"] = 1.5             # Width of the axes frame
def func(x):
    return 1.5*x + 2.0 + np.random.normal(0.0, 0.1, len(x))

x = np.array([0.0, 0.2, 0.5, 1.0, 10.0])
y = func(x)                  # Measurement points
a, b = np.polyfit(x, y, 1)   # Linear regression
fig, axes = plt.subplots()
axes.scatter(x, y)
axes.plot(x, a*x + b)
The linear regression gives a straight line. However, there are no measurement points between 1 and 10, so this part seems intuitively uncertain. When I looked for a regression method that could express this kind of uncertainty, I found Gaussian process regression, so I will implement it as a study exercise.
Reference: Gaussian process and machine learning. Even someone like me, who is not very familiar with mathematics, could read it; this post corresponds to Chapters 2 and 3.
Gaussian process regression has hyperparameters, and I used the MCMC method to estimate them: Sampling by Markov Chain Monte Carlo (MCMC) with emcee.
There are also libraries, so you do not have to implement everything yourself: [Gpy vs scikit-learn: Gaussian process regression with python](https://nykergoto.hatenablog.jp/entry/2017/05/29/python%E3%81%A7%E3%82%AC%E3%82%A6%E3%82%B9%E9%81%8E%E7%A8%8B%E5%9B%9E%E5%B8%B0_~%E3%83%A2%E3%82%B8%E3%83%A5%E3%83%BC%E3%83%AB%E3%81%AE%E6%AF%94%E8%BC%83~)
A Gaussian process is a stochastic process with the property that, for any natural number $N$, the vector of outputs corresponding to the inputs $x_1, x_2, \ldots, x_N$ follows a multivariate Gaussian distribution.
However, this definition alone does not give much feel for what a Gaussian process is like, so let's compute a kernel matrix using the radial basis function as the kernel function and sample from the Gaussian distribution that has this kernel matrix as its covariance.
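Written out, the kernel implemented in the code below is the radial basis function plus a small diagonal term for numerical stability, with hyperparameters $s_1, s_2, s_3$:

$$K_{ij} = s_1^2 \exp\!\left(-\frac{(x_i - x_j)^2}{s_2^2}\right) + s_3^2 \delta_{ij}$$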
def rbf(x, s=[1.0, 1.0, 1.0e-6]):
    """Radial basis function (RBF) kernel matrix."""
    s1, s2, s3 = s[0], s[1], s[2]
    X, X_ = np.meshgrid(x, x)
    # Kernel matrix, with a small diagonal term for numerical stability
    K = s1**2 * np.exp(-((X - X_)/s2)**2) + np.identity(len(x)) * s3**2
    return K
N = 100
x = np.linspace(0.0, 10.0, N)  # Input points
K = rbf(x)                     # Kernel matrix
fig, axes = plt.subplots()
axes.imshow(K)
To sample from a multidimensional Gaussian distribution with covariance matrix $\mathbf{K}$, draw a vector $\mathbf{x}$ of independent standard normal variables and transform it as $\mathbf{y} = \mathbf{L}\mathbf{x}$, where $\mathbf{L}$ is obtained from the Cholesky decomposition $\mathbf{K} = \mathbf{L}\mathbf{L}^T$.
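This works because a linear transform of a Gaussian vector is again Gaussian, and the covariance of $\mathbf{y} = \mathbf{L}\mathbf{x}$ with $\mathbf{x}$ drawn from a standard normal distribution is

$$\mathrm{Cov}[\mathbf{y}] = \mathbf{L}\,\mathrm{Cov}[\mathbf{x}]\,\mathbf{L}^T = \mathbf{L}\mathbf{I}\mathbf{L}^T = \mathbf{K}.$$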
L = np.linalg.cholesky(K)  # Cholesky decomposition of the kernel matrix
fig, axes = plt.subplots()
for i in range(10):
    _x = np.random.normal(0.0, 1.0, N)  # Standard normal sample
    axes.plot(x, np.dot(L, _x))         # One sampled function
Functions come out at random, just as numbers come out when you roll dice. This also shows an intuitive property of Gaussian processes: similar inputs give similar output values.
Suppose there are $N$ measurements $(x_1, y_1), \ldots, (x_N, y_N)$. For simplicity, assume the mean of $y$ is 0, so that $\mathbf{y} = (y_1, y_2, \ldots, y_N)^T$ follows a Gaussian distribution with mean $\mathbf{0}$ and covariance given by the kernel matrix $\mathbf{K}$ computed from $x_1, x_2, \ldots, x_N$.
Here is how to find the value $y^{\ast}$ at a point $x^{\ast}$ that is not included in the data.
Let ${\mathbf{y}}' = (y_1, y_2, \ldots, y_N, y^{\ast})^T$ be $\mathbf{y}$ with $y^{\ast}$ appended, and let $\mathbf{K}'$ be the kernel matrix computed from $(x_1, x_2, \ldots, x_N, x^{\ast})^T$. Partitioning $\mathbf{K}'$ as

$$\mathbf{K}' = \begin{pmatrix} \mathbf{K} & \mathbf{k}_{\ast} \\ \mathbf{k}_{\ast}^T & k_{\ast\ast} \end{pmatrix},$$

the conditional distribution of $y^{\ast}$ given $\mathbf{y}$ is a Gaussian with

$$\mathrm{mean} = \mathbf{k}_{\ast}^T \mathbf{K}^{-1} \mathbf{y}, \qquad \mathrm{variance} = k_{\ast\ast} - \mathbf{k}_{\ast}^T \mathbf{K}^{-1} \mathbf{k}_{\ast}.$$
Write the code according to the above formula.
def rbf(x_train, s, x_pred=None):
    """Radial basis function (RBF) kernel matrix."""
    s1, s2, s3 = s[0], s[1], s[2]
    if x_pred is None:
        x = x_train
        X, X_ = np.meshgrid(x, x)
        # Kernel matrix of the training points
        K = s1**2 * np.exp(-((X - X_)/s2)**2) + np.identity(len(x)) * s3**2
        return K
    else:
        x = np.append(x_train, x_pred)
        X, X_ = np.meshgrid(x, x)
        # Kernel matrix of training and prediction points combined
        K_all = s1**2 * np.exp(-((X - X_)/s2)**2) + np.identity(len(x)) * s3**2
        K = K_all[:len(x_train), :len(x_train)]     # Training block
        K_s = K_all[:len(x_train), len(x_train):]   # Training-prediction block
        K_ss = K_all[len(x_train):, len(x_train):]  # Prediction block
        return K, K_s, K_ss
def pred(x_train, y_train, x_pred, s):
    K, K_s, K_ss = rbf(x_train, s, x_pred=x_pred)
    K_inv = np.linalg.inv(K)                                # Inverse of the training kernel matrix
    y_pred_mean = np.dot(np.dot(K_s.T, K_inv), y_train)     # Predictive mean of y
    y_pred_cov = K_ss - np.dot(np.dot(K_s.T, K_inv), K_s)   # Predictive covariance matrix
    y_pred_std = np.sqrt(np.diag(y_pred_cov))               # Predictive standard deviation
    return y_pred_mean, y_pred_std
Create a suitable function, sample from it, and use the samples as measured values.
def func(x):
    return np.sin(2.0*np.pi*0.2*x) + np.sin(2.0*np.pi*0.1*x)

pred_N = 100  # Number of prediction points
N = 10        # Number of measurement points
x_train = np.random.uniform(0.0, 10.0, N)               # Measurement points: x
y_train = func(x_train) + np.random.normal(0, 0.1, N)    # Measurement points: y (with observation noise per point)
x_true = x_pred = np.linspace(-2.0, 12.0, pred_N)
fig, axes = plt.subplots()
axes.plot(x_true, func(x_true), label="True")
axes.scatter(x_train, y_train, label="Measured")
axes.legend(loc="best")
Let's do Gaussian process regression.
s = [1.0, 1.0, 0.1]  # Kernel hyperparameters, set by hand
y_pred_mean, y_pred_std = pred(x_train, y_train, x_pred, s)
fig, axes = plt.subplots(figsize=(8, 6))
axes.set_title("s1=1.0, s2=1.0, s3=0.1")
axes.plot(x_true, func(x_true), label="True")
axes.scatter(x_train, y_train, label="Measured")
axes.plot(x_pred, y_pred_mean, label="Predict")
axes.fill_between(x_pred, y_pred_mean + y_pred_std, y_pred_mean - y_pred_std, facecolor="b", alpha=0.3)
axes.legend(loc="best")
axes.set_xlim(-2.0, 12.0)
The regression looks reasonable, and the predictive distribution is wide where there are few measurement points. So far, the hyperparameters of the kernel function have been set by hand. Let's see what happens when we change them, as in the sketch below.
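For example, the following sketch re-runs the prediction with a different hand-picked length scale $s_2$ and noise level $s_3$ (the specific values are only for illustration):

s = [1.0, 0.3, 0.01]  # Different hand-picked hyperparameters, chosen only for illustration
y_pred_mean, y_pred_std = pred(x_train, y_train, x_pred, s)
fig, axes = plt.subplots(figsize=(8, 6))
axes.set_title("s1=1.0, s2=0.3, s3=0.01")
axes.plot(x_true, func(x_true), label="True")
axes.scatter(x_train, y_train, label="Measured")
axes.plot(x_pred, y_pred_mean, label="Predict")
axes.fill_between(x_pred, y_pred_mean + y_pred_std, y_pred_mean - y_pred_std, facecolor="b", alpha=0.3)
axes.legend(loc="best")
axes.set_xlim(-2.0, 12.0)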
The result changes a lot depending on the hyperparameters. I would like to estimate them from the data as well.
Collect the hyperparameters into $\theta$. The kernel matrix $\mathbf{K}_{\theta}$ depends on $\theta$, and the probability of the training data is

$$p(\mathbf{y} \mid \theta) = \frac{1}{(2\pi)^{N/2} |\mathbf{K}_{\theta}|^{1/2}} \exp\!\left(-\frac{1}{2} \mathbf{y}^T \mathbf{K}_{\theta}^{-1} \mathbf{y}\right).$$

Taking the logarithm gives

$$\log p(\mathbf{y} \mid \theta) = -\frac{1}{2} \log |\mathbf{K}_{\theta}| - \frac{1}{2} \mathbf{y}^T \mathbf{K}_{\theta}^{-1} \mathbf{y} - \frac{N}{2} \log 2\pi,$$

and we look for the $\theta$ that maximizes it (the constant term and the overall factor of $1/2$ are dropped in the code below).
Gradient methods easily get stuck in local optima here, so the MCMC method is used.
import emcee

def objective(s):  # Objective function: log evidence (up to constants)
    K = rbf(x_train, s)
    return -np.linalg.slogdet(K)[1] - y_train.T.dot(np.linalg.inv(K)).dot(y_train)

ndim = 3      # Number of hyperparameters
nwalker = 6   # Number of walkers
s0 = np.random.uniform(0.0, 5.0, [nwalker, ndim])           # Initial positions
sampler = emcee.EnsembleSampler(nwalker, ndim, objective)   # Build the sampler
sampler.run_mcmc(s0, 5000)                                  # Run the sampling
s_dist = sampler.flatchain                                  # Flattened chain of samples
s = s_dist[sampler.flatlnprobability.argmax()]              # Hyperparameters with the highest log probability
y_pred_mean, y_pred_std = pred(x_train, y_train, x_pred, s)
fig, axes = plt.subplots(figsize=(8, 6))
axes.plot(x_true, func(x_true), label="True")
axes.scatter(x_train, y_train, label="Measured")
axes.plot(x_pred, y_pred_mean, label="Predict")
axes.fill_between(x_pred, y_pred_mean + y_pred_std, y_pred_mean - y_pred_std, facecolor="b", alpha=0.3)
axes.legend(loc="best")
axes.set_xlim(-2.0, 12.0)
The fit looks good.
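Since MCMC yields a whole set of hyperparameter samples rather than a single point estimate, it can also be instructive to look at their distributions. A minimal sketch using the s_dist obtained above (no burn-in removal, just a quick look):

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for i, name in enumerate(["s1", "s2", "s3"]):
    axes[i].hist(s_dist[:, i], bins=50)  # Histogram of the sampled values of each hyperparameter
    axes[i].set_xlabel(name)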
There is also a handy library for Gaussian process regression, GPy. It is recommended because it has a wide variety of kernels and makes visualization easy.
import GPy
import GPy.kern as gp_kern
kern = gp_kern.RBF(input_dim=1)
gpy_model = GPy.models.GPRegression(X=x_train.reshape(-1, 1), Y=y_train.reshape(-1, 1), kernel=kern, normalizer=None)
fig,axes = plt.subplots(figsize=(8,6))
axes.plot(x_true, func(x_true), c="k", label="True")
gpy_model.optimize()
gpy_model.plot(ax=axes)
axes.legend(loc="best")
axes.set_xlim(-2.0,12.0)
Gaussian process regression produced estimates that include uncertainty. This post dealt only with the basic theory, but Gaussian process regression also has a computational cost problem, and various methods have been proposed to address it. In addition, there are many kinds of kernel functions, not just one, and they need to be selected and combined appropriately for the target of the analysis; see the sketch below for one way to combine kernels.
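As a small example of combining kernels (the particular combination here is arbitrary and not something examined in this post), GPy kernels can be added or multiplied directly:

kern = gp_kern.RBF(input_dim=1) + gp_kern.Bias(input_dim=1)  # Sum of an RBF kernel and a bias kernel
gpy_model = GPy.models.GPRegression(X=x_train.reshape(-1, 1), Y=y_train.reshape(-1, 1), kernel=kern)
gpy_model.optimize()  # Optimize the hyperparameters of the combined kernel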