# Scatter plots of bivariate normal samples, one panel per target
# correlation coefficient, laid out side by side in a single row.
lst = [-1, -0.7, -0.3, 0, 0.3, 0.7, 1]
fig, ax = plt.subplots(1, len(lst), figsize=(10 * len(lst), 10))
mean = np.array([0, 0])  # loop-invariant: every sample is centred on the origin
for idx, corrcoef in enumerate(lst):
    # Both variances are 1, so the off-diagonal covariance is the correlation.
    cov = np.array([[1, corrcoef], [corrcoef, 1]])
    x, y = np.random.multivariate_normal(mean, cov, 5000).T
    ax[idx].scatter(x, y, color='royalblue')
    ax[idx].set_title(f'corrcoef = {corrcoef:.2f}', size=50)
    # Only the shape of the point cloud matters here: hide ticks and labels.
    ax[idx].tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)
# Cubic-shaped relationship: y is computed directly from x, and the
# correlation coefficient of the pair is shown in the plot title.
x = np.random.randint(1, 1000, size=1000)
cubic_term = (x - 400) ** 3
square_term = 100 * (x - 200) ** 2
y = (cubic_term - square_term + 100000000) / 1000000
corr_matrix = np.corrcoef(x, y)  # 2x2 correlation matrix of (x, y)
corr_coef = corr_matrix[0, 1]    # off-diagonal entry: correlation of x with y

# Ticks and tick labels are switched off; only the shape of the cloud matters.
hide_ticks = dict(bottom=False, left=False, labelbottom=False, labelleft=False)
fig, ax = plt.subplots()
ax.scatter(x, y, color='royalblue')
ax.set_title(f'corr={corr_coef:.3f}', size=18)
ax.tick_params(**hide_ticks)
# Parabola centred on x = 500: y is computed directly from x, and the
# correlation coefficient of the pair is shown in the plot title.
x = np.random.randint(1, 1000, size=100)
offset_from_centre = x - 500
y = offset_from_centre ** 2 / 100 + 300
corr_matrix = np.corrcoef(x, y)  # 2x2 correlation matrix of (x, y)
corr_coef = corr_matrix[0, 1]    # off-diagonal entry: correlation of x with y

# Ticks and tick labels are switched off; only the shape of the cloud matters.
hide_ticks = dict(bottom=False, left=False, labelbottom=False, labelleft=False)
fig, ax = plt.subplots()
ax.scatter(x, y, color='royalblue')
ax.set_title(f'corr={corr_coef:.3f}', size=18)
ax.tick_params(**hide_ticks)
An example in which the correlation coefficient is low when looking at the entire data set (left figure),
but high when the range of the data is narrowed down to x = 900 or more (right figure).
# Right panel: points with x drawn from 900..999, where y is x plus scaled
# Gaussian noise. Left panel: the same points plus three extra points at
# much smaller x and much larger y.
x = np.random.randint(900, 1000, size=1000)
noise = np.random.randn(1000)
y = x + 10 * noise
x2 = [*x, 600, 700, 800]
y2 = [*y, 2000, 1800, 1500]

hide_ticks = dict(bottom=False, left=False, labelbottom=False, labelleft=False)
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
# Draw the narrowed data (column 1) first, then the extended data (column 0),
# so corr_coef ends up holding the value for the extended data.
for col, xs, ys in ((1, x, y), (0, x2, y2)):
    corr_coef = np.corrcoef(xs, ys)[0, 1]
    ax[col].scatter(xs, ys, color='royalblue')
    ax[col].set_title(f'corr={corr_coef:.3f}', size=18)
    ax[col].tick_params(**hide_ticks)
import numpy as np
import matplotlib.pyplot as plt
# Fit a straight line to six sample points by least squares and draw it
# over the 0..8 range together with the data points.
x = np.array([2.0, 3.5, 4.0, 4.5, 5.0, 5.5])
y = np.array([3.0, 3.2, 3.9, 5.2, 8.4, 10.5])
xp = np.linspace(0, 8, 100)
val = 1  # polynomial degree; the label below assumes exactly two coefficients
fx = np.poly1d(np.polyfit(x, y, val))
slope, intercept = fx.coef  # poly1d stores the highest power first
fig, ax = plt.subplots()
ax.plot(xp, fx(xp), '-', color='blue')
ax.scatter(x, y, color='deepskyblue', s=32)
# '+' forces an explicit sign so a positive intercept does not read as "x 3.00".
ax.text(0.05, 0.8, s=f'y = {slope:.2f} x {intercept:+.2f}', size='x-large', transform=ax.transAxes)
ax.axhline(0, color='black')  # axhline expects a scalar y, not a list
ax.set_xlim(0, None)
ax.set_ylim(-3, 14)
ax.set_ylabel('Cost [JPY]')
ax.set_xlabel('Explanatory variables')
import numpy as np
import matplotlib.pyplot as plt
# Fit a straight line to six sample points, then draw it in two styles:
# solid blue inside the range covered by the data (2..5.5) and dashed red
# where the line is extended beyond that range (0..2 and 5.5..8).
x = np.array([2.0, 3.5, 4.0, 4.5, 5.0, 5.5])
y = np.array([3.0, 3.2, 3.9, 5.2, 8.4, 10.5])
xp = np.linspace(2, 5.5, 100)   # range covered by the data
xp1 = np.linspace(0, 2, 100)    # extension below the data
xp2 = np.linspace(5.5, 8, 100)  # extension above the data
val = 1  # polynomial degree; the label below assumes exactly two coefficients
fx = np.poly1d(np.polyfit(x, y, val))
slope, intercept = fx.coef  # poly1d stores the highest power first
fig, ax = plt.subplots()
ax.plot(xp, fx(xp), '-', color='blue')
# No '-' fmt string here: combining it with linestyle= defines the line style
# twice and makes matplotlib warn about the redundancy.
ax.plot(xp1, fx(xp1), color='red', linestyle='dashed')
ax.plot(xp2, fx(xp2), color='red', linestyle='dashed')
ax.scatter(x, y, color='deepskyblue', s=32)
# '+' forces an explicit sign so a positive intercept does not read as "x 3.00".
ax.text(0.05, 0.8, s=f'y = {slope:.2f} x {intercept:+.2f}', size='x-large', transform=ax.transAxes)
# axhline/axvline expect scalar positions, not lists.
ax.axhline(0, color='black')
ax.axvline(2, color='gray', linestyle='dotted')
ax.axvline(5.5, color='gray', linestyle='dotted')
ax.set_xlim(0, 8)
ax.set_ylim(-3, 14)
ax.set_ylabel('Cost [JPY]')
ax.set_xlabel('Explanatory variables')
If you increase the polynomial order or the number of explanatory variables, the model fits the training data more closely, but its prediction accuracy on unknown data decreases. The figures show the case where the order is increased.
import numpy as np
import matplotlib.pyplot as plt
# Fit polynomials of degree 2 through 5 to the same six sample points and
# draw each fit in its own figure, labelled with its degree.
x = np.array([2.0, 3.5, 4.0, 4.5, 5.0, 5.5])
y = np.array([3.0, 3.2, 3.9, 5.2, 8.4, 10.5])
xp = np.linspace(0, 8, 100)
for val in range(2, 6):
    fx = np.poly1d(np.polyfit(x, y, val))
    fig, ax = plt.subplots()  # a fresh figure per degree
    ax.plot(xp, fx(xp), '-', color='blue')
    ax.scatter(x, y, color='deepskyblue', s=32)
    ax.axhline(0, color='black')  # axhline expects a scalar y, not a list
    ax.set_xlim(0, None)
    # Fixed y-range so the figures for different degrees are directly comparable.
    ax.set_ylim(-3, 14)
    ax.set_ylabel('Cost [JPY]')
    ax.set_xlabel('Explanatory variables')
    ax.text(0.75, 0.85, s=f'Degree = {val}', size='x-large', transform=ax.transAxes)
Recommended Posts