This post reproduces in Python the numerical example of spurious correlation from "Statistical Causal Search" in the Machine Learning Professional Series (Chapter 1.3 of the book, around Figure 1.4). It is an example in which the underlying data-generating model cannot be identified from the observed correlation alone.
- Common to all models
  - $z$ follows a normal distribution with mean 0 and variance 1.
  - $e_x$ and $e_y$ are noise terms whose variances are chosen so that the variables $x$ and $y$, after the sums below are taken, have mean 0 and variance 1.
- Model 1
  - $x = 0.3z + e_x$, $y = 0.7x + 0.3z + e_y$
# -*- coding: utf-8 -*-
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
def gendata(N=100, verbose=False):
    """Draw N samples of (x, y) from three different causal models.

    Every model uses a latent variable z ~ N(0, 1) and independent Gaussian
    noises e_x, e_y whose variances are chosen so that x and y have mean 0
    and variance 1:

    * Model 1: x = 0.3 z + e_x,  y = 0.7 x + 0.3 z + e_y
    * Model 2: y = 0.3 z + e_y,  x = 0.7 y + 0.3 z + e_x
    * Model 3: x = 0.89 z + e_x, y = 0.89 z + e_y

    With unit variances the x-y correlation is 0.7 + 0.3 * 0.3 = 0.79 in
    models 1 and 2 and 0.89 ** 2 ~= 0.79 in model 3, so the three models look
    alike in a scatter plot.

    Args:
        N: Number of samples to draw for each variable.
        verbose: If true, print the sample standard deviation of x and y for
            each model (both should be close to 1).

    Returns:
        A tuple ``(x1, y1, x2, y2, x3, y3)`` of 1-D NumPy arrays of length N.
    """
    direct = 0.7      # effect of the upstream variable on the downstream one
    confound = 0.3    # effect of z on both observed variables
    common = 0.89     # effect of z in the pure common-cause model

    # Upstream variable u = confound * z + e: Var(u) = confound^2 + Var(e).
    upstream_std = np.sqrt(1.0 - confound**2)
    # Downstream variable d = direct * u + confound * z + e.  Because
    # Cov(u, z) = confound, Var(direct * u + confound * z) also contains the
    # cross term 2 * direct * confound * confound; leaving it out would give
    # Var(d) > 1.
    downstream_std = np.sqrt(
        1.0 - (direct**2 + confound**2 + 2.0 * direct * confound * confound)
    )
    common_std = np.sqrt(1.0 - common**2)

    # Model 1: z -> x, z -> y, x -> y
    z1 = np.random.normal(0.0, 1.0, N)
    ex1 = np.random.normal(0.0, upstream_std, N)
    ey1 = np.random.normal(0.0, downstream_std, N)
    x1 = confound * z1 + ex1
    y1 = direct * x1 + confound * z1 + ey1
    if verbose:
        print("model 1")
        print(np.std(x1))
        print(np.std(y1))

    # Model 2: z -> x, z -> y, y -> x (roles of x and y swapped)
    z2 = np.random.normal(0.0, 1.0, N)
    ex2 = np.random.normal(0.0, downstream_std, N)
    ey2 = np.random.normal(0.0, upstream_std, N)
    y2 = confound * z2 + ey2
    x2 = direct * y2 + confound * z2 + ex2
    if verbose:
        print("model 2")
        print(np.std(x2))
        print(np.std(y2))

    # Model 3: z -> x, z -> y only (no direct effect between x and y)
    z3 = np.random.normal(0.0, 1.0, N)
    ex3 = np.random.normal(0.0, common_std, N)
    ey3 = np.random.normal(0.0, common_std, N)
    x3 = common * z3 + ex3
    y3 = common * z3 + ey3
    if verbose:
        print("model 3")
        print(np.std(x3))
        print(np.std(y3))

    return x1, y1, x2, y2, x3, y3
if __name__ == '__main__':
    # For each sample size, draw the three models side by side on identical
    # axes and save the figure as N<size>.png in the current directory.
    for n in [10, 100, 1000, 10000]:
        x1, y1, x2, y2, x3, y3 = gendata(n)
        pairs = ((x1, y1), (x2, y2), (x3, y3))
        plt.figure(figsize=(12, 4))
        for panel, (xs, ys) in enumerate(pairs, start=1):
            plt.subplot(1, 3, panel)
            plt.xlim(-5, 5)
            plt.ylim(-5, 5)
            plt.plot(xs, ys, "ro")
        # No trailing space in the name: matplotlib derives the output format
        # from the file extension, and "png " is not a known format.
        plt.savefig("N{}.png".format(n))
        # Release the figure so repeated iterations do not accumulate open
        # figures.
        plt.close()
Recommended Posts