In the hypergeometric distribution
On the other hand, in the binomial distribution
The hypergeometric distribution can be well approximated by the binomial distribution if the total number (the number of balls in the box) is sufficiently larger than the number of extracts.
The binomial distribution is
import scipy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib
# functions ------------------------
def binomial(n, x, p):
'''
Binomial distribution
Event with probability p occurs x times in n trials
Each trial is independent
Example: An event where 3 is rolled on the dice (probability p)=1/6) but n dice=X when shaken 10 times=5 times)
nCx * p^x * (1-p)^(n-x)
'''
return (
sc.special.comb(n, x)
* np.power(p, x)
* np.power(1 - p, n - x)
)
def poisson(n, x, p):
'''
Poisson distribution
p->When 0, the binomial distribution becomes a Poisson distribution.
'''
mean = n * p
return (
np.power(mean, x)
* np.exp(-mean)
/ sc.special.factorial(x)
)
def gaussian(n, x, p):
'''
normal distribution
n->When oo, the binomial distribution is normal
'''
mean = n * p
variance = np.power(n * p * (1 - p), 1/2)
return (
1.0
/ (np.power(2 * np.pi, 1/2) * variance)
/ np.exp(
np.power(x - mean, 2)
/ (2 * np.power(variance, 2))
)
)
def hypergeometric(n, x, p, N):
'''
Hypergeometric distribution
Each trial is not independent
It is necessary to consider all N for non-restoration extraction
'''
target_size = N * p #Number of hits in the whole
return (
sc.special.comb(target_size, x) #X combinations per
* sc.special.comb(N - target_size, n - x) #Lost n-x combinations
/ sc.special.comb(N, n) #All combinations of n
)
def calc_distributions(n, p, cols, N=0):
'''
ex.:
n = 30 #Number of trials
p = 0.16 #Percentage of hits
N = 1000 #All (used only in hypergeometric distributions)
'''
#Prepare an array for storing results
bi_arr = np.zeros(n + 1, dtype=np.float)
po_arr = np.zeros(n + 1, dtype=np.float)
ga_arr = np.zeros(n + 1, dtype=np.float)
for x in np.arange(n + 1):
#Store the result in an array
bi_arr[x] = binomial(n, x, p)
po_arr[x] = poisson(n, x, p)
ga_arr[x] = gaussian(n, x, p)
#Store the result in a data frame
df = pd.DataFrame()
df['Binomial distribution'] = bi_arr
df['Poisson distribution'] = po_arr
df['normal distribution'] = ga_arr
if N > 0:
hy_arr = np.zeros(n + 1, dtype=np.float)
for x in np.arange(n + 1):
hy_arr[x] = hypergeometric(n, x, p, N)
df['Hypergeometric distribution'] = hy_arr
return df[cols]
def visualize(df,n, p, N=0):
plot_params = {
'linestyle':'-',
'markersize':5,
'marker':'o'
}
colors = [
'black',
'blue',
'green',
'purple',
]
plt.close(1)
figure_ = plt.figure(1, figsize=(8,4))
axes_ = figure_.add_subplot(111) #Create Axes
for i, col in enumerate(df.columns):
if i == 0:
plot_params['linewidth'] = 2
plot_params['alpha'] = 1.0
else:
plot_params['linewidth'] = 1
plot_params['alpha'] = 0.4
plot_params['color'] = colors[i]
axes_.plot(
df.index.values, df[col].values,
label = col,
**plot_params,
)
plt.legend()
plt.xlabel('Number of hits x')
plt.ylabel('Probability of winning x times out of n times')
title = 'Percentage of hits p:{p:.2f},Number of trials n:{n}'.format(p=p, n=n)
if 'Hypergeometric distribution' in df.columns:
title += ',All N:{N}'.format(N=N)
plt.title(title)
xmax = n * p * 2
if xmax<10: xmax = 10;
plt.xlim([0, xmax])
#(The expression "number of times" is not good considering non-restoration extraction)
n = 100; p = 0.167
df = calc_distributions(n, p, cols=['Binomial distribution', 'Poisson distribution', 'normal distribution'])
visualize(df, n, p)
Binomial distribution and normal distribution match well
n = 100; p = 0.01
df = calc_distributions(n, p, cols=['Binomial distribution', 'Poisson distribution', 'normal distribution'])
visualize(df, n, p)
Binomial distribution and Poisson distribution match well
n = 100; p = 0.1; N = 5000
df = calc_distributions(n, p, cols=['Hypergeometric distribution','Binomial distribution'], N=N)
visualize(df, n, p, N)
Hypergeometric distribution and binomial distribution match well
n = 100; p = 0.1; N = 500
df = calc_distributions(n, p, cols=['Hypergeometric distribution','Binomial distribution'], N=N)
visualize(df, n, p, N)
The error is noticeable compared to the case of N = 5000
end
Recommended Posts