Overview

I would like to create a ** histogram ** with Python + pandas + matplotlib and overlay the ** normal distribution curve ** obtained assuming that the population follows a normal distribution.

In addition, ** Shapiro-Wilk test ** (Shapiro-Wilk test) ** normality check **, normal distribution assumed ** interval estimation for population mean ** (95% confidence interval) I will also do).

Here, as an example, I would like to create a histogram for the test score (for 100 people) out of 100 points.

■ Average: 76.1 points, standard deviation: 10.3 points --p = 0.77 (p> = 0.05) and it can be said that the population has normality --95% confidence interval for population mean CI = [74.09, 78.19]

Related entries

--The normality check is explained in "Comparison of test results of 2 classes of t-test with Python".

Execution environment

We have confirmed the execution and operation with Google Colab. (Python 3.6.9). It's almost the same as Jupyter Notebook.

!pip list 
matplotlib               3.1.2
numpy                    1.17.4
pandas                   0.25.3
scipy                    1.3.3

Preparing to use Japanese with matplotlib

Make Japanese available in the output diagram of matplotlib.

!pip install japanize-matplotlib
import japanize_matplotlib

With the above, japanize-matplotlib-1.0.5 will be installed and imported, and even if you use Japanese for labels etc., the characters will not be garbled (tofu).

code

`python`


%reset -f
import numpy as np
import pandas as pd
import scipy.stats as st
import math                        
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker 
import matplotlib.transforms as ts 

#Generate dummy data of test results (normal random numbers (integers) in the range of 0 to 100)
def getTestData(mu,sig,n) : #Mean mu, standard deviation sig, number of normal random numbers n
  f = lambda x: min(100,max(0,round( np.random.normal(mu,sig),0)))
  return (np.frompyfunc(f, 1, 1)(np.zeros(n))).astype(np.float)

df = pd.DataFrame( {'p1':getTestData(75,8,40)} )

#Graph drawing parameters
target = 'p1'           #Columns to plot in the data frame
x_min, x_max = 40, 100  #Score range to plot (lower and upper limits)
j = 5                   #Y-axis (frequency) step size
k = 5                   #Section width
bins = 12               #Number of sections(x_max-x_min)/k  (100-40)/5->12

#Graph drawing process from here
plt.figure(dpi=96)
plt.xlim(x_min,x_max)
d = 0.001

# (1)Statistical processing
n   = len(df[target])         #Specimen size
mu  = df[target].mean()       #average
sig = df[target].std(ddof=0)  #Standard deviation: ddof(Degree of freedom)=0
print(f'■ Average:{mu:.1f}Point, standard deviation:{sig:.1f}point')
ci1, ci2 = (None, None)

#Test of normality (significance level 5)%) And the population mean 95%Confidence interval
_, p = st.shapiro(df[target])
if p >= 0.05 :
  print(f'  - p={p:.2f} ( p>=0.05 )And it can be said that the population has normality')
  U2 = df[target].var(ddof=1)  #Population variance estimate (unbiased variance)
  DF = n-1                     #Degree of freedom
  SE = math.sqrt(U2/n)         #Standard error
  ci1,ci2 = st.t.interval( alpha=0.95, loc=mu, scale=SE, df=DF )
  print(f'  -Mother mean 95%Confidence interval CI= [{ci1:.2f} , {ci2:.2f}]')
else:
  print(f'  ※ p={p:.2f} ( p<0.05 )And the population cannot be said to be normal')

# (2)Histogram drawing
hist_data = plt.hist(df[target], bins=bins, color='tab:cyan', range=(x_min, x_max), rwidth=0.9)
plt.gca().set_xticks(np.arange(x_min,x_max-k+d, k))

# (3)Approximate curve assuming a normal distribution
sig = df[target].std(ddof=1)  #Unbiased standard deviation: ddof(Degree of freedom)=1
nx = np.linspace(x_min, x_max+d, 150) #150 divisions
ny = st.norm.pdf(nx,mu,sig) * k * len(df[target])
plt.plot( nx , ny, color='tab:blue', linewidth=1.5, linestyle='--')

# (4)X-axis scale / label setting
plt.xlabel('Test score classification',fontsize=12)
plt.gca().set_xticks(np.arange(x_min,x_max+d, k))

# (5)Y-axis scale / label setting
y_max = max(hist_data[0].max(), st.norm.pdf(mu,mu,sig) * k * len(df[target]))
y_max = int(((y_max//j)+1)*j) #The smallest multiple of j that is greater than the maximum frequency
plt.ylim(0,y_max)
plt.gca().set_yticks( range(0,y_max+1,j) )
plt.ylabel('Number of people',fontsize=12)

# (6)Text output of mean and standard deviation
tx = 0.03 #For character output position adjustment
ty = 0.91 #For character output position adjustment
tt = 0.08 #For character output position adjustment
tp = dict( horizontalalignment='left',verticalalignment='bottom',
           transform=plt.gca().transAxes, fontsize=11 )
plt.text( tx, ty, f'Average score{mu:.2f}', **tp)
plt.text( tx, ty-tt, f'standard deviation{sig:.2f}', **tp)
plt.vlines( mu, 0, y_max, color='black', linewidth=1 )

Code 2

In the case of a histogram, it may be difficult to tell whether the value that is the boundary of the division is counted in the left or right frequency. Below is a simpler version of it.

`python`


%reset -f
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.transforms as ts
import math

#Generate test dummy data (0-100)
def getTestData(mu,sig,n) : #Mean mu, standard deviation sig, number of normal random numbers n
  f = lambda x: min(100,max(0,round( np.random.normal(mu,sig),0)))
  return (np.frompyfunc(f, 1, 1)(np.zeros(n))).astype(np.float)

df = pd.DataFrame( {'p1':dm} )

#Graph drawing parameters
target = 'p1'           #Columns to plot in the data frame
x_min, x_max = 40, 100  #Score range to plot (lower and upper limits)
j = 5                   #Y-axis (frequency) step size
k = 5                   #Section width
bins = 12               #Number of sections(x_max-x_min)/k  (100-40)/5->12

#Graph drawing process from here
plt.figure(dpi=96)
plt.xlim(x_min-k/2,x_max-k/2)
d = 0.001

# (1)Statistical processing
n   = len(df[target])         #Specimen size
mu  = df[target].mean()       #average
sig = df[target].std(ddof=0)  #Standard deviation: ddof(Degree of freedom)=0
print(f'■ Average:{mu:.1f}Point, standard deviation:{sig:.1f}point')
ci1, ci2 = (None, None)

#Test of normality (significance level 5)%) And the population mean 95%Confidence interval
_, p = st.shapiro(df[target])
if p >= 0.05 :
  print(f'  - p={p:.2f} ( p>=0.05 )And it can be said that the population has normality')
  U2 = df[target].var(ddof=1)  #Population variance estimate (unbiased variance)
  DF = n-1                     #Degree of freedom
  SE = math.sqrt(U2/n)         #Standard error
  ci1,ci2 = st.t.interval( alpha=0.95, loc=mu, scale=SE, df=DF )
  print(f'  -Mother mean 95%Confidence interval CI= [{ci1:.2f} , {ci2:.2f}]')
else:
  print(f'  ※ p={p:.2f} ( p<0.05 )And the population cannot be said to be normal')

# (2)Histogram drawing
hist_data = plt.hist(df[target], bins=bins, color='tab:cyan', range=(x_min, x_max), rwidth=0.9, align='left')
plt.gca().set_xticks(np.arange(x_min,x_max-k+d, k))

# (3)Approximate curve assuming a normal distribution
sig = df[target].std(ddof=1)  #Unbiased standard deviation: ddof(Degree of freedom)=1
nx = np.linspace(x_min, x_max+d, 150) #150 divisions
ny = st.norm.pdf(nx,mu,sig) * k * len(df[target])
plt.plot( nx - k/2 , ny, color='tab:blue', linewidth=1.5, linestyle='--')

# (4)X-axis scale / label setting
plt.xticks(rotation=90)
plt.xlabel('Test score classification',fontsize=12)
tmp = list([ f'${i} \\leq x < {i+k}$' for i in range(x_min,x_max-k+1, k) ]) 
tmp[-1] = f'${x_max-k} \\leq x \\leq {x_max}$'
plt.gca().set_xticklabels( tmp )

# (5)Y-axis scale / label setting
y_max = max(hist_data[0].max(), st.norm.pdf(mu,mu,sig) * k * len(df[target]))
y_max = int(((y_max//j)+1)*j) #The smallest multiple of j that is greater than the maximum frequency
plt.ylim(0,y_max)
plt.gca().set_yticks( range(0,y_max+1,j) )
plt.ylabel('Number of people',fontsize=12)

I want to create a histogram and overlay the normal distribution curve on it. matplotlib edition

Overview

Related entries

Execution environment

Preparing to use Japanese with matplotlib

code

python

Code 2

python

`python`

`python`