Descriptive and inferential statistics are often used to understand a dataset before building machine learning models. To make that step easier, I wrote source code that reads CSV data into a pandas DataFrame and applies descriptive statistics and statistical inference to it.
First, prepare CSV data as input. The data used while developing the source code is shown below.
x1,x2,x3,x4,x5
1,11,1,1,1
2,12,1,1,2
3,13,1,1,3
4,14,1,1,4
5,150,1,1,5
5,150,1,1,5
4,160,1,1,4
3,180,1,1,3
2,180,1,1,2
1,190,2,2,2
The modularized source code is shown below. It assumes that `math`, `numpy` (as `np`), and `scipy.stats` (as `st`) have already been imported.
class StatisticalTests():
    """Print descriptive statistics and common hypothesis tests for pandas data.

    Every public method is a classmethod that writes a human-readable report
    to standard output and returns ``None``. Test decisions use a fixed 5%
    significance level and intervals use a fixed 95% confidence level.

    The surrounding module must provide ``math``, ``numpy`` as ``np`` and
    ``scipy.stats`` as ``st``.
    """

    # Threshold compared against every p-value.
    _SIGNIFICANCE = 0.05
    # Confidence level of every reported interval.
    _CONFIDENCE = 0.95

    def __init__(self):
        pass

    @classmethod
    def basic_info(cls, df):
        """Print the first three rows, the dtypes and ``describe()`` of *df*."""
        print('Basic statistics------------------start')
        print('df.head(3)-------------')
        print(df.head(3))
        print('df.dtypes-------------')
        print(df.dtypes)
        print('df.describe(include=\'all\')-------------')
        print(df.describe(include='all'))

    @classmethod
    def t_interval(cls, df):
        """Print the 95% t confidence interval of the mean of each column of *df*."""
        print('Mother mean 95%Confidence interval-------------------start')
        # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
        for column_name, s in df.items():
            size = len(s)
            m = s.mean()  # sample mean
            # Standard error from the unbiased (ddof=1) variance estimate.
            se = math.sqrt(s.var(ddof=1) / size)
            # The confidence level is passed positionally because its keyword
            # was renamed from ``alpha`` to ``confidence`` in SciPy 1.9.
            ci1, ci2 = st.t.interval(cls._CONFIDENCE, size - 1, loc=m, scale=se)
            print(f'Column name= {column_name} //Mother mean 95%Confidence interval CI= '
                  f'[{ci1:.2f} , {ci2:.2f}] //Specimen average[{m}]')

    @classmethod
    def shapiro(cls, df):
        """Run the Shapiro-Wilk normality test on each column of *df* and print the verdict."""
        print('Shapiro-Wilk test(Test of normality)------------------start')
        for column_name, s in df.items():
            _, p = st.shapiro(s)
            if p >= cls._SIGNIFICANCE:
                print(f'Column name= {column_name} //p-value= {p:.3f} '
                      f'//Test result:Adopting the null hypothesis, it cannot be said that there is no normality')
            else:
                print(f'Column name= {column_name} //p-value= {p:.3f} '
                      f'//Test result:Reject the null hypothesis, no normality')

    @classmethod
    def levene(cls, xa, xb):
        """Run Levene's equal-variance test (centered on the mean) on two samples."""
        print('Between 2 groups:Mother mean 95%Test of homoscedasticity by Levene test-------------------start')
        _, p = st.levene(xa, xb, center='mean')
        if p >= cls._SIGNIFICANCE:
            print(f'p-value= {p:.3f} //Test result:Adopting the null hypothesis, it cannot be said that the two samples are not homoscedastic.')
        else:
            print(f'p-value= {p:.3f} //Test result:Rejecting the null hypothesis, the two samples are not homoscedastic')

    @classmethod
    def ttest_rel(cls, xa, xb):
        """Run a paired t-test and print the test and the CI of the mean difference.

        The null hypothesis is that the two samples have the same mean.
        "Paired" means the i-th values of *xa* and *xb* were measured on the
        same subject, e.g. the same person before and after a treatment.
        The difference is reported as a non-negative value, so the result
        does not depend on the argument order.
        """
        print('Between 2 groups:Corresponding t-test-------------------start')
        t, p = st.ttest_rel(xa, xb)
        a = np.asarray(xa, dtype=float)
        b = np.asarray(xb, dtype=float)
        diffs = a - b
        pairs = diffs.size
        mu = abs(a.mean() - b.mean())
        # Computed from the paired differences rather than as mu / t, which
        # is undefined when the two means coincide (t == 0).
        se = diffs.std(ddof=1) / math.sqrt(pairs)
        # A paired test has (number of pairs - 1) degrees of freedom.
        cls._report_mean_difference(abs(t), p, mu, se, pairs - 1)

    @classmethod
    def ttest_ind_equal_var_true(cls, xa, xb):
        """Run Student's independent two-sample t-test (equal variances assumed).

        The null hypothesis is that the two samples have the same mean.
        "Independent" means the two samples come from different subjects.
        """
        print('Between 2 groups:No support(Between 2 groupsに等分散性あり)t-test-------------------start')
        t, p = st.ttest_ind(xa, xb, equal_var=True)
        # The two-sided p-value is order-independent; only the sign of t flips.
        cls._ttest_ind(abs(t), p, xa, xb, equal_var=True)

    @classmethod
    def ttest_ind_equal_var_false(cls, xa, xb):
        """Run Welch's independent two-sample t-test (variances may differ).

        The null hypothesis is that the two samples have the same mean.
        "Independent" means the two samples come from different subjects.
        """
        print('Between 2 groups:No support(Between 2 groupsに等分散性なし)t-test-------------------start')
        t, p = st.ttest_ind(xa, xb, equal_var=False)
        # The two-sided p-value is order-independent; only the sign of t flips.
        cls._ttest_ind(abs(t), p, xa, xb, equal_var=False)

    @classmethod
    def _ttest_ind(cls, t, p, xa, xb, equal_var=True):
        """Print the report for an independent two-sample t-test.

        Args:
            t: Test statistic to display.
            p: Two-sided p-value used for the decision.
            xa: First sample.
            xb: Second sample.
            equal_var: ``True`` uses the pooled variance with
                ``len(xa) + len(xb) - 2`` degrees of freedom; ``False`` uses
                the Welch standard error with Welch-Satterthwaite degrees of
                freedom.
        """
        a = np.asarray(xa, dtype=float)
        b = np.asarray(xb, dtype=float)
        size_a, size_b = a.size, b.size
        var_a, var_b = a.var(ddof=1), b.var(ddof=1)
        mu = abs(a.mean() - b.mean())
        if equal_var:
            dof = size_a + size_b - 2
            pooled = ((size_a - 1) * var_a + (size_b - 1) * var_b) / dof
            se = math.sqrt(pooled / size_a + pooled / size_b)
        else:
            q_a = var_a / size_a
            q_b = var_b / size_b
            se = math.sqrt(q_a + q_b)
            dof = (q_a + q_b) ** 2 / (q_a ** 2 / (size_a - 1) + q_b ** 2 / (size_b - 1))
        cls._report_mean_difference(t, p, mu, se, dof)

    @classmethod
    def _report_mean_difference(cls, t, p, mu, se, dof):
        """Print p-value, t value, mean difference, its standard error, CI and verdict.

        Args:
            t: Test statistic to display.
            p: Two-sided p-value used for the decision.
            mu: Absolute difference between the two sample means.
            se: Standard error of that difference.
            dof: Degrees of freedom of the t distribution used for the interval.
        """
        ci1, ci2 = st.t.interval(cls._CONFIDENCE, dof, loc=mu, scale=se)
        print(f'p-value={p:.3f} //t value= {t:.2f}')
        print(f'//Difference in mean= {mu:.2f} //Standard error of difference= {se:.2f}')
        print(f'//95 of mean difference%Confidence interval CI= [{ci1:.2f} , {ci2:.2f}]')
        if p >= cls._SIGNIFICANCE:
            print('//Test result:Adopting the null hypothesis, it cannot be said that there is a significant difference between the mean values of the two samples.')
        else:
            print('//Test result:Rejecting the null hypothesis, there is a significant difference in the mean of the two samples')

    @classmethod
    def chisquare(cls, sample, answer):
        """Run a chi-square goodness-of-fit test of *sample* against *answer*.

        The alternative hypothesis is that the observed data do not fit the
        theoretical distribution. Both arguments must provide ``tolist()``.
        """
        print('Goodness of fit test-------------------start')
        observed = sample.tolist()
        expected = answer.tolist()
        p = st.chisquare(observed, f_exp=expected)[1]
        if p >= cls._SIGNIFICANCE:
            print(f'p-value= {p:.3f} //Test result:It is not possible to adopt the null hypothesis and conclude that it does not fit the theoretical distribution.')
        else:
            print(f'p-value= {p:.3f} //Test result:We reject the null hypothesis and conclude that it does not fit the theoretical distribution.')

    @classmethod
    def chi2_contingency(cls, df):
        """Run a chi-square test of independence on the contingency table *df*.

        Example table (rows are groups, columns are outcome counts)::

                               carcinogenic   non-carcinogenic
            Smoking group            30              70
            Non-smoking group        20              80
        """
        print('Test of independence-------------------start')
        p = st.chi2_contingency(df.values)[1]
        if p >= cls._SIGNIFICANCE:
            print(f'p-value= {p:.3f} //Test result:Adopting the null hypothesis, we cannot conclude that the two variables are not independent.')
        else:
            print(f'p-value= {p:.3f} //Test result:Rejecting the null hypothesis, we conclude that the two variables are not independent.')

    @classmethod
    def pearsonr(cls, xa, xb):
        """Test whether the Pearson correlation between *xa* and *xb* is zero.

        The null hypothesis is a population correlation of 0; the alternative
        is a population correlation different from 0.
        """
        print('Test of correlation coefficient-------------------start')
        r, p = st.pearsonr(xa.values, xb.values)
        if p >= cls._SIGNIFICANCE:
            print(f'Correlation coefficient= {r:.3f} //p-value= {p:.3f} //Test result:Adopt the null hypothesis. It cannot be said that there is a correlation.')
        else:
            print(f'Correlation coefficient= {r:.3f} //p-value= {p:.3f} //Test result:Reject the null hypothesis. There is a correlation.')
An execution example of the modularized source code above is shown below. The output shows how these statistics help in understanding the CSV data.
Basic statistics------------------start
df.head(3)-------------
x1 x2 x3 x4 x5
0 1 11 1 1 1
1 2 12 1 1 2
2 3 13 1 1 3
df.dtypes-------------
x1 int64
x2 int64
x3 int64
x4 int64
x5 int64
dtype: object
df.describe(include='all')-------------
x1 x2 x3 x4 x5
count 10.000000 10.000000 10.000000 10.000000 10.00000
mean 3.000000 106.000000 1.100000 1.100000 3.10000
std 1.490712 81.493013 0.316228 0.316228 1.37032
min 1.000000 11.000000 1.000000 1.000000 1.00000
25% 2.000000 13.250000 1.000000 1.000000 2.00000
50% 3.000000 150.000000 1.000000 1.000000 3.00000
75% 4.000000 175.000000 1.000000 1.000000 4.00000
max 5.000000 190.000000 2.000000 2.000000 5.00000
Mother mean 95%Confidence interval-------------------start
Column name= x1 //Mother mean 95%Confidence interval CI= [1.93 , 4.07] //Specimen average[3.0]
Column name= x2 //Mother mean 95%Confidence interval CI= [47.70 , 164.30] //Specimen average[106.0]
Column name= x3 //Mother mean 95%Confidence interval CI= [0.87 , 1.33] //Specimen average[1.1]
Column name= x4 //Mother mean 95%Confidence interval CI= [0.87 , 1.33] //Specimen average[1.1]
Column name= x5 //Mother mean 95%Confidence interval CI= [2.12 , 4.08] //Specimen average[3.1]
Shapiro-Wilk test(Test of normality)------------------start
Column name= x1 //p-value= 0.341 //Test result:Adopting the null hypothesis, it cannot be said that there is no normality
Column name= x2 //p-value= 0.004 //Test result:Reject the null hypothesis, no normality
Column name= x3 //p-value= 0.000 //Test result:Reject the null hypothesis, no normality
Column name= x4 //p-value= 0.000 //Test result:Reject the null hypothesis, no normality
Column name= x5 //p-value= 0.410 //Test result:Adopting the null hypothesis, it cannot be said that there is no normality
Between 2 groups:Mother mean 95%Test of homoscedasticity by Levene test-------------------start
p-value= 0.000 //Test result:Rejecting the null hypothesis, the two samples are not homoscedastic
Between 2 groups:Mother mean 95%Test of homoscedasticity by Levene test-------------------start
p-value= 0.813 //Test result:Adopting the null hypothesis, it cannot be said that the two samples are not homoscedastic.
Between 2 groups:Corresponding t-test-------------------start
p-value=0.003 //t value= 4.01
//Difference in mean= 103.00 //Standard error of difference= 25.70
//95 of mean difference%Confidence interval CI= [49.01 , 156.99]
//Test result:Rejecting the null hypothesis, there is a significant difference in the mean of the two samples
Between 2 groups:Corresponding t-test-------------------start
p-value=0.343 //t value= 1.00
//Difference in mean= 0.10 //Standard error of difference= 0.10
//95 of mean difference%Confidence interval CI= [-0.11 , 0.31]
//Test result:Adopting the null hypothesis, it cannot be said that there is a significant difference between the mean values of the two samples.
Between 2 groups:No support(Between 2 groupsに等分散性あり)t-test-------------------start
p-value=0.001 //t value= 4.00
//Difference in mean= 103.00 //Standard error of difference= 25.77
//95 of mean difference%Confidence interval CI= [48.85 , 157.15]
//Test result:Rejecting the null hypothesis, there is a significant difference in the mean of the two samples
Between 2 groups:No support(Between 2 groupsに等分散性あり)t-test-------------------start
p-value=0.878 //t value= 0.16
//Difference in mean= 0.10 //Standard error of difference= 0.64
//95 of mean difference%Confidence interval CI= [-1.25 , 1.45]
//Test result:Adopting the null hypothesis, it cannot be said that there is a significant difference between the mean values of the two samples.
Between 2 groups:No support(Between 2 groupsに等分散性なし)t-test-------------------start
p-value=0.003 //t value= 4.00
//Difference in mean= 103.00 //Standard error of difference= 25.77
//95 of mean difference%Confidence interval CI= [48.85 , 157.15]
//Test result:Rejecting the null hypothesis, there is a significant difference in the mean of the two samples
Between 2 groups:No support(Between 2 groupsに等分散性なし)t-test-------------------start
p-value=0.878 //t value= 0.16
//Difference in mean= 0.10 //Standard error of difference= 0.64
//95 of mean difference%Confidence interval CI= [-1.25 , 1.45]
//Test result:Adopting the null hypothesis, it cannot be said that there is a significant difference between the mean values of the two samples.
Goodness of fit test-------------------start
p-value= 0.000 //Test result:We reject the null hypothesis and conclude that it does not fit the theoretical distribution.
Goodness of fit test-------------------start
p-value= 1.000 //Test result:It is not possible to adopt the null hypothesis and conclude that it does not fit the theoretical distribution.
Test of independence-------------------start
p-value= 0.142 //Test result:Adopting the null hypothesis, we cannot conclude that the two variables are not independent.
Test of independence-------------------start
p-value= 0.000 //Test result:Rejecting the null hypothesis, we conclude that the two variables are not independent.
Test of independence-------------------start
p-value= 1.000 //Test result:Adopting the null hypothesis, we cannot conclude that the two variables are not independent.
Test of correlation coefficient-------------------start
Correlation coefficient= 0.165 //p-value= 0.649 //Test result:Adopt the null hypothesis. It cannot be said that there is a correlation.
Test of correlation coefficient-------------------start
Correlation coefficient= 0.979 //p-value= 0.000 //Test result:Reject the null hypothesis. There is a correlation.
- It was confirmed that CSV data can be read into a pandas DataFrame and that descriptive statistics and statistical inference can then be applied easily.
- It was found that applying descriptive statistics and statistical inference helps in understanding CSV data.
- Comparison of test results of two-group t-tests with Python
- Summary of statistical hypothesis tests and confidence interval estimation methods using Python