When analyzing data, you will usually plot it to see what it looks like. It is convenient if a statistic showing the correlation between the two variables is displayed at the same time. This article therefore shows how to display the appropriate statistic on the appropriate graph, chosen according to whether each variable is categorical or numerical.
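Here is a summary of the appropriate graph for each combination of variable types and of the statistic that represents the correlation, as covered in earlier articles: for two categorical variables, count plots with Cramér's V; for one categorical and one numerical variable, violin plots or histograms with the correlation ratio; for two numerical variables, a joint plot with the correlation coefficient. Please see the links below for details: "Visualization method of data by explanatory variable and objective variable" and "How to find the correlation for categorical variables".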
We modify the previously created function, which draws the right graph according to the content of each variable (categorical or numerical), so that it also puts the right statistic on the graph (see: "I made a method to automatically select and visualize an appropriate graph for a pandas DataFrame").
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
def visualize_data(data, target_col, categorical_keys=None):
    """Plot every column of ``data`` against ``target_col`` with a statistic.

    For each column other than ``target_col`` one figure is shown. The kind of
    plot and the statistic used as its title depend on whether the column and
    the target are categorical:

    * categorical vs. categorical: two count plots, titled with ``cramerV``;
    * categorical vs. numerical: count plot and violin plot, titled with
      ``correlation_ratio``;
    * numerical vs. categorical: overall and per-target histograms, titled
      with ``correlation_ratio``;
    * numerical vs. numerical: joint plot, titled with the correlation
      coefficient returned by ``Series.corr``.

    Args:
        data: pandas DataFrame holding the columns to visualize.
        target_col: name of the column every other column is compared with.
        categorical_keys: names of the columns to treat as categorical. When
            ``None``, each column is classified with ``is_categorical``.
    """
    keys = data.keys()
    if categorical_keys is None:
        # Boolean-mask the column index with the per-column heuristic.
        categorical_keys = keys[[is_categorical(data, key) for key in keys]]

    # Loop invariants: figure geometry and the kind of the target column.
    length = 10
    subplot_size = (length, length / 2)
    target_is_categorical = target_col in categorical_keys

    for key in keys:
        if key == target_col:
            continue
        key_is_categorical = key in categorical_keys

        if key_is_categorical and target_is_categorical:
            r = cramerV(key, target_col, data)
            _, axes = plt.subplots(1, 2, figsize=subplot_size)
            sns.countplot(x=key, data=data, ax=axes[0])
            sns.countplot(x=key, data=data, hue=target_col, ax=axes[1])
            axes[1].set_title(r)
            plt.tight_layout()
            plt.show()
        elif key_is_categorical:
            r = correlation_ratio(cat_key=key, num_key=target_col, data=data)
            _, axes = plt.subplots(1, 2, figsize=subplot_size)
            sns.countplot(x=key, data=data, ax=axes[0])
            sns.violinplot(x=key, y=target_col, data=data, ax=axes[1])
            axes[1].set_title(r)
            plt.tight_layout()
            plt.show()
        elif target_is_categorical:
            r = correlation_ratio(cat_key=target_col, num_key=key, data=data)
            fig, axes = plt.subplots(1, 2, figsize=subplot_size)
            sns.distplot(data[key], ax=axes[0], kde=False)
            # FacetGrid is only used to iterate over the target's levels; the
            # histograms themselves are redirected onto axes[1].
            g = sns.FacetGrid(data, hue=target_col)
            g.map(sns.distplot, key, ax=axes[1], kde=False)
            axes[1].set_title(r)
            axes[1].legend()
            # Lay out the two-panel figure explicitly: at this point the
            # current pyplot figure is presumably the FacetGrid's own one.
            fig.tight_layout()
            # Discard the current (FacetGrid) figure so only ours is shown.
            plt.close()
            plt.show()
        else:
            # Pairwise correlation of just these two columns; building the
            # whole matrix with data.corr() on every iteration is wasteful and
            # fails on non-numeric columns in recent pandas versions.
            r = data[key].corr(data[target_col])
            sns.jointplot(x=key, y=target_col, data=data, height=length * 2 / 3)
            plt.title(r)
            plt.show()
In addition, the following helper functions are used along the way.
def is_categorical(data, key, max_unique=5):
    """Guess whether column ``key`` of ``data`` is a categorical variable.

    Floating-point columns are never categorical. Integer columns (of any
    width or signedness) are categorical only when they hold at most
    ``max_unique`` distinct values. Every other dtype is categorical.

    Args:
        data: pandas DataFrame containing the column.
        key: name of the column to classify.
        max_unique: largest number of distinct values an integer column may
            have and still be treated as categorical.

    Returns:
        ``True`` when the column is considered categorical, else ``False``.
    """
    column = data[key]
    # Use the dtype predicates rather than comparing with 'int'/'float', which
    # only matches the platform-default widths (int32, float32, ... would
    # otherwise be misclassified as categorical).
    if pd.api.types.is_float_dtype(column):
        return False
    if pd.api.types.is_integer_dtype(column):
        return column.nunique() <= max_unique
    return True
def correlation_ratio(cat_key, num_key, data):
    """Return the correlation ratio between a categorical and a numerical column.

    The value is the between-group sum of squares of ``num_key`` (grouped by
    the levels of ``cat_key``) divided by its total sum of squares.

    Only rows where both columns are present are used, so the overall mean,
    the total sum of squares and the group sums all refer to the same rows.

    Args:
        cat_key: name of the categorical column.
        num_key: name of the numerical column.
        data: pandas DataFrame containing both columns.

    Returns:
        The ratio as a float, or NaN when the numerical values have no
        variation (including when no complete rows remain).
    """
    categorical = data[cat_key]
    numerical = data[num_key]

    # Keep only complete pairs; a row with a missing category must not count
    # toward the total variance when it cannot count toward any group.
    complete = categorical.notna() & numerical.notna()
    categorical = categorical[complete]
    numerical = numerical[complete]

    grand_mean = numerical.mean()
    total_ss = ((numerical - grand_mean) ** 2).sum()
    if total_ss == 0:
        # The ratio is undefined without any variation to explain.
        return float("nan")

    # Group positionally by the category values (an array, so no index
    # alignment is involved) and weight each group's squared mean offset by
    # its size.
    groups = numerical.groupby(categorical.to_numpy(), sort=False)
    between_ss = (groups.count() * (groups.mean() - grand_mean) ** 2).sum()
    return between_ss / total_ss
def cramerV(x, y, data):
    """Return Cramér's V between two categorical columns of ``data``.

    Computed from the chi-squared statistic (without continuity correction)
    of the contingency table of the two columns as
    ``sqrt(chi2 / (n * (min(rows, cols) - 1)))``.

    Args:
        x: name of the first categorical column.
        y: name of the second categorical column.
        data: pandas DataFrame containing both columns.

    Returns:
        Cramér's V as a float, or NaN when either column has fewer than two
        observed levels (the statistic is undefined in that case).
    """
    table = pd.crosstab(data[x], data[y])
    smaller_dim = min(table.shape)
    if smaller_dim < 2:
        # Avoid a 0/0 division (or a failure on an empty table).
        return np.nan

    chi2, _p, _dof, _expected = st.chi2_contingency(table, correction=False)
    n = table.sum().sum()
    return np.sqrt(chi2 / (n * (smaller_dim - 1)))
Let's apply it to the Titanic data (only part of the result is shown).
# Load the training set, dropping the columns excluded from the visualization.
train_data = pd.read_csv("train.csv").drop(
    columns=["PassengerId", "Name", "Ticket", "Cabin"]
)
# Columns to treat as categorical, passed explicitly instead of being guessed.
categories = ["Survived", "Pclass", "Sex", "Embarked"]
visualize_data(train_data, "Survived", categories)
I have summarized the methods I have made so far. Now you can visualize the data and see the correlation at a glance. The source code is on GitHub, so feel free to use it!
Recommended Posts