This note collects code that is frequently used in EDA (Exploratory Data Analysis), which is performed at the beginning of data analysis. Here we specifically assume a classification problem (such as predicting passenger survival on the Titanic).
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# Load the Titanic sample dataset and normalise missing values to np.nan,
# so that the later unique() counts treat them consistently.
df = sns.load_dataset("titanic").replace(float("nan"), np.nan)
df
# Print the number of distinct values (NaN included) for every column.
for colname, column in df.items():
    uni = len(column.unique())
    print("{0:<20} : {1}".format(colname, uni))
# Column used as the prediction target.
target = "survived"

# Categorical feature columns ("alive" is deliberately ignored).
cate_list = [
    "pclass",
    "sex",
    "sibsp",
    "parch",
    "embarked",
    "class",
    "who",
    "adult_male",
    "deck",
    "embark_town",
    "alone",
]

# Numerical feature columns.
num_list = ["age", "fare"]

# Every feature column: categorical first, then numerical.
all_list = [*cate_list, *num_list]
See the article here
If you just want to check it easily, use the following two types.
# Axes-level count plot of pclass, split by the target column.
sns.countplot(data=df, x="pclass", hue=target)
# Figure-level equivalent drawn through catplot.
sns.catplot(data=df, x="pclass", hue=target, kind="count")
Furthermore, if you also want to see the count of NaN values and the average count per category, define and use the following function.
def category_plot(x, hue, data, order=None):
    """Draw a count plot of ``x`` split by ``hue`` with per-hue average lines.

    Missing values in ``x`` are shown as their own ``"NaN"`` category, which
    is placed last when ``order`` is not given. For every hue level a dashed
    horizontal line marks the average count per non-missing category; the
    line spans the first ``len(non-missing categories)`` bar positions.

    Args:
        x: Name of the column plotted on the x axis.
        hue: Name of the column used to split the bars.
        data: DataFrame holding both columns. It is not modified.
        order: Optional explicit order of the x categories, given as the
            string labels used on the axis. When omitted or empty, the
            sorted labels are used with ``"NaN"`` appended last.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Convert the column to string labels on a copy, so the caller's
    # DataFrame keeps its original dtype and values.
    labels = data[x].astype("str").replace("nan", "NaN")
    plot_data = data.copy()
    plot_data[x] = labels

    is_nan = labels == "NaN"
    has_nan = bool(is_nan.any())
    categories = sorted(labels[~is_nan].unique())
    n_categories = len(categories)
    hue_levels = sorted(plot_data[hue].unique())

    if order is None or len(order) == 0:
        # Build a fresh list so neither a default nor `categories` is mutated.
        order = categories + ["NaN"] if has_nan else list(categories)

    colors = plt.get_cmap("tab10").colors
    sns.countplot(x=x, hue=hue, data=plot_data, order=order,
                  hue_order=hue_levels)

    # With no non-missing category there is nothing to average over
    # (and the division below would fail), so skip the average lines.
    if n_categories > 0:
        for i, level in enumerate(hue_levels):
            # Count only rows whose x value is present, so the numerator
            # matches the non-missing categories used as the denominator.
            n_rows = int(((plot_data[hue] == level) & ~is_nan).sum())
            h = n_rows / n_categories
            plt.plot(
                [-0.5, n_categories - 0.5],
                [h, h],
                # Wrap around so more hue levels than palette entries
                # does not raise IndexError.
                color=colors[i % len(colors)],
                linestyle="dashed",
                label="{0} (average)".format(level),
            )

    plt.legend()
    plt.show()
# Draw the count plot with average lines for a few categorical columns.
for column in ("pclass", "embarked", "deck"):
    category_plot(x=column, hue=target, data=df)
For a quick check, looking at the following two types of plots is usually sufficient.
# Distribution of age per target class, as a swarm plot and as a violin plot.
for plot_kind in ("swarm", "violin"):
    sns.catplot(x=target, y="age", data=df, kind=plot_kind)
seaborn:seaborn.catplot seaborn:seaborn.countplot
Recommended Posts