In this article, I draw various graphs with Altair, a graph creation library for Python. A distinguishing feature of Altair is that it takes its input data as a pandas DataFrame.
This article uses the Titanic passenger dataset published on Kaggle. The data format is as follows.
train.csv
PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
How to read the data is shown in the code examples below (see the reference).
Altair can be installed with pip.
Terminal
pip install altair
Altair is particularly well suited to creating scatter plots. Even numerical data is treated as categorical data when `:O` is appended to the field name.
altair_demo.py
import os
import altair as alt
import pandas as pd
# Build the path to train.csv in the current working directory.
csv_path = os.path.join(os.getcwd(), 'train.csv')

# Read the comma-separated file; column 0 becomes the index, row 0 the header.
passengers = pd.read_table(csv_path, header=0, index_col=0, sep=',')

# Declare every encoding channel up front. The ':O' suffix makes Altair treat
# the numeric Survived and Pclass fields as categorical data.
channels = dict(
    x=alt.X('Age'),
    y=alt.Y('Fare'),
    column=alt.Column('Survived:O'),
    color=alt.Color('Sex', sort=['male', 'female']),
    tooltip=['Age', 'Fare', 'Name'],
    size=alt.Size('Pclass:O'),
)

# Assemble the chart step by step: circle marks -> encodings -> size -> pan/zoom.
circles = alt.Chart(passengers).mark_circle().encode(**channels)
scatter_plot = circles.properties(width=600, height=500).interactive()

scatter_plot.show()
When drawing a line segment, you can connect two points by putting the coordinates of the start point and end point in a DataFrame. The intercept and slope of the linear regression are determined with scikit-learn.
altair_demo.py
import os
import altair as alt
import pandas as pd
from sklearn.linear_model import LinearRegression
# Load train.csv from the current working directory; column 0 is the index.
cwd = os.getcwd()
path = ['train.csv']
file = os.path.join(cwd, *path)
df = pd.read_table(file, sep=',', index_col=0, header=0)

# Delete the rows containing a missing Age or Fare so the model only sees
# complete (Age, Fare) pairs.
linear_df = df.dropna(subset=['Age', 'Fare'], how='any', axis=0)

# Create a linear regression model.
# X has to be 2-D (n_samples, 1). y is deliberately passed as a 1-D vector:
# with a 2-D y, coef_[0] and intercept_ come back as 1-element arrays, and the
# 'Fare' column below would be built from arrays instead of numbers.
linear = LinearRegression().fit(
    linear_df['Age'].values.reshape(-1, 1),
    linear_df['Fare'].values,
)

# Parameter determination: slope and intercept as plain Python floats.
a = float(linear.coef_[0])
b = float(linear.intercept_)

# Threshold determination: span the line over the Age range that was fitted.
x_min = linear_df['Age'].min()
x_max = linear_df['Age'].max()

# Creating a data frame holding the two end points of the regression line.
linear_points = pd.DataFrame({
    'Age': [x_min, x_max],
    'Fare': [a * x_min + b, a * x_max + b],
}).astype(float)

# A line mark connects the two end points.
linear_line = alt.Chart(linear_points).mark_line(color='steelblue').encode(
    x=alt.X('Age'),
    y=alt.Y('Fare'),
).properties(
    width=500,
    height=500,
).interactive()

linear_line.show()
It is also possible to display it on top of the scatter plot.
altair_demo.py
import os

import altair as alt
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load train.csv from the current working directory; column 0 is the index.
cwd = os.getcwd()
path = ['train.csv']
file = os.path.join(cwd, *path)
df = pd.read_table(file, sep=',', index_col=0, header=0)

# Scatter layer: one circle per passenger.
scatter_plot = alt.Chart(df).mark_circle(size=50).encode(
    x=alt.X('Age'),
    y=alt.Y('Fare'),
).properties(
    width=500,
    height=500,
)

# Regression layer. Only rows with both Age and Fare present are fitted.
linear_df = df.dropna(subset=['Age', 'Fare'], how='any', axis=0)

# y is passed as a 1-D vector so that coef_[0] and intercept_ are scalars.
linear = LinearRegression().fit(
    linear_df['Age'].values.reshape(-1, 1),
    linear_df['Fare'].values,
)
a = float(linear.coef_[0])
b = float(linear.intercept_)

# End points of the regression line over the fitted Age range.
x_min = linear_df['Age'].min()
x_max = linear_df['Age'].max()
linear_points = pd.DataFrame({
    'Age': [x_min, x_max],
    'Fare': [a * x_min + b, a * x_max + b],
}).astype(float)

linear_line = alt.Chart(linear_points).mark_line(color='steelblue').encode(
    x=alt.X('Age'),
    y=alt.Y('Fare'),
).properties(
    width=500,
    height=500,
)

# Layer the line on top of the scatter plot. interactive() is applied once to
# the layered chart so both layers pan and zoom together.
(scatter_plot + linear_line).interactive().show()
altair_demo.py
import os
import altair as alt
import pandas as pd
# Build the path to train.csv in the current working directory and load it,
# with column 0 as the index and row 0 as the header.
csv_path = os.path.join(os.getcwd(), 'train.csv')
passengers = pd.read_table(csv_path, header=0, index_col=0, sep=',')

# Rows without an Embarked value are dropped before plotting.
embarked_known = passengers.dropna(axis=0, how='any', subset=['Embarked'])

# Box plot of Fare per Survived value, one column per port of embarkation,
# coloured by Sex.
box_marks = alt.Chart(embarked_known).mark_boxplot()
box_encoded = box_marks.encode(
    x=alt.X('Survived:O'),
    y=alt.Y('Fare'),
    column=alt.Column('Embarked', sort=['S', 'Q', 'C']),
    color=alt.Color('Sex', sort=['male', 'female']),
)
boxplot = box_encoded.properties(width=600, height=500).interactive()

boxplot.show()
By setting the Y-axis to `count()`, Altair counts the elements. You can configure the bins with `alt.X()`.
altair_demo.py
import os
import altair as alt
import pandas as pd
# Build the path to train.csv in the current working directory and load it,
# with column 0 as the index and row 0 as the header.
csv_path = os.path.join(os.getcwd(), 'train.csv')
passengers = pd.read_table(csv_path, header=0, index_col=0, sep=',')

# Age is binned in steps of 10 over the range 0-90.
age_bins = alt.Bin(step=10, extent=[0, 90])

# Bar heights come from the count() aggregate; bars are split into one column
# per Survived value and distinguished by Sex through colour and opacity.
sex_order = ['male', 'female']
bars = alt.Chart(passengers).mark_bar().encode(
    x=alt.X("Age", bin=age_bins),
    y=alt.Y('count()'),
    column=alt.Column('Survived:O'),
    color=alt.Color('Sex', sort=sex_order),
    opacity=alt.Opacity('Sex', sort=sex_order),
)
histogram = bars.properties(width=600, height=500).interactive()

histogram.show()
You can save the created figure as html by installing the following package.
Terminal
pip install altair_saver
Add `.interactive()` to make the graph freely pannable and zoomable. This property is preserved in the saved HTML.
altair_demo.py
import os
import altair as alt
import pandas as pd
# Load train.csv from the current working directory; column 0 is the index.
cwd = os.getcwd()
path = ['train.csv']
file = os.path.join(cwd, *path)
df = pd.read_table(file, sep=',', index_col=0, header=0)

# Interactive box plot of Fare per Survived value, using only the rows that
# have an Embarked value.
boxplot = alt.Chart(
    df.dropna(subset=['Embarked'], how='any', axis=0)
).mark_boxplot().encode(
    x=alt.X('Survived:O'),
    y=alt.Y('Fare'),
).interactive()

# The output file name must be given as a string; the bare expression
# boxplot.html would be an attribute lookup on the chart object instead.
boxplot.save(fp='boxplot.html')
If you want to save in a format other than `.html`, see the linked reference.
By combining with Streamlit, you can create various data analysis applications.
Recommended Posts