Pandas is a library that makes it easy to work with tabular data. This article focuses on the minimum you need to know about Pandas. It assumes Python 3.
You can load the library using `import`. By convention, it is often abbreviated as `pd`.
Pandas_1.py
import pandas as pd
Series
A Series is one-dimensional, list-like data in which each element has an index label, much like the keys of a dictionary.
Pandas_2.py
import pandas as pd
# Build a Series from a dict: the keys become the index labels, the values become the data.
olympic_years = {
    'Tokyo': 2020,
    'Rio de Janeiro': 2016,
    'London': 2012,
}
series_olympic = pd.Series(olympic_years)
print(series_olympic)
Pandas_3.py
import pandas as pd
# Demonstrate the basic ways of reading data out of a Series.
series_olympic = pd.Series(
    {'Tokyo': 2020, 'Rio de Janeiro': 2016, 'London': 2012}
)
print(series_olympic.iloc[0:2])  # Take the first two elements by position.
print(series_olympic.index)  # Extract only the index.
print(series_olympic.values)  # Extract only the values.
# A boolean mask keeps only the elements for which the condition is True.
divisible_by_8 = series_olympic % 8 == 0
print(series_olympic[divisible_by_8])
Pandas_4.py
import pandas as pd
# Add an element to a Series and then remove one.
series_olympic = pd.Series({'Tokyo': 2020, 'Rio de Janeiro': 2016, 'London': 2012})
# Series.append was removed in pandas 2.0; pd.concat is the supported way to add elements
# and also works on older pandas versions.
series_olympic = pd.concat([series_olympic, pd.Series({'Beijing': 2008})])  # Add a new element.
print(series_olympic)
series_olympic = series_olympic.drop('Tokyo')  # Delete the element (drop returns a new Series).
print(series_olympic)
Pandas_5.py
import pandas as pd
# Demonstrate sorting a Series; each sort returns a new Series and leaves the original untouched.
series_olympic = pd.Series(
    {'Tokyo': 2020, 'Rio de Janeiro': 2016, 'London': 2012}
)
sorted_by_index = series_olympic.sort_index()  # Ascending order of index.
print(sorted_by_index)
sorted_by_value = series_olympic.sort_values()  # Ascending order of value.
print(sorted_by_value)
sorted_by_value_desc = series_olympic.sort_values(ascending=False)  # Descending order of value.
print(sorted_by_value_desc)
DataFrame
A DataFrame is tabular data made by joining multiple Series together as columns.
Pandas_6.py
import pandas as pd
# Build a DataFrame from three Series, relabel its rows and columns, then create an empty one.
series_name = pd.Series(['Ichiro', 'Jiro', 'Saburo'])
series_height = pd.Series([200, 173, 141])
series_weight = pd.Series([100, 72, 40])
# Each dict key becomes a column name; each Series becomes that column's data.
column_data = {
    'name': series_name,
    'height': series_height,
    'weight': series_weight,
}
df_humans = pd.DataFrame(column_data)
print(df_humans)
row_labels = ['Ichiro', 'Jiro', 'Saburo']
column_labels = ['name', 'height', 'body weight']
df_humans.index = row_labels  # Give row names.
df_humans.columns = column_labels  # Give column names.
print(df_humans)
# Create an empty DataFrame that has only the specified column names.
df_humans_empty = pd.DataFrame(columns=['name', 'height', 'body weight'])
print(df_humans_empty)
Pandas_7.py
import pandas as pd
# Build a sample DataFrame of four people, then demonstrate column selection,
# label-based selection (loc), position-based selection (iloc) and boolean filtering.
series_name = pd.Series(['Ichiro', 'Jiro', 'Saburo', 'Siro'])
series_height = pd.Series([200, 173, 141, 172])
series_weight = pd.Series([100, 72, 40, 72])
series_gender = pd.Series(['Man', 'Man', 'woman', 'Man'])
series_bmi = pd.Series([25, 24, 20, 24.9])
# All five columns must be inside the single dict passed to DataFrame.
df_humans = pd.DataFrame({
    'name': series_name,
    'height': series_height,
    'weight': series_weight,
    'gender': series_gender,
    'bmi': series_bmi,
})
df_humans.index = ['Ichiro', 'Jiro', 'Saburo', 'Shiro']
df_humans.columns = ['name', 'height', 'body weight', 'sex', 'BMI']
print(df_humans['name'])  # Extract the "name" column.
print(df_humans.name)  # Attribute access also extracts the "name" column.
print(df_humans.loc['Ichiro', 'body weight'])  # Extract by specifying the row name and column name.
print(df_humans.loc[['Ichiro', 'Jiro'], ['height', 'body weight', 'BMI']])  # You can also specify multiple rows and multiple columns.
print(df_humans.loc['Jiro'])  # Extracts the entire specified row.
print(df_humans.loc[:, 'BMI'])  # Extracts the entire specified column.
print(df_humans.iloc[0, 2])  # Extract by specifying the row index number and column index number.
print(df_humans.iloc[[0, 1], [1, 2, 4]])  # You can also specify multiple rows and multiple columns.
print(df_humans.iloc[1])  # Extracts the entire specified row.
print(df_humans.iloc[:, 4])  # Extracts the entire specified column.
print(df_humans[df_humans['BMI'] >= 25])  # Extract only the rows that meet the condition.
# Each condition needs its own parentheses because & and | bind tighter than comparisons.
print(df_humans[(df_humans['height'] >= 170) & (df_humans['body weight'] >= 70)])  # Multiple conditions (and) are also possible.
print(df_humans[(df_humans['body weight'] < 70) | (df_humans['BMI'] < 25)])  # Multiple conditions (or) are also possible.
print(df_humans[df_humans['BMI'] < 25]['name'])  # Filter the rows by a condition, then pick a column.
Pandas_8.py
import pandas as pd
# Build a sample DataFrame of four people, then demonstrate sorting rows by column values.
series_name = pd.Series(['Ichiro', 'Jiro', 'Saburo', 'Siro'])
series_height = pd.Series([200, 173, 141, 172])
series_weight = pd.Series([100, 72, 40, 72])
series_gender = pd.Series(['Man', 'Man', 'woman', 'Man'])
series_bmi = pd.Series([25, 24, 20, 24.9])
# All five columns must be inside the single dict passed to DataFrame.
df_humans = pd.DataFrame({
    'name': series_name,
    'height': series_height,
    'weight': series_weight,
    'gender': series_gender,
    'bmi': series_bmi,
})
df_humans.index = ['Ichiro', 'Jiro', 'Saburo', 'Shiro']
df_humans.columns = ['name', 'height', 'body weight', 'sex', 'BMI']
df_humans = df_humans.sort_values(by='body weight')  # Sort by body weight in ascending order.
print(df_humans)
df_humans = df_humans.sort_values(by='body weight', ascending=False)  # Sort by body weight in descending order.
print(df_humans)
# With a list of columns, later columns break ties in the earlier ones.
df_humans = df_humans.sort_values(by=['body weight', 'BMI'])  # Sort by body weight, then BMI, in ascending order.
print(df_humans)
Pandas_9.py
import pandas as pd
# Build a sample DataFrame, then demonstrate adding and deleting columns and rows.
series_name = pd.Series(['Ichiro', 'Jiro', 'Saburo'])
series_height = pd.Series([200, 173, 141])
series_weight = pd.Series([100, 72, 40])
df_humans = pd.DataFrame(
    {'name': series_name, 'height': series_height, 'weight': series_weight}
)
df_humans.index = ['Ichiro', 'Jiro', 'Saburo']
df_humans.columns = ['name', 'height', 'body weight']
df_humans['sex'] = ['Man', 'Man', 'woman']  # Add a column.
# A column can also be computed from other columns: BMI = weight [kg] / (height [m]) ** 2.
height_in_meters = df_humans['height'] / 100
df_humans['BMI'] = df_humans['body weight'] / (height_in_meters ** 2)
print(df_humans)
# Assigning to a row label that does not exist yet appends a new row.
df_humans.loc['Shiro'] = ['Siro', 170, 72, 'Man', 24.9]
print(df_humans)
# drop returns a new DataFrame; df_humans itself is left unchanged.
df_humans_2 = df_humans.drop(index='Saburo')  # Delete the row.
print(df_humans_2)
df_humans_3 = df_humans.drop(columns='sex')  # Delete the column.
print(df_humans_3)
Pandas_10.py
import pandas as pd
# Read and write CSV, tab-delimited text and Excel files.
# The 'filepath/...' paths are placeholders; replace them with real file locations.
df_csv = pd.read_csv('filepath/filename.csv')  # Read the CSV file.
# The tab separator must be the escape sequence '\t' (backslash + t).
df_text = pd.read_csv('filepath/filename.txt', sep='\t')  # Read a tab-delimited text file.
df_excel = pd.read_excel('filepath/filename.xlsx')  # Read the Excel file.
df_csv_2 = pd.read_csv('filepath/filename_2.csv', header=1)  # If the first row is blank and you want the second row to be the column names.
df_csv_3 = pd.read_csv('filepath/filename_3.csv', header=None)  # If there are no column names.
df_excel_sheet2 = pd.read_excel('filepath/filename.xlsx', sheet_name=1)  # Specify the index number (starting from 0) of the sheet.
df_excel_sheet2 = pd.read_excel('filepath/filename.xlsx', sheet_name='sheet2')  # Specify the sheet name.
df_csv.to_csv('filepath/filename.csv')  # Export a CSV file.
df_text.to_csv('filepath/filename.txt', sep='\t')  # Export a tab-delimited text file.
df_excel.to_excel('filepath/filename.xlsx')  # Export an Excel file.
df_csv.to_csv('filepath/filename.csv', index=False)  # If you don't need the index number in the leftmost column.
Here, we introduced the basics of Pandas. Once you can do all of this, you will be able to read external files, process them, and write them out.
What is the programming language Python? Can it be used for AI and machine learning?
Recommended Posts