We have summarized the methods that are often used when analyzing data with pandas. We will make corrections as needed.
#Create a DataFrame and store the list
pd_sample = pd.DataFrame(list)
#Create a DataFrame and store multiple lists in a dictionary
pd_sample = pd.DataFrame({"Column A":List A, "Column B":List B})
#Create an empty DataFrame and join another DataFrame in subsequent processing
pd_sampleA = pd.DataFrame()
pd_sampleA = pd.concat([pd_sampleA, pd_sampleB])
#pandas.concat()function
pd_sampleAll = pd.concat([pd_sampleA, pd_sampleB], ignore_index=True)
pd_sampleA
Column A | Column B | Column C | |
---|---|---|---|
0 | a | b | c |
pd_sampleB
Column C | Column D | |
---|---|---|
0 | y | z |
pd_sampleAll = pd.concat([pd_sampleA, pd_sampleB], ignore_index=True)
Column A | Column B | Column C | Column D | |
---|---|---|---|---|
0 | a | b | c | NaN |
1 | NaN | NaN | y | z |
#pandas.merge()function
pd_sampleAll = pd.merge(pd_sampleA, pd_sampleB, on="Column label", how="left")
#pandas.DataFrame merge()Method
pd_sampleAll = pd_sampleA.merge(pd_sampleB, on="Column label", how="left")
#Combine multiple columns into a key
pd_sampleAll = pd.merge(pd_sampleA, pd_sampleB, on=["Column A", "Column B"], how="left")
#Join when key column names are different
pd_sampleAll = pd.merge(pd_sampleA, pd_sampleB, left_on="Column A", right_on="Column AAA", how="left")
pd_sampleA
Column A | Column B | Column C | |
---|---|---|---|
0 | a | b | c |
1 | aaa | bbb | ccc |
pd_sampleB
Column A | Column D | |
---|---|---|
0 | a | x |
1 | aa | y |
** pd_sampleAll = pd.merge (pd_sampleA, pd_sampleB, on = "column A", how = "left") **
Column A | Column B | Column C | Column D | |
---|---|---|---|---|
0 | a | b | c | x |
1 | aaa | bbb | ccc | NaN |
#Specify column name pandas.Extract DataFrame
pd_sampleA = pd_sampleAll[["Column A", "Column B"]]
#pandas.loc()Extract by specifying the position of row label and column label with function
#Series type is returned when a single row is fetched, and DataFrame type is returned when multiple rows are fetched.
#In addition to the line name, you can also specify a conditional expression that returns a boolean type in the line label.
#You can also slice multiple columns and rows
pd_sampleA = pd_sampleAll.loc["Row label", "Column label"]
pd_sampleA = pd_sampleAll["Column label"].loc["Row label"]
#pandas.iloc()Extract by specifying the position of row number and column number with the function
#Series type is returned when a single row is fetched, and DataFrame type is returned when multiple rows are fetched.
#You can also slice multiple columns and rows
pd_sampleA = pd_sampleAll.iloc["line number", "Column index"]
pd_sampleAll
Column A | Column B | Column C | Column D | |
---|---|---|---|---|
0 | a | b | c | d |
1 | aaa | bbb | ccc | ddd |
** pd_sampleA = pd_sampleAll.loc [pd_sampleAll ["Column A"] == "aaa", ["Column B", "Column C"]] **
Column B | Column C | |
---|---|---|
1 | bbb | ccc |
#pandas.Series.isnull()Get the row where the missing value exists in the method
pd_sample = pd_sample["Column A"].isnull()
#isnull()Any to method()Check for the presence of columns containing missing values using
pd_sample["Column A"].isnull().any(axis=0)
#pandas.DataFrame.dropna()Delete a row that is missing a particular column in the method
pd_sample = pd_sample.dropna(subset=["Column A"])
#pandas.DataFrame.fillna()Replace missing values with arbitrary values in the method
pd_sample = pd_sample.fillna({"Column A": "AAA", "Column B": "BBB"}))
#pandas.to_datetime()String in function(object type)To datetime type
pd_sample["date"] = pd.to_datetime(pd_sample["date"])
#Get date, hour, minute, second, day of the week, etc. using dt accessor for datetime type
pd_sample["Month"] = pd_sample["date"].dt.month
#datetime type strftime()Convert date and time to a string in any format with the method
pd_sample["Year / month"] = pd_sample["date"].dt.strftime("%Y%m")
#pandas.to_timedelta()Get the number of days difference between two dates with a function
import datetime
pd_sample["Number of days elapsed"] = pd.to_timedelta(Timestamp 1-Timestamp 2)
#Get the monthly difference between two dates using the relativedelta object of the dateutil module
#The difference between the two dates in the return value(years=+1, months=+10, days=+8)Is stored in the form of
from dateutil.relativedelta import relativedelta
delta = relativedelta(Timestamp 1,Timestamp 2)
pd_sample["Elapsed months"] = delta.years*12 + delta.months
#pandas.groupby()Aggregate by specifying the column you want to aggregate with the function and the aggregation method
#By default the group label is index, so if you don't want to do that, as_index=Specify False.
pd_count = pd_sample.groupby("The column you want to put together").count()
pd_sum = pd_sample.groupby(["Column 1 you want to summarize", "Column 2 you want to summarize"], as_index=False).sum()[["Column 1 you want to sum", "Column 2 you want to sum"]]
#pandas.unique()Get unique data count excluding duplicates with function
pd_unique = pd.unique(pd_sample["Column A"])
pd_unique = pd_sample["Column A"].unique()
#pandas.agg()The function outputs the mean, median, maximum, and minimum values for a particular column
pd_sample = pd_sample.agg(["mean", "median", "max", "min"])["Columns you want to aggregate"]
#pandas.DataFrame.describe()Outputs the number of data items, mean value, standard deviation, minimum value, quartile, median value, and maximum value with the function
#By default, only numeric columns (integer type int, floating point type float) are selected
pd_sample.describe
#Manipulate strings using str accessor for DateFrame type
pd_sample["Column A"] = pd_sample["Column A"].str.replace(" ", "")
#pandas.Where in DataFrame()Assign a value according to the condition in the method
#If True, the calling object is applied, if False, the second argument is applied
pd_sample["Column A"] = pd_sample["Column A"].where(pd_sample["Column B"]>3, 0)
#pandas.DataFrame mask()Assign a value according to the condition in the method
#If True, the second argument is applied, if False, the calling object is applied.
pd_sample["Column A"] = pd_sample["Column A"].mask(pd_sample["Column B"]>3, 0)
Recommended Posts