Recently, I studied pandas by working through Data Science 100 Knocks (Structured Data Processing). I first tried to solve every problem on my own, looking things up rather than peeking at the answers, and only afterwards checked my solutions against the official ones. Below is my own summary of the techniques from that study that I think I can use in practice.
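Note: the snippets below assume roughly the following imports (my assumption; the original notebooks import these up front):
import math
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler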
# Read a file. UTF-8 encoded, tab-delimited
df = pd.read_csv('data/d.csv', header=0, encoding='utf-8', sep='\t')
# Write a file. With a header, UTF-8 encoded, comma-separated
df.to_csv('data/d.csv', header=True, encoding='utf-8', sep=',')
# Write a file. No header, CP932 encoded, tab-delimited
df.to_csv('data/d.csv', header=False, encoding='cp932', sep='\t')
# Copy a DataFrame (changes to df2 do not affect df)
df2 = df.copy()
pd.merge(df_receipt, df_store[['store_cd', 'store_name']], on='store_cd', how='inner')
# inner: inner join, left: left outer join, right: right outer join, outer: full outer join
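For example, switching to how='left' keeps every row of df_receipt even when no matching store exists (a sketch using the same frames as above):
pd.merge(df_receipt, df_store[['store_cd', 'store_name']], on='store_cd', how='left')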
pd.concat([df1, df2], axis=1)
# Unlike merge, the DataFrames are passed as a list. axis=0 concatenates vertically, axis=1 horizontally.
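For comparison, a vertical stack looks like this (ignore_index=True renumbers the rows; my own addition):
pd.concat([df1, df2], axis=0, ignore_index=True)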
len(df)  # number of rows
len(df['id'].unique())  # number of unique ids
df['id'].value_counts()  # frequency of each value
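As a shortcut, nunique() returns the same count of distinct values:
df['id'].nunique()  # equivalent to len(df['id'].unique())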
# Check the number of missing values in each column
df.isnull().sum()
# Drop records that contain at least one missing value
df.dropna()
# fillna accepts a dictionary, so several columns can be filled at once
df.fillna({'price': df['price'].mean(),
           'cost': df['cost'].median()})
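As a quick check that the fill worked (a sketch with the same column names):
df_filled = df.fillna({'price': df['price'].mean(), 'cost': df['cost'].median()})
df_filled.isnull().sum()  # price and cost should now report 0 missing values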
# Specify the key columns with subset and drop duplicates
df.drop_duplicates(subset=['name', 'cd'], keep='first', inplace=True)
# Rename specific columns with rename
df.rename(columns={'ymd':'sales_date'}, inplace=True)
# Or overwrite all column names directly with a list
df.columns = ['category', 'price', 'cost']
#Convert to string type
df['sales_date'].astype(str)
# Convert True/False to 1/0
(df['sales']>2000).astype(int)
code_dict = {
'A': 11,
'B': 22,
'C': 33,
'D': 44
}
df['name'].replace(code_dict)  # unmatched values are kept as-is
df['name'].map(code_dict)      # unmatched values become NaN
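A tiny illustration of the difference, using a made-up Series whose last value ('Z') is absent from code_dict:
s = pd.Series(['A', 'B', 'Z'])
s.replace(code_dict)  # 11, 22, 'Z' (the unmatched value is kept)
s.map(code_dict)      # 11.0, 22.0, NaN (the unmatched value becomes NaN)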
# Replace values matching a condition with loc (relabel the "60s" bucket as "60s and over")
df.loc[df['age10']=='60s', 'age10'] = '60s and over'
(df['sales'] != 0).astype(int)  # flag 1 if sales is nonzero, else 0
# Round to the nearest integer (note that np.round rounds .5 to the nearest even number)
df['price'].apply(lambda x: np.round(x))
# Multiply by 1.1 and truncate the decimal part
df['price'].apply(lambda x: np.floor(x * 1.1))
# Common logarithm (base 10)
df['sales'].apply(lambda x: math.log10(x))
# Difference in months between two date columns
df[['end_date', 'start_date']].apply(
    lambda x: relativedelta(x['end_date'], x['start_date']).years * 12
              + relativedelta(x['end_date'], x['start_date']).months, axis=1)
# If price is missing, use the rounded median; otherwise keep the value as-is
df[['price', 'median_price']].apply(
    lambda x: np.round(x['median_price']) if np.isnan(x['price']) else x['price'], axis=1)
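I believe the same result can be obtained without the row-wise apply by using fillna, which should also be faster (a sketch under that assumption):
df['price'].fillna(np.round(df['median_price']))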
# Standardization (mean 0, standard deviation 1)
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(customer[['sales']])
customer['sales_ss'] = scaler.transform(customer[['sales']])
# When writing in one line
df['sales_ss'] = preprocessing.scale(df['sales'])
# Min-max normalization (scale to the 0-1 range)
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaler.fit(customer[['sales']])
customer['sales_mm'] = scaler.transform(customer[['sales']])
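fit and transform can also be combined into one call with fit_transform:
customer['sales_mm'] = preprocessing.MinMaxScaler().fit_transform(customer[['sales']])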
pd.get_dummies(df, columns=['cd'])
# One-hot encoding. If columns is specified, only those columns are encoded, even when they are not object dtype.
# Extract the first 3 characters
df['name'].str[0:3]
#String concatenation
df['gender'].str.cat(df['age10'].astype(str))  # concatenate the gender and age10 strings
# Row filtering with the query method. Multiple conditions can be written concisely.
df[['sales_date', 'id', 'cd', 'quantity', 'amount']]\
.query('id == "XXXX" & (amount >= 1000 | quantity >=5)')
# (Reference) Without the query method, complex conditions become verbose
target = (df['id']=="XXXX") & ((df['amount']>=1000) | (df['quantity']>=5))
df[target][['sales_date', 'id', 'cd', 'quantity', 'amount']]
# "SASS"Extract lines starting with
df['store'].str.startswith('SASS')
#Extract lines ending in 1
df['id'].str.endswith('1')
#Extract lines including Sapporo
df['address'].str.contains('Sapporo')
# Matching with regular expressions
df['cd'].str.contains('^[A-D]', regex=True)  # starts with one of A-D
df['cd'].str.contains('[1-9]$', regex=True)  # ends with one of 1-9
df['cd'].str.contains('^[A-D].*[1-9]$', regex=True)  # starts with one of A-D and ends with one of 1-9
df['tel'].str.contains('^[0-9]{3}-[0-9]{3}-[0-9]{4}', regex=True)  # phone number is 3 digits-3 digits-4 digits
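Each expression above returns a boolean mask, not the rows themselves; to extract the matching rows, index the DataFrame with the mask (na=False is my addition to guard against missing values):
df[df['cd'].str.contains('^[A-D]', regex=True, na=False)]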
# Top 10 rows sorted by the sales column in descending order
df.sort_values('sales', ascending=False).head(10)  # ascending=True sorts in ascending order
#Sort based on multiple columns. You can also specify descending or ascending order for each column.
df.sort_values(['sales', 'id'], ascending=[False, True])
# Rank sales in descending order (ascending=False). With method='min', tied values receive the same rank.
df['sales'].rank(method='min', ascending=False)
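As a usage sketch, the rank can be attached as a new column and used for sorting (the sales_rank column name is my own):
df['sales_rank'] = df['sales'].rank(method='min', ascending=False)
df.sort_values('sales_rank').head(10)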
# Choose an aggregate function for each column to be aggregated
df.groupby('id', as_index=False).agg({'amount':'sum', 'quantity':'sum'})
#Apply multiple aggregates to one column
df.groupby('id', as_index=False).agg({'ymd':['max', 'min']})
# An arbitrary function can be specified (here pd.Series.mode)
df.groupby('id', as_index=False).agg({'cd': pd.Series.mode})
# A lambda (anonymous function) can also be specified
df.groupby('id', as_index=False).agg({'amount': lambda x: np.var(x)})
#It is also possible to write without using the agg function
df.groupby('id', as_index=False).max()[['id', 'ymd']]
df.groupby('id', as_index=False).sum()[['id', 'amount', 'quantity']]
# (Reference) GroupBy has no mode() method, so the mode must be aggregated with agg as above
df.groupby('id', as_index=False).mode()  # raises an error
pd.pivot_table(df, index='age10', columns='gender', values='amount', aggfunc='sum')
# (index: row labels, columns: column labels, values: values to aggregate, aggfunc: aggregation method)
pd.pivot_table(sales, index='id', columns='year', values='amount', aggfunc='sum', margins=True)
# margins=True adds row and column totals and subtotals
# Percentiles (minimum, quartiles, maximum)
df.quantile([0, 0.25, 0.5, 0.75, 1.0])[['amount']]
#Character string → datetime type
pd.to_datetime(df['str_date'])
#Epoch seconds → datetime type
pd.to_datetime(df['epoch'], unit='s')
#datetime type → epoch seconds
df['date'].astype(np.int64) // 10**9
# datetime type → string. %Y: 4-digit year, %m: 2-digit month, %d: 2-digit day.
# Note that case matters: %M, for example, means minutes.
df['date'].dt.strftime('%Y%m%d')
# Extract the year (t is a datetime Series). Use .dt.month for the month and .dt.day for the day.
t.dt.year
# Use strftime to get a zero-padded 2-digit string
t.dt.strftime('%d')
#To get the difference in days, simply subtract the datetime type
df['end_date'] - df['start_date']
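The subtraction yields a timedelta column; .dt.days converts it to an integer day count:
(df['end_date'] - df['start_date']).dt.days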
# Use relativedelta to get the difference in months (x0 and x1 are datetime values)
relativedelta(x0, x1).years * 12 + relativedelta(x0, x1).months
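A quick sanity check with concrete dates (the values are my own illustration):
d = relativedelta(pd.Timestamp('2020-03-15'), pd.Timestamp('2019-01-10'))
d.years * 12 + d.months  # -> 14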
# Get the day of the week as a number (days elapsed since Monday) with weekday()
df['Days elapsed since Monday'] = df['ymd'].apply(lambda x: x.weekday())
# Get the date of the Monday of the same week
df['Monday of the week'] = df['ymd'].apply(lambda x: x - relativedelta(days=x.weekday()))
# Difference from the previous row
df['sales'].diff(1)
# Rate of change from the previous row
df['sales'].pct_change(1)
# Value from the previous row
df['sales'].shift(1)
# Value from two rows back
df['sales'].shift(2)
# Randomly sample 10% of the data
df.sample(frac=0.1, random_state=5)
# Stratified sampling of 10% of the data, preserving the gender proportions
_, test = train_test_split(df, test_size=0.1, stratify=df['gender'], random_state=5)
# Split into training and test data at an 8:2 ratio
df_train, df_test = train_test_split(df, test_size=0.2, random_state=5)
# Undersample to a 1:1 class ratio
r = RandomUnderSampler(random_state=5)
df_sample, _ = r.fit_resample(df, df['flag'])  # fit_sample in older versions of imbalanced-learn
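To confirm the resulting 1:1 balance (a small sketch, assuming flag is the label column):
df_sample['flag'].value_counts()  # both classes should now have equal counts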
In my studies I referred to the following. Thank you very much.
- Data Science 100 Knocks (Structured Data Processing)