Notes de code : générer des caractéristiques (features) et des variables cibles à partir de données de type date, sans utiliser featuretools
datefeaturetool.py
import datetime
import numpy as np
import pandas as pd
class DeltaDate():
    """
    Date feature generation relative to a fixed cutoff date.

    The cutoff is either a calendar date (stored as ``pandas.Timestamp``) or
    a number encoding a date as ``YYYYMMDD`` (e.g. ``20200202``).  The
    ``dates`` passed to the methods must use the matching representation:
    ``datetime64[ns]`` values for a Timestamp cutoff, ``YYYYMMDD`` numbers
    for a numeric cutoff.
    """

    # Accepted spellings for each ``freq`` unit.
    _DAY_ALIASES = ('d', 'D', 'day', 'Day')
    _MONTH_ALIASES = ('m', 'M', 'month', 'Month')
    _YEAR_ALIASES = ('y', 'Y', 'year', 'Year')
    # Scalar types interpreted as a YYYYMMDD-encoded cutoff.
    _NUMERIC_TYPES = (int, float, np.integer, np.floating)

    def __init__(self, cutoff_date):
        """
        cutoff_date: datetime.date(2020, 2, 2), datetime.datetime,
                     pandas.Timestamp('2020-02-02'),
                     or a YYYYMMDD number (int, numpy.int64, float,
                     numpy.float64)
        """
        # pandas.Timestamp is itself a datetime.date subclass, so it has to
        # be excluded first; plain date/datetime objects are converted.
        if (isinstance(cutoff_date, datetime.date)
                and not isinstance(cutoff_date, pd.Timestamp)):
            self.cutoff_date = pd.to_datetime(cutoff_date)
            print('cutoff_date converted from datetime.date type to pandas.Timestamp type.')
        else:
            self.cutoff_date = cutoff_date

    @classmethod
    def _freq_unit(cls, freq):
        """Map a ``freq`` alias to 'day', 'month' or 'year' (ValueError otherwise)."""
        if freq in cls._DAY_ALIASES:
            return 'day'
        if freq in cls._MONTH_ALIASES:
            return 'month'
        if freq in cls._YEAR_ALIASES:
            return 'year'
        raise ValueError("freq must be 'day', 'month' or 'year'")

    @staticmethod
    def _calendar_delta(unit, start, end):
        """Count complete months or years from ``start`` to ``end``.

        start: (year, month, day) as pandas.Series
        end:   (year, month, day) as scalars
        """
        start_year, start_month, start_day = start
        end_year, end_month, end_day = end
        if unit == 'month':
            delta = (end_year - start_year) * 12 + (end_month - start_month)
            # The last month is incomplete only when the day-of-month has not
            # been reached yet; the month numbers are already in ``delta``.
            incomplete = end_day < start_day
        else:
            delta = end_year - start_year
            # The last year is incomplete until the anniversary is reached.
            incomplete = ((end_month < start_month)
                          | ((end_month == start_month) & (end_day < start_day)))
        return delta.mask(incomplete, delta - 1)

    def delta_date_1d(self, dates, freq='d', past_or_future='past'):
        """
        Elapsed time from each date to the cutoff date, for one Series.

        dates: pandas.Series
               dtype datetime64[ns] when the cutoff is a Timestamp,
               YYYYMMDD numbers when the cutoff is numeric (freq 'day'
               parses str(value) with '%Y%m%d', so integers are expected
               there)
        freq: 'day', 'month' or 'year' (also 'd'/'D'/'Day', 'm'/..., 'y'/...)
              'month' and 'year' count complete months / years.
        past_or_future: 'past' or 'future'; with 'f' or 'future' the result
                        is negated, so dates after the cutoff become positive
        return pandas.Series named 'elapsed_<dates.name>' (unnamed when
               ``dates`` is unnamed)
        raises ValueError for an unknown freq, TypeError for an unsupported
               cutoff_date type
        """
        unit = self._freq_unit(freq)
        cutoff = self.cutoff_date

        if isinstance(cutoff, pd.Timestamp):
            if unit == 'day':
                delta = (cutoff - dates).dt.days
            else:
                delta = self._calendar_delta(
                    unit,
                    (dates.dt.year, dates.dt.month, dates.dt.day),
                    (cutoff.year, cutoff.month, cutoff.day))
        elif isinstance(cutoff, self._NUMERIC_TYPES):
            # Work with a plain int so the YYYYMMDD parts are exact integers
            # (pandas.Timestamp does not take float components).
            cutoff = int(cutoff)
            end = (cutoff // 10000, cutoff // 100 % 100, cutoff % 100)
            if unit == 'day':
                cod = pd.Timestamp(year=end[0], month=end[1], day=end[2])
                parsed = pd.to_datetime(dates.astype(str), format='%Y%m%d')
                delta = (cod - parsed).dt.days
            else:
                start = (dates // 10000, dates // 100 % 100, dates % 100)
                delta = self._calendar_delta(unit, start, end)
        else:
            raise TypeError(
                'cutoff_date must be a date, pandas.Timestamp or YYYYMMDD number.')

        if past_or_future in ['f', 'future']:
            delta = -delta
            print('delta for the future.')

        # Name the result after the input; an unnamed input stays unnamed.
        source_name = dates.name
        delta.name = None if source_name is None else 'elapsed_' + str(source_name)
        return delta

    def delta_date(self, dates, freq='d', past_or_future='past'):
        """
        Elapsed time to the cutoff date for a Series or every DataFrame column.

        dates: pandas.Series or pandas.DataFrame
               (same dtype rules as delta_date_1d)
        freq: 'day', 'month' or 'year'
        past_or_future: 'past' or 'future'
        return pandas.Series for a Series input, pandas.DataFrame with one
               'elapsed_<column>' column per input column otherwise
        raises TypeError when ``dates`` is neither a Series nor a DataFrame
        """
        if isinstance(dates, pd.Series):
            return self.delta_date_1d(dates, freq, past_or_future)
        if isinstance(dates, pd.DataFrame):
            columns = [self.delta_date_1d(dates[col], freq, past_or_future)
                       for col in dates]
            if not columns:
                # pandas.concat refuses an empty list; keep the row index.
                return pd.DataFrame(index=dates.index)
            return pd.concat(columns, axis=1)
        raise TypeError('dates must be pandas.Series or pandas.DataFrame.')

    @staticmethod
    def _within_flags(delta, dates, n):
        """Turn elapsed values into 0/1/NaN flags for threshold ``n`` and name them."""
        # 0 when over the threshold, 1 when within it, NaN when negative.
        flags = delta.mask(delta > n, 0).mask(delta <= n, 1).mask(delta < 0)
        if isinstance(flags, pd.DataFrame):
            flags.columns = ['within' + str(n) + str(col) for col in dates.columns]
        elif dates.name is None:
            flags.name = 'within' + str(n)
        else:
            flags.name = 'within' + str(n) + str(dates.name)
        return flags

    def within_date(self, dates, within, freq='d', past_or_future='past'):
        """
        Flag whether each date lies within ``within`` units of the cutoff.

        dates: pandas.Series or pandas.DataFrame
               (same dtype rules as delta_date_1d)
        within: int, or list/tuple of int (within n days, n months, n years)
        freq: 'day', 'month' or 'year'
        past_or_future: 'past' or 'future'
        return pandas.Series or pandas.DataFrame
               (0: over, 1: within, np.nan: negative elapsed value);
               names are 'within<n><original name>', one set per threshold
        raises ValueError when ``within`` is an empty list/tuple
        """
        # The elapsed values do not depend on the threshold: compute once.
        delta = self.delta_date(dates, freq, past_or_future)
        if isinstance(within, (list, tuple)):
            if not within:
                raise ValueError('within must not be empty.')
            return pd.concat(
                [self._within_flags(delta, dates, n) for n in within], axis=1)
        return self._within_flags(delta, dates, within)
if __name__ == '__main__':
    # Small demo: two date columns measured against a 2020-02-28 cutoff.
    df = pd.DataFrame({'date1': ['2017-8-1', '2020-2-2'],
                       'date2': ['2018-12-15', '2019-3-31']})
    # Parse every column from text into datetime64 values.
    for c in df.columns:
        df[c] = pd.to_datetime(df[c], format='%Y-%m-%d')

    cutoff = datetime.date(2020, 2, 28)
    deltadate = DeltaDate(cutoff)
    # Elapsed days per column, then 12- and 24-month "within" flags.
    result = deltadate.delta_date(df, freq='d')
    within = deltadate.within_date(df, [12, 24], freq='m')