Mémo de génération de caractéristiques (features) de date

Notes de code pour générer des caractéristiques (features) et des variables cibles à partir de données de type date, sans utiliser featuretools.

datefeaturetool.py


import datetime
import numpy as np
import pandas as pd

class DeltaDate():
    """
    Generate elapsed-time features from date columns relative to a cut-off date.

    The cut-off date is either a calendar date (``datetime.date``,
    ``datetime.datetime`` or ``pandas.Timestamp``) or a number encoded as
    ``yyyymmdd`` (e.g. ``20200202``).  The ``dates`` passed to the methods must
    use the matching representation: datetime64 values for a calendar cut-off,
    ``yyyymmdd`` numbers for a numeric cut-off.
    """

    # Accepted spellings for each frequency.
    _DAY_ALIASES = ('d', 'D', 'day', 'Day')
    _MONTH_ALIASES = ('m', 'M', 'month', 'Month')
    _YEAR_ALIASES = ('y', 'Y', 'year', 'Year')
    # Scalar types accepted for a yyyymmdd-encoded cut-off date.
    _NUMERIC_TYPES = (int, np.integer, float, np.floating)

    def __init__(self, cutoff_date):
        """
        cutoff_date: datetime.date(2020, 2, 2) or pandas.Timestamp('2020-02-02')
                     int, numpy.int64, float, numpy.float64 (yyyymmdd)
        """
        # pandas.Timestamp subclasses datetime.date, so it has to be tested
        # first to avoid a pointless conversion (and its message).
        if isinstance(cutoff_date, pd.Timestamp):
            self.cutoff_date = cutoff_date
        elif isinstance(cutoff_date, datetime.date):
            self.cutoff_date = pd.to_datetime(cutoff_date)
            print('cutoff_date converted from datetime.date type to pandas.Timestamp type.')
        else:
            self.cutoff_date = cutoff_date

    @classmethod
    def _normalize_freq(cls, freq):
        """Map a frequency alias to 'day', 'month' or 'year'; raise ValueError otherwise."""
        if freq in cls._DAY_ALIASES:
            return 'day'
        if freq in cls._MONTH_ALIASES:
            return 'month'
        if freq in cls._YEAR_ALIASES:
            return 'year'
        raise ValueError("freq must be 'day', 'month' or 'year'")

    @staticmethod
    def _label(name):
        """Return a Series/column name as text ('' when the name is None)."""
        return '' if name is None else str(name)

    @staticmethod
    def _completed_periods(unit, end_parts, start_year, start_month, start_day):
        """Count whole months or years between the start parts and the end parts."""
        end_year, end_month, end_day = end_parts
        if unit == 'month':
            delta = (end_year - start_year) * 12 + (end_month - start_month)
            # The last month is incomplete only when the day-of-month has not
            # been reached yet; the month comparison is already part of delta.
            incomplete = end_day < start_day
        else:
            delta = end_year - start_year
            # The last year is incomplete while the anniversary is still ahead.
            incomplete = (end_month < start_month) | (
                (end_month == start_month) & (end_day < start_day))
        return delta.mask(incomplete, delta - 1)

    @staticmethod
    def _parse_yyyymmdd(dates):
        """Convert a numeric yyyymmdd Series to datetime64 values (missing -> NaT)."""
        present = dates.notna()
        # Go through int64 so float input renders as '20200228', not '20200228.0'.
        as_text = dates.where(present, 0).astype('int64').astype(str)
        return pd.to_datetime(as_text.where(present), format='%Y%m%d')

    def delta_date_1d(self, dates, freq='d', past_or_future='past'):
        """
        Elapsed days, months or years between each date and the cut-off date.

        dates: pandas.Series
            dtype: datetime64[ns] (calendar cut-off)
                   int64, float64 encoded as yyyymmdd (numeric cut-off)
        freq: 'day', 'month' or 'year' (also 'd'/'D', 'm'/'M', 'y'/'Y')
        past_or_future: 'past' -> cutoff - date; 'f' or 'future' -> date - cutoff
                        (the sign is flipped).  Any other value is treated as 'past'.

        return pandas.Series named 'elapsed_<dates.name>'

        raises ValueError if freq is not recognised,
               TypeError if the cut-off date has an unsupported type.
        """
        unit = self._normalize_freq(freq)
        cutoff = self.cutoff_date

        if isinstance(cutoff, pd.Timestamp):
            if unit == 'day':
                delta = (cutoff - dates).dt.days
            else:
                delta = self._completed_periods(
                    unit, (cutoff.year, cutoff.month, cutoff.day),
                    dates.dt.year, dates.dt.month, dates.dt.day)
        elif isinstance(cutoff, self._NUMERIC_TYPES) and not isinstance(cutoff, bool):
            # Split the yyyymmdd number into plain Python ints.
            end_year, month_day = divmod(int(cutoff), 10000)
            end_month, end_day = divmod(month_day, 100)
            if unit == 'day':
                end = pd.Timestamp(year=end_year, month=end_month, day=end_day)
                delta = (end - self._parse_yyyymmdd(dates)).dt.days
            else:
                delta = self._completed_periods(
                    unit, (end_year, end_month, end_day),
                    dates // 10000, dates % 10000 // 100, dates % 100)
        else:
            raise TypeError(
                'cutoff_date must be datetime.date, pandas.Timestamp '
                'or a yyyymmdd number, got ' + type(cutoff).__name__)

        if past_or_future in ['f', 'future']:
            delta = -delta
            print('delta for the future.')

        delta.name = 'elapsed_' + self._label(dates.name)

        return delta

    def delta_date(self, dates, freq='d', past_or_future='past'):
        """
        Apply delta_date_1d to a Series, or to every column of a DataFrame.

        dates: pandas.Series or pandas.DataFrame
            dtype: datetime64[ns]
                   int64, float64
        freq: 'day', 'month' or 'year'
        past_or_future: 'past' or 'future'

        return pandas.Series for a Series input, pandas.DataFrame (one
               'elapsed_<column>' column per input column) for a DataFrame input

        raises TypeError if dates is neither a Series nor a DataFrame.
        """
        if isinstance(dates, pd.Series):
            return self.delta_date_1d(dates, freq, past_or_future)
        if isinstance(dates, pd.DataFrame):
            per_column = [self.delta_date_1d(dates[col], freq, past_or_future)
                          for col in dates]
            if not per_column:
                # pd.concat refuses an empty list; keep the row index instead.
                return pd.DataFrame(index=dates.index)
            return pd.concat(per_column, axis=1)
        raise TypeError('dates must be pandas.Series or pandas.DataFrame.')

    def _within_flags(self, delta, dates, limit):
        """Turn elapsed values into 1 (<= limit), 0 (> limit) or NaN (negative)."""
        flags = delta.mask(delta > limit, 0)
        flags = flags.mask(delta <= limit, 1)
        flags = flags.mask(delta < 0)
        if isinstance(flags, pd.DataFrame):
            flags.columns = ['within' + str(limit) + self._label(c)
                             for c in dates.columns]
        else:
            flags.name = 'within' + str(limit) + self._label(dates.name)
        return flags

    def within_date(self, dates, within, freq='d', past_or_future='past'):
        """
        Flag whether each date lies within n days / months / years of the cut-off.

        dates: pandas.Series or pandas.DataFrame
            dtype: datetime64[ns]
                   int64, float64
        within: int, or list/tuple of int (within n days, n months, n years)
        freq: 'day', 'month' or 'year'
        past_or_future: 'past' or 'future'

        return pandas.Series or pandas.DataFrame (0: over, 1: within,
               np.nan: negative elapsed value).  Columns are named
               'within<n><original name>'; a list of thresholds yields one
               column (or one group of columns) per threshold.
        """
        # The elapsed values do not depend on the threshold: compute them once.
        delta = self.delta_date(dates, freq, past_or_future)
        if isinstance(within, (list, tuple)):
            return pd.concat(
                [self._within_flags(delta, dates, n) for n in within], axis=1)
        return self._within_flags(delta, dates, within)

if __name__ == '__main__':
    # Demo input: two columns of date strings, parsed into datetime64 values.
    df = pd.DataFrame({'date1': ['2017-8-1', '2020-2-2'],
                       'date2': ['2018-12-15', '2019-3-31']})
    for c in df.columns:
        df[c] = pd.to_datetime(df[c], format='%Y-%m-%d')

    # The cut-off is given as a datetime.date.
    deltadate = DeltaDate(cutoff_date=datetime.date(2020, 2, 28))
    # Elapsed days for every column.
    result = deltadate.delta_date(df, freq='d')
    # Within-12-months and within-24-months flags for every column.
    within = deltadate.within_date(df, within=[12, 24], freq='m')

Recommended Posts

Mémo de génération de fonction de date
Génération de fonctionnalités avec pandas group par
Analyse des données avant la génération de caractéristiques pour le Titanic de Kaggle
Génération automatique de PowerPoint avec python-pptx (mémo personnel)