Module minder_utils.feature_engineering.util
Expand source code
import pandas as pd
import datetime as dt
import pandas as pd
from dateutil.relativedelta import relativedelta
def compute_week_number(df):
    '''
    Encode every timestamp as the integer ``(ISO year - 2000) * 100 + ISO week``.
    For example, a date falling in ISO week 5 of ISO year 2021 becomes ``2105``.

    Arguments
    ---------
    df: pandas Series or iterable
        Datetimes, or values that ``pd.to_datetime`` can parse. Values are
        parsed with ``utc=True`` before the ISO calendar fields are read.

    Returns
    --------
    week_number: pandas Series
        The encoded week index, one entry per input value.
    '''
    # ``infer_datetime_format`` is deprecated in pandas 2.x, so it is no longer passed.
    timestamps = pd.to_datetime(df, utc=True)
    # The ``.dt`` accessor only exists on Series; wrap lists / arrays / indexes
    # so they do not fail with an AttributeError.
    if not isinstance(timestamps, pd.Series):
        timestamps = pd.Series(timestamps)
    # Compute the ISO calendar once and reuse both of its columns.
    iso = timestamps.dt.isocalendar()
    return iso.week + (iso.year - 2000) * 100
def datetime_to_time(datetimes):
    '''
    Convert datetimes to times of day by moving every value onto the
    same date. This can be useful when we want to plot histograms of
    average events on a daily basis.

    Arguments
    ---------
    datetimes: iterable
        An iterable (list, array or series) of datetime objects or
        datetime strings. Each element must be understandable by
        ``pd.Timestamp``.

    Returns
    --------
    times: DatetimeIndex
        The times of day, including fractional seconds. They all have
        the date value 1900-01-01 and carry no timezone.
    '''
    anchor_date = dt.date(1900, 1, 1)
    # ``pd.Timestamp`` accepts strings as well as datetime objects, so the
    # documented string inputs work too.
    times = [pd.Timestamp(d).time() for d in datetimes]
    # Combine directly instead of round-tripping through a "%H:%M:%S" string
    # parse, which raised ValueError for any value with fractional seconds.
    return pd.DatetimeIndex([dt.datetime.combine(anchor_date, t) for t in times])
def datetime_to_day(datetimes):
    '''
    Convert datetimes to just their calendar day, dropping the time of
    day. This can be useful when we want to plot histograms of average
    events on a daily basis.

    Arguments
    ---------
    datetimes: iterable
        An iterable (list, array or series) of datetime objects or
        datetime strings. Each element must be understandable by
        ``pd.Timestamp``.

    Returns
    --------
    days: DatetimeIndex
        The days, one entry per input value, each at midnight.
    '''
    # ``pd.Timestamp`` accepts strings as well as datetime objects, so the
    # documented string inputs no longer raise a TypeError.
    days = [pd.Timestamp(d).date() for d in datetimes]
    return pd.to_datetime(days, format="%Y-%m-%d")
def week_to_date(df: pd.Series, day_of_week: int = 1) -> pd.Series:
    '''
    Calculate the date according to the week index. The week index is
    expected in the encoding ``(ISO year - 2000) * 100 + ISO week``, i.e.
    the one produced by ```compute_week_number```.

    Args:
        df: pandas Series containing the week index, e.g. fe.activity.week
        day_of_week: int, the index of the day of the week, Monday = 1 ...
            Saturday = 6; Sunday may be given as 0 or 7.

    Returns: pandas Series containing the dates. Missing week indices
        (NaN / NA) are returned as NaT.

    Raises:
        ValueError: if ``day_of_week`` or a week number is out of range.
    '''
    # Callers historically used strptime's ``%w`` numbering (0 = Sunday);
    # the ISO ``%u`` directive used below calls Sunday 7.
    iso_day = 7 if day_of_week == 0 else day_of_week

    def cal_weeks(week_idx):
        if pd.isna(week_idx):
            return pd.NaT
        years_since_2000, week = divmod(int(week_idx), 100)
        iso_text = '{:04d}-{:02d}-{}'.format(2000 + years_since_2000, week, iso_day)
        # The index holds ISO year/week, so it must be decoded with the ISO
        # directives; ``%Y-%W-%w`` uses a different week convention and is
        # off by one week in years such as 2020.
        return pd.Timestamp(dt.datetime.strptime(iso_text, '%G-%V-%u'))

    return df.apply(cal_weeks)
def frequencies_tp(input_df, tp=3, return_locations=True):
    '''
    Count how often each location occurs within consecutive windows of
    ``tp`` hours.

    Arguments
    ---------
    - input_df: pandas dataframe:
        The input. This must contain the columns ```'time'``` and ```'location'```.
    - tp: int:
        The number of hours in each window the data is grouped by.
    - return_locations: bool:
        Whether the list of individual locations is returned along
        with the dataframe.

    Returns
    ----------
    - out: pandas dataframe:
        One row per window, with a ```'time'``` column and one column per
        location holding the one hot coded ```'location'``` values summed
        over the window.
    - locations: list:
        The one hot column names; only returned when ``return_locations``
        is true.
    '''
    dummies = pd.get_dummies(input_df['location'])
    location_names = dummies.columns

    # Concatenating yields a new frame, so the caller's dataframe is not modified.
    combined = pd.concat([input_df, dummies], axis=1)
    combined['time'] = pd.to_datetime(combined['time'])

    window = pd.Grouper(
        key='time',
        freq='{}h'.format(tp),
        origin='start_day',
        dropna=False,
    )
    window_totals = combined.groupby([window])[location_names].sum()
    out = window_totals.reset_index().fillna(0)

    if not return_locations:
        return out
    return out, list(location_names)
Functions
def compute_week_number(df)
-
Expand source code
def compute_week_number(df): df = pd.to_datetime(df, utc=True, infer_datetime_format=True) return df.dt.isocalendar().week + (df.dt.isocalendar().year - 2000) * 100
def datetime_to_day(datetimes)
-
This function simply converts datetime strings to just days. This can be useful when we want to plot histograms of average events on a daily basis.
Arguments
datetimes
:iterable
This is an iterable list, array or series of strings of datetimes. This must be understandable by dt.datetime.date().
Returns
days
:DatetimeIndex
- This is a series containing the new days.
Expand source code
def datetime_to_day(datetimes): ''' This function simply conerts date time strings to just days. This can be useful when we want to plot histograms of average events on a daily basis. Arguments --------- datetimes: iterable This is an iterable list, array or series of strings of datetimes. This must be understandable by dt.datetime.date(). Returns -------- days: DatetimeIndex This is a series contaning the new days. ''' days = [dt.datetime.date(d) for d in datetimes] days = pd.to_datetime(days, format="%Y-%m-%d") return days
def datetime_to_time(datetimes)
-
This function simply converts datetime strings to just times. This can be useful when we want to plot histograms of average events on a daily basis.
Arguments
datetimes
:iterable
This is an iterable list, array or series of strings of datetimes. This must be understandable by dt.datetime.time().
Returns
times
:DatetimeIndex
- This is a series containing the new times. They will all have a date value of 1900-01-01.
Expand source code
def datetime_to_time(datetimes): ''' This function simply conerts date time strings to just times. This can be useful when we want to plot histograms of average events on a daily basis. Arguments --------- datetimes: iterable This is an iterable list, array or series of strings of datetimes. This must be understandable by dt.datetime.time(). Returns -------- times: DatetimeIndex This is a series contaning the new times. They will all have date value of 1900-01-01. ''' times = [dt.datetime.time(d) for d in datetimes] times = pd.to_datetime(times, format="%H:%M:%S") return times
def frequencies_tp(input_df, tp=3, return_locations=True)
-
Arguments
-
input_df: pandas dataframe: This input. This must contain the columns
'time'
and 'location'
. -
tp: int: This is the number of hours the data will be grouped by.
-
return_locations: bool: This dictates whether the individual locations will be returned as a list along with the dataframe.
Returns
- out: pandas dataframe:
This dataframe contains the one hot coded
'locations'
column, summed over tp
.
Expand source code
def frequencies_tp(input_df, tp=3, return_locations=True): ''' Arguments --------- - input_df: pandas dataframe: This input. This must contain the columns ```'time'``` and ```'location'```. - tp: int: This is the number of hours the data will be grouped by. - return_locations: bool: This dictates whether the individual locations will be returned as a list along with the dataframe. Returns ---------- - out: pandas dataframe: This dataframe contains the one hot coded ```'locations'``` column, summed over ```tp```. ''' one_hot = pd.get_dummies(input_df['location']) out = pd.concat([input_df, one_hot], axis=1) out['time'] = pd.to_datetime(out['time']) out = out.groupby([pd.Grouper(key='time', freq='{}h'.format(tp), origin='start_day', dropna=False)])[one_hot.columns].sum().reset_index().fillna(0) if return_locations: return out, list(one_hot.columns) else: return out
-
def week_to_date(df: pandas.core.frame.DataFrame, day_of_week: int = 1)
-
Calculate the date according to the week index. Note that the week index is calculated according to the function
compute_week_number
Args
df
- Dataframe, a panda series contains the week index, e.g. fe.activity.week
day_of_week
- int, the index of the day of the week, e.g. Monday = 1
Returns: Dataframe, a panda series contains the dates
Expand source code
def week_to_date(df: pd.DataFrame, day_of_week: int = 1): ''' Calculate the date according to the week index. Note the week index is calcualted according to function ```compute_week_number``` Args: df: Dataframe, a panda series contains the week index, e.g. fe.activity.week day_of_week: int, the index of the day of the week, e.g. Monday = 1 Returns: Dataframe, a panda series contains the dates ''' def cal_weeks(week_idx): year = 2000 + int(week_idx / 100) week = week_idx - int(week_idx / 100) * 100 return pd.to_datetime('{}-{}-{}'.format(year, week, day_of_week), format='%Y-%W-%w') return df.apply(cal_weeks)