Module minder_utils.feature_engineering.TimeFunctions
Expand source code
import numpy as np
import pandas as pd
from .DensityFunctions import BaseDensityCalc
def raw_delta_calc(times):
    '''
    Given an array of times, this function calculates the deltas between them.

    Arguments
    ---------
    - times: array:
        This is an array of times that will be used to calculate the deltas.

    Returns
    ---------
    - out: array:
        This is an array of the deltas between consecutive times, in seconds.
    '''
    # consecutive differences; datetime64[ns] differences are scaled from nanoseconds to seconds
    out = (times[1:] - times[:-1]) * 1e-9
    out = out.astype(float)
    return out
def single_location_delta(input_df, single_location,
                          columns={'time': 'time', 'location': 'location'}, recall_value=5,
                          return_as_list=False):
    '''
    This function takes the ```input_df``` and calculates the raw time delta between the ```single_location```
    trigger time and the times of the ```recall_value``` sensor triggers immediately before it.

    This does not separate on subject. Please pass data from a single subject into this function.

    Arguments
    ---------
    - input_df: pandas dataframe:
        This is a dataframe that contains columns relating to the subject, time and location of sensor triggers.
    - single_location: string:
        This is the location value that you wish to calculate the delta to.
    - columns: dictionary:
        This is the dictionary with the column names in ```input_df``` for each of the values of data that we need
        in our calculations.
        This dictionary should be of the form:
        ```
        {'time': column containing the times of sensor triggers,
         'location': column containing the locations of the sensor triggers}
        ```
    - recall_value: integer:
        This is the number of sensor triggers immediately before each ```single_location``` trigger to include
        in the delta calculation.
    - return_as_list: bool:
        This option allows the user to return a list of the dates and data if ```True```. This is
        used internally by other functions.

    Returns
    ---------
    - out: dictionary:
        This has the Timestamps of the dates as keys (for example: Timestamp('2021-05-05 00:00:00')) and the
        arrays of deltas as values. The arrays of deltas are of shape ```(Nt, recall_value)``` where Nt is the
        number of visits to ```single_location``` on a given day. If there are no ```single_location``` visits
        found in the data, then an empty dictionary will be returned.
    '''
    time_column = columns['time']
    location_column = columns['location']

    # format the incoming data to ensure assumptions about structure are met
    input_df[time_column] = pd.to_datetime(input_df[time_column], utc=True)
    input_df = input_df.sort_values(time_column)

    # find the indices of the data that match with the location we want to find the delta to
    single_location_indices = np.where(input_df[location_column] == single_location)[0].reshape(-1, 1)

    # making sure that the recall value is not more than the number of sensor triggers before the
    # first single_location sensor trigger
    if len(single_location_indices) == 0:
        if return_as_list: return [], []
        else: return {}
    single_location_indices = single_location_indices[np.argmax(recall_value < single_location_indices):]

    # indices of the sensor triggers that we need in our calculations
    recall_indices = np.hstack([single_location_indices - i for i in range(recall_value + 1)])

    # the times of the sensor triggers
    recall_times = input_df[time_column].values[recall_indices]

    # the delta (in seconds) between each single_location trigger and the recall_value previous triggers
    recall_delta = (recall_times[:, 0, None] - recall_times[:, 1:]) * 1e-9

    # the times of the single_location triggers
    single_location_times = input_df[time_column].iloc[single_location_indices.reshape(-1, )]

    # dates of the single_location triggers
    single_location_dates = single_location_times.dt.date

    # out dictionary
    out = {}

    if return_as_list:
        date_list = []
        data_list = []
        for nd, date in enumerate(single_location_dates.unique()):
            date_list.append(date)
            data_to_add = recall_delta[single_location_dates.values == date].astype(float)
            data_list.append(data_to_add)
        return pd.to_datetime(date_list), data_list
    else:
        # creating the output dictionary
        for date in single_location_dates.unique():
            # saving the delta values for this date to the dictionary
            out[pd.to_datetime(date)] = recall_delta[single_location_dates.values == date].astype(float)
        return out
class TimeDeltaDensity(BaseDensityCalc):
    '''
    This class allows the user to calculate reverse percentiles on some data, given another
    dataset.
    '''

    def __init__(self, save_baseline_array=True, sample=False, sample_size=10000,
                 seed=None, verbose=True):
        BaseDensityCalc.__init__(self, save_baseline_array=save_baseline_array,
                                 sample=sample, sample_size=sample_size, seed=seed, verbose=verbose)
        return
def rp_single_location_delta(input_df, single_location, baseline_length_days=7, baseline_offset_days=0,
                             columns={'time': 'time', 'location': 'location'}, recall_value=5):
    '''
    This function takes the ```input_df``` and calculates the reverse percentage time delta between the ```single_location```
    trigger time and the times of the ```recall_value``` sensor triggers immediately before it. The baseline
    for the reverse percentage calculation is defined by ```baseline_length_days``` and ```baseline_offset_days```.

    For example:
    With ```baseline_length_days = 7``` and ```baseline_offset_days = 1```, the rp deltas on the day
    ```pd.Timestamp('2021-06-29')``` are calculated using the deltas from
    ```pd.Timestamp('2021-06-21 00:00:00')``` to ```pd.Timestamp('2021-06-28 00:00:00')```.

    This does not separate on subject. Please pass data from a single subject into this function.

    NOTE: The reverse percentage is calculated based on all of the deltas coming into a location!
    This means that the delta is agnostic to the "from" location.

    Arguments
    ---------
    - input_df: pandas dataframe:
        This is a dataframe that contains columns relating to the time and location of sensor triggers.
    - single_location: string:
        This is the location value that you wish to calculate the delta to.
    - baseline_length_days: integer:
        This is the length of the baseline in days that will be used. This value is used when finding
        the ```baseline_length_days``` complete days of ```single_location``` data to use as a baseline.
    - baseline_offset_days: integer:
        This is the offset to the baseline period. ```0``` corresponds to a time period ending the morning of the
        current date being calculated on.
    - columns: dictionary:
        This is the dictionary with the column names in ```input_df``` for each of the values of data that we need
        in our calculations.
        This dictionary should be of the form:
        ```
        {'time': column containing the times of sensor triggers,
         'location': column containing the locations of the sensor triggers}
        ```
    - recall_value: integer:
        This is the number of sensor triggers immediately before each ```single_location``` trigger to include
        in the delta calculation.

    Returns
    ---------
    - out: dictionary:
        This has the Timestamps of the dates as keys (for example: Timestamp('2021-05-05 00:00:00')) and the
        arrays of deltas as values. The arrays of deltas are of shape ```(Nt, recall_value)``` where Nt is the
        number of visits to ```single_location``` on a given day.
    '''
    # column names
    time_column = columns['time']
    location_column = columns['location']

    out = {}

    # format the incoming data to ensure assumptions about structure are met
    input_df[time_column] = pd.to_datetime(input_df[time_column], utc=True)
    input_df = input_df.sort_values(time_column)

    # getting the single location raw delta
    date_list, data_list = single_location_delta(input_df, single_location, columns, recall_value, return_as_list=True)

    # for each date
    for nd, date in enumerate(date_list):
        date = pd.to_datetime(date)
        # find the window of complete days whose deltas form the baseline for this date
        index_baseline_end = np.where(date_list <= date)[0][-1]
        index_baseline_end = index_baseline_end - baseline_offset_days
        index_baseline_start = index_baseline_end - baseline_length_days
        # not enough baseline data: mark every delta on this date with -1
        if index_baseline_start < 0:
            out[date] = -1 * np.ones_like(data_list[nd])
            continue
        baseline_delta = np.vstack([data_list[index] for index in range(index_baseline_start, index_baseline_end)])
        td = TimeDeltaDensity(save_baseline_array=True, sample=True, sample_size=10000,
                              seed=None, verbose=False)
        td.fit(baseline_delta)
        out[date] = td.transform(data_list[nd])

    return out
def rp_location_delta(data, columns={'time': 'start_date', 'location': 'location_name'}, baseline_length_days=7,
                      baseline_offset_days=0, all_loc_as_baseline=False):
    '''
    This function allows you to calculate the reverse percentage of the delta for each of the locations based on a baseline.
    This function allows you to specify whether to calculate the rp values based on the deltas to the same location or
    whether to calculate them using all locations.

    Arguments
    ---------
    - data: pandas dataframe:
        This is the dataframe containing the times and locations that will be used to calculate the reverse
        percentage deltas.
    - columns: dictionary:
        This is the dictionary with the column names in ```data``` for each of the values of data that we need
        in our calculations.
        This dictionary should be of the form:
        ```
        {'time': column containing the times of sensor triggers,
         'location': column containing the locations of the sensor triggers}
        ```
    - baseline_length_days: integer:
        This is the length of the baseline in days that will be used. This value is used when finding
        the ```baseline_length_days``` complete days of data to use as a baseline.
    - baseline_offset_days: integer:
        This is the offset to the baseline period. ```0``` corresponds to a time period ending the morning of the
        current date being calculated on.
    - all_loc_as_baseline: bool:
        This argument dictates whether all the locations are used as part of the calculation for the reverse
        percentage or if only the values from the ```to``` locations are used.

    Returns
    ---------
    - out: pandas dataframe:
        This is the outputted data frame, complete with rp values.
    '''
    time_col = columns['time']
    location_col = columns['location']

    data[time_col] = pd.to_datetime(data[time_col])
    data = data.sort_values(time_col)

    if all_loc_as_baseline:
        # deltas between consecutive triggers, regardless of location
        times = data[time_col].values
        raw_delta = raw_delta_calc(times)
        locations = data[location_col].values
        df_dict = {'from': locations[:-1], 'to': locations[1:], 'delta': raw_delta, time_col: times}
        out = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_dict.items()]))
        out['date'] = out[time_col].dt.date

        # group the deltas by date so that the baseline can be built from complete days
        baseline_df = out.groupby(by='date')['delta'].apply(list).reset_index()
        dates = baseline_df['date'].values
        deltas = baseline_df['delta'].values

        rp_col = []
        for nd in range(dates.shape[0]):
            date = dates[nd]
            this_delta = deltas[nd]
            index_baseline_end = np.where(dates <= date)[0][-1]
            index_baseline_end = index_baseline_end - baseline_offset_days
            index_baseline_start = index_baseline_end - baseline_length_days
            if index_baseline_start < 0:
                # not enough baseline data for this date
                rp_col.extend([np.nan] * len(this_delta))
            else:
                X_fit = np.hstack(deltas[index_baseline_start:index_baseline_end]).reshape(-1, 1)
                X_transform = np.asarray(this_delta).reshape(-1, 1)
                td = TimeDeltaDensity(sample=True, sample_size=10000, seed=nd, verbose=False)
                td.fit(X_fit)
                rp_col.extend(td.transform(X_transform).reshape(-1, ))

        out['rp'] = rp_col
        return out

    else:
        unique_locations = data[location_col].unique()
        data['date'] = pd.to_datetime(data[time_col].dt.date)
        rp_col = -1 * np.ones(data.shape[0])
        for location in unique_locations:
            # reverse percentage deltas into this location, one value per visit
            delta_dict = rp_single_location_delta(input_df=data,
                                                  single_location=location,
                                                  baseline_length_days=baseline_length_days,
                                                  baseline_offset_days=baseline_offset_days,
                                                  columns=columns,
                                                  recall_value=1)
            location_index = np.where(data[location_col] == location)[0]
            for date in delta_dict:
                deltas = delta_dict[pd.Timestamp(date)]
                index_add = location_index[np.where(data['date'].iloc[location_index] == pd.Timestamp(date))[0]]
                # This accounts for rp_single_location_delta not returning deltas for triggers
                # that do not have enough preceding sensor events.
                index_add = index_add[-deltas.shape[0]:]
                rp_col[index_add] = deltas.reshape(-1, )
        data['rp'] = rp_col
        df_dict = {'from': data[location_col].values[:-1], 'to': data[location_col].values[1:],
                   'rp': data['rp'].values[1:], time_col: data[time_col].values[1:]}
        out = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_dict.items()]))
        return out
def datetime_to_clock(times):
    '''
    This function converts each datetime in an array to a vector
    of size 2 which represents the time in a continuous way. The vector represents
    the coordinates on a circle corresponding to the time on a 24-hour analogue clock.

    Arguments
    ---------
    - times: array:
        This is an array of times to be converted. The shape of this
        array should be (N,) or (N,1).

    Returns
    ---------
    - out: array:
        This is an array containing the transformed times. This array
        will be of size (N,2). The vector represents the coordinates on a
        circle corresponding to the time on a 24-hour analogue clock.
    '''
    times = pd.to_datetime(times, utc=True)
    # seconds elapsed since midnight for each time
    total_seconds = times.hour * 3600 \
        + times.minute * 60 \
        + times.second \
        + 1e-6 * times.microsecond
    total_seconds = np.asarray(total_seconds)
    # map the fraction of the day onto the unit circle
    C = 24 * 3600
    x = (np.sin(2 * np.pi * total_seconds / C) + 1e-12).reshape(-1, 1)
    y = (np.cos(2 * np.pi * total_seconds / C) + 1e-12).reshape(-1, 1)
    out = np.hstack([x, y])
    return out
Functions
def datetime_to_clock(times)
-
This function converts each datetime in an array to a vector of size 2 which represents the time in a continuous way. The vector represents the coordinates on a circle corresponding to the time on a 24-hour analogue clock.
Arguments
- times: array: This is an array of times to be converted. The shape of this array should be (N,) or (N,1).
Returns
- out: array: This is an array containing the transformed times. This array will be of size (N,2). The vector represents the coordinates on a circle corresponding to the time on a 24-hour analogue clock.
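A minimal usage sketch with invented times (not from the library's own tests): midnight maps to the top of the clock face, 06:00 to a quarter turn and 18:00 to three quarters of a turn.
```python
import numpy as np
import pandas as pd
from minder_utils.feature_engineering.TimeFunctions import datetime_to_clock

# hypothetical times on a single day
times = pd.to_datetime(['2021-05-05 00:00:00', '2021-05-05 06:00:00', '2021-05-05 18:00:00'])

clock = datetime_to_clock(times)
print(np.round(clock, 3))
# [[ 0.  1.]    midnight -> (sin, cos) = (0, 1)
#  [ 1.  0.]    06:00    -> (1, 0)
#  [-1.  0.]]   18:00    -> (-1, 0)
```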
def raw_delta_calc(times)
-
Given an array of times, this function calculates the deltas between them.
Arguments
- times: array: This is an array of times that will be used to calculate the deltas.
Returns
- out: array: This is an array of the deltas between consecutive times, in seconds.
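A minimal sketch, assuming the times are supplied as a numpy datetime64[ns] array (as they are when the other functions in this module call raw_delta_calc on a datetime column's .values):
```python
import pandas as pd
from minder_utils.feature_engineering.TimeFunctions import raw_delta_calc

# hypothetical triggers one minute and then two minutes apart
times = pd.to_datetime([
    '2021-05-05 08:00:00', '2021-05-05 08:01:00', '2021-05-05 08:03:00'
]).values  # numpy datetime64[ns] array

print(raw_delta_calc(times))  # [ 60. 120.]  -- consecutive deltas in seconds
```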
def rp_location_delta(data, columns={'time': 'start_date', 'location': 'location_name'}, baseline_length_days=7, baseline_offset_days=0, all_loc_as_baseline=False)
-
This function allows you to calculate the reverse percentage of the delta for each of the locations based on a baseline. This function allows you to specify whether to calculate the rp values based on the deltas to the same location or whether to calculate them using all locations.
Arguments
- data: pandas dataframe: This is the dataframe containing the times and locations that will be used to calculate the reverse percentage deltas.
- columns: dictionary: This is the dictionary with the column names in data for each of the values of data that we need in our calculations. This dictionary should be of the form: {'time': column containing the times of sensor triggers, 'location': column containing the locations of the sensor triggers}
- baseline_length_days: integer: This is the length of the baseline in days that will be used. This value is used when finding the baseline_length_days complete days of data to use as a baseline.
- baseline_offset_days: integer: This is the offset to the baseline period. 0 corresponds to a time period ending the morning of the current date being calculated on.
- all_loc_as_baseline: bool: This argument dictates whether all the locations are used as part of the calculation for the reverse percentage or if only the values from the to locations are used.
Returns
- out: pandas dataframe: This is the outputted data frame, complete with rp values.
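A hedged usage sketch on synthetic data using the default column names 'start_date' and 'location_name' from the signature above; exact rp values depend on BaseDensityCalc. Days without a complete baseline come back with NaN (pooled baseline) or -1 (per-location baseline):
```python
import pandas as pd
from minder_utils.feature_engineering.TimeFunctions import rp_location_delta

# hypothetical single-subject data: three kitchen -> bathroom transitions per day
rows = []
for day in pd.date_range('2021-06-15', '2021-06-29', freq='D'):
    for hour in (8, 13, 20):
        rows.append({'start_date': day + pd.Timedelta(hours=hour), 'location_name': 'kitchen'})
        rows.append({'start_date': day + pd.Timedelta(hours=hour, minutes=5), 'location_name': 'bathroom'})
data = pd.DataFrame(rows)

# reverse percentages computed against a pooled baseline of deltas to all locations
out = rp_location_delta(data, all_loc_as_baseline=True)
print(out[['from', 'to', 'delta', 'rp']].tail())
```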
def rp_single_location_delta(input_df, single_location, baseline_length_days=7, baseline_offset_days=0, columns={'time': 'time', 'location': 'location'}, recall_value=5)
-
This function takes the input_df and calculates the reverse percentage time delta between the single_location trigger time and the times of the recall_value sensor triggers immediately before it. The baseline for the reverse percentage calculation is defined by baseline_length_days and baseline_offset_days.
For example: With baseline_length_days = 7 and baseline_offset_days = 1, the rp deltas on the day pd.Timestamp('2021-06-29') are calculated using the deltas from pd.Timestamp('2021-06-21 00:00:00') to pd.Timestamp('2021-06-28 00:00:00').
This does not separate on subject. Please pass data from a single subject into this function.
NOTE: The reverse percentage is calculated based on all of the deltas coming into a location! This means that the delta is agnostic to the "from" location.
Arguments
- input_df: pandas dataframe: This is a dataframe that contains columns relating to the time and location of sensor triggers.
- single_location: string: This is the location value that you wish to calculate the delta to.
- baseline_length_days: integer: This is the length of the baseline in days that will be used. This value is used when finding the baseline_length_days complete days of single_location data to use as a baseline.
- baseline_offset_days: integer: This is the offset to the baseline period. 0 corresponds to a time period ending the morning of the current date being calculated on.
- columns: dictionary: This is the dictionary with the column names in input_df for each of the values of data that we need in our calculations. This dictionary should be of the form: {'time': column containing the times of sensor triggers, 'location': column containing the locations of the sensor triggers}
- recall_value: integer: This is the number of sensor triggers immediately before each single_location trigger to include in the delta calculation.
Returns
- out: dictionary: This has the Timestamps of the dates as keys (for example: Timestamp('2021-05-05 00:00:00')) and the arrays of deltas as values. The arrays of deltas are of shape (Nt, recall_value) where Nt is the number of visits to single_location on a given day.
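A hedged usage sketch on synthetic single-subject data (the locations and timings are invented for illustration; exact rp values depend on BaseDensityCalc). Days that do not yet have baseline_length_days complete days of history are returned filled with -1:
```python
import pandas as pd
from minder_utils.feature_engineering.TimeFunctions import rp_single_location_delta

# hypothetical single-subject data: three bathroom visits per day over two weeks
rows = []
for day in pd.date_range('2021-06-15', '2021-06-29', freq='D'):
    for hour in (8, 13, 20):
        rows.append({'time': day + pd.Timedelta(hours=hour), 'location': 'kitchen'})
        rows.append({'time': day + pd.Timedelta(hours=hour, minutes=5), 'location': 'bathroom'})
df = pd.DataFrame(rows)

rp = rp_single_location_delta(df, 'bathroom', baseline_length_days=7, recall_value=2)
# keys are daily Timestamps, values are arrays of shape (Nt, recall_value)
print({date: values.shape for date, values in rp.items()})
```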
def single_location_delta(input_df, single_location, columns={'time': 'time', 'location': 'location'}, recall_value=5, return_as_list=False)
-
This function takes the input_df and calculates the raw time delta between the single_location trigger time and the times of the recall_value sensor triggers immediately before it.
This does not separate on subject. Please pass data from a single subject into this function.
Arguments
- input_df: pandas dataframe: This is a dataframe that contains columns relating to the subject, time and location of sensor triggers.
- single_location: string: This is the location value that you wish to calculate the delta to.
- columns: dictionary: This is the dictionary with the column names in input_df for each of the values of data that we need in our calculations. This dictionary should be of the form: {'time': column containing the times of sensor triggers, 'location': column containing the locations of the sensor triggers}
- recall_value: integer: This is the number of sensor triggers immediately before each single_location trigger to include in the delta calculation.
- return_as_list: bool: This option allows the user to return a list of the dates and data if True. This is used internally by other functions.
Returns
- out: dictionary: This has the Timestamps of the dates as keys (for example: Timestamp('2021-05-05 00:00:00')) and the arrays of deltas as values. The arrays of deltas are of shape (Nt, recall_value) where Nt is the number of visits to single_location on a given day. If there are no single_location visits found in the data, then an empty dictionary will be returned.
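A minimal sketch with invented data, using the default 'time' and 'location' column names. Note that, as implemented, a single_location trigger is only included once strictly more than recall_value triggers precede it, so the first bathroom visit below is skipped:
```python
import pandas as pd
from minder_utils.feature_engineering.TimeFunctions import single_location_delta

# hypothetical single-subject data for one day
df = pd.DataFrame({
    'time': pd.to_datetime([
        '2021-05-05 07:58:00', '2021-05-05 07:59:00', '2021-05-05 08:00:00',
        '2021-05-05 09:30:00', '2021-05-05 09:31:00', '2021-05-05 09:32:00',
    ]),
    'location': ['hallway', 'lounge', 'bathroom',
                 'hallway', 'lounge', 'bathroom'],
})

out = single_location_delta(df, 'bathroom', recall_value=2)
print(out)
# {Timestamp('2021-05-05 00:00:00'): array([[ 60., 120.]])}
```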
Classes
class TimeDeltaDensity (save_baseline_array=True, sample=False, sample_size=10000, seed=None, verbose=True)
-
This class allows the user to calculate reverse percentiles on some data, given another dataset.
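A hedged sketch of direct use. The fit/transform interface is inherited from BaseDensityCalc and mirrors how the class is used inside rp_single_location_delta; the baseline values below are invented:
```python
import numpy as np
from minder_utils.feature_engineering.TimeFunctions import TimeDeltaDensity

# hypothetical baseline of time deltas (seconds) and new deltas to score
baseline = np.random.default_rng(0).exponential(scale=300.0, size=(1000, 1))
new_deltas = np.array([[30.0], [300.0], [3000.0]])

td = TimeDeltaDensity(sample=True, sample_size=500, seed=0, verbose=False)
td.fit(baseline)                 # inherited from BaseDensityCalc
print(td.transform(new_deltas))  # reverse percentiles of the new deltas within the baseline
```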
Ancestors
- minder_utils.feature_engineering.DensityFunctions.BaseDensityCalc