Module `minder_utils.feature_engineering.TimeFunctions`

Expand source code

import numpy as np
import pandas as pd
from .DensityFunctions import BaseDensityCalc


def raw_delta_calc(times):
    '''
    Given an array of times, this function calculates the deltas between them.

    Arguments
    ---------
    
    - times: array:
        This is an array of times that will be used to calculate the deltas.

    Returns
    ---------

    - out: array:
        This is an array of deltas.

    '''

    out = (times[1:] - times[:-1])*1e-9

    out = out.astype(float)


    return out



def single_location_delta(input_df, single_location, 
                            columns={'time': 'time', 'location': 'location'}, recall_value=5, 
                            return_as_list = False):
    '''
    This function takes the ```input_df``` and calculates the raw time delta between the single_location location time
    and the time of the ```recall_value``` number of locations immediately before the single_location.

    This does not separate on subject. Please pass data from a single subject into this function.

    Arguments
    ---------

    - input_df: pandas dataframe:
        This is a dataframe that contains columns relating to the subject, time and location of sensor trigger.

    - single_location: string:
        This is the location value that you wish to calculate the delta to.
    
    - columns: dictionary:
        This is the dictionary with the column names in ```input_df``` for each of the values of data that we need 
        in our calculations.
        This dictionary should be of the form:
        ```
        {'time':      column containing the times of sensor triggers,
         'location':  column containing the locations of the sensor triggers}
         ```

    - recall_value: integer:
        This is the number of previous locations to the single_location trigger

    - return_as_list: bool:
        This option allows the user to return a list of the dates and data if ```True```. This is 
        used internally by other functions.

    
    Returns
    ---------

    - out: dictionary:
        This has the Timestamps of the dates as keys (for example: Timestamp('2021-05-05 00:00:00')) and the 
        arrays of deltas as values. The arrays of deltas are of shape ```(Nt, recall_value)``` where Nt is the 
        number of visits to ```single_location``` on a given day. If there are no ```single_location``` visits
        found in the data, then an empty dictionary will be returned.

    '''
    time_column = columns['time']
    location_column = columns['location']

    # format the incoming data to ensure assumptions about structure are met
    input_df[time_column] = pd.to_datetime(input_df[time_column], utc=True)
    input_df = input_df.sort_values(time_column)

    # find the indices of the data that match with the location we want to find the delta to
    single_location_indices = np.where(input_df[location_column] == single_location)[0].reshape(-1, 1)
    # making sure that the recall value is not more than the number of sensor triggers before the
    # first single_location sensor trigger
    if len(single_location_indices) ==  0:
        
        if return_as_list: return [], []
        else: return {}

    single_location_indices = single_location_indices[np.argmax(recall_value < single_location_indices):]

    # indices of the sensor triggers that we need in our calculations
    recall_indices = np.hstack([single_location_indices - i for i in range(recall_value + 1)])

    # the times of the sensor triggers
    recall_times = input_df[time_column].values[recall_indices]

    # the delta between the times for each of the previous sensors to recall_value
    recall_delta = (recall_times[:, 0, None] - recall_times[:, 1:]) * 1e-9

    # the times of the single_location triggers
    single_location_times = input_df[time_column].iloc[single_location_indices.reshape(-1, )]
    # dates of the single_location triggers
    single_location_dates = single_location_times.dt.date

    # out dictionary
    out = {}


    if return_as_list:
        date_list = []
        data_list = []
        for nd, date in enumerate(single_location_dates.unique()):
            date_list.append(date)
            data_to_add = recall_delta[single_location_dates.values == date].astype(float)
            data_list.append(data_to_add)
        
        return pd.to_datetime(date_list), data_list


    else:
        # creating the output dictionary
        for date in single_location_dates.unique():
            # saving the delta values for this date to the dictionary
            out[pd.to_datetime(date)] = recall_delta[single_location_dates.values == date].astype(float)

        return out


class TimeDeltaDensity(BaseDensityCalc):
    '''
    This function allows the user to calculate reverse percentiles on some data, given another
    dataset.

    '''

    def __init__(self, save_baseline_array=True, sample=False, sample_size=10000,
                 seed=None, verbose=True):
        BaseDensityCalc.__init__(self, save_baseline_array=save_baseline_array,
                                 sample=sample, sample_size=sample_size, seed=seed, verbose=verbose)

        return




def rp_single_location_delta(input_df, single_location, baseline_length_days = 7, baseline_offset_days = 0,
                             columns={'time': 'time', 'location': 'location'}, recall_value=5):
    '''
    This function takes the ```input_df``` and calculates the reverse percentage time delta between the ```single_location``` location time
    and the time of the ```recall_value``` number of locations immediately before the ```single_location```. The baseline
    for the reverse percentage calculation is defined by ```baseline_length_days``` and ```baseline_offset_days```. 

    For example:
    With ```baseline_length_days = 7``` and ```baseline_offset_days = 1```, the rp deltas on the day
    ```pd.Timestamp('2021-06-29')``` are calculated using the deltas from 
    ```pd.Timestamp('2021-06-21 00:00:00')``` to ```pd.Timestamp('2021-06-28 00:00:00')```.

    This does not separate on subject. Please pass data from a single subject into this function.

    NOTE: The reverse percentage is calculated based on all of the deltas coming into a location!
    This means that the delta is agnostic to the "from" location.

    Arguments
    ---------

    - input_df: pandas dataframe:
        This is a dataframe that contains columns relating to the time and location of sensor trigger.

    - single_location: string:
        This is the location value that you wish to calculate the delta to.
    
    - baseline_length_days: integer:
        This is the length of the baseline in days that will be used. This value is used when finding
        the ```baseline_length_days``` complete days of ```single_location``` data to use as a baseline.
    
    - baseline_offset_days: integer:
        This is the offset to the baseline period. ```0``` corresponds to a time period ending the morning of the
        current date being calculated on.
    
    - columns: dictionary:
        This is the dictionary with the column names in ```input_df``` for each of the values of data that we need 
        in our calculations.
        This dictionary should be of the form:
        ```
        {'time':      column containing the times of sensor triggers,
         'location':  column containing the locations of the sensor triggers}
         ```

    - recall_value: integer:
        This is the number of previous locations to the single_location trigger
    
    
    Returns
    ---------

    - out: dictionary:
        This has the Timestamps of the dates as keys (for example: Timestamp('2021-05-05 00:00:00')) and the 
        arrays of deltas as values. The arrays of deltas are of shape ```(Nt, recall_value)``` where Nt is the 
        number of visits to ```single_location``` on a given day.

    '''


    # column names
    time_column = columns['time']
    location_column = columns['location']

    out = {}

    # format the incoming data to ensure assumptions about structure are met
    input_df[time_column] = pd.to_datetime(input_df[time_column], utc=True)
    input_df = input_df.sort_values(time_column)

    # getting the single location raw delta
    date_list, data_list = single_location_delta(input_df, single_location, columns, recall_value, return_as_list=True)

    # for each date
    for nd, date in enumerate(date_list):
        date = pd.to_datetime(date)


        '''
        if len(baseline_offset)>0:
            baseline_start_tp = pd.to_datetime(date - pd.Timedelta(**baseline_length) - pd.Timedelta(**baseline_offset))
            baseline_end_tp = pd.to_datetime(date - pd.Timedelta(**baseline_offset))
        else:
            baseline_start_tp = pd.to_datetime(date - pd.Timedelta(**baseline_length))
            baseline_end_tp = pd.to_datetime(date)
        '''

        

        index_baseline_end = np.where(date_list <= date)[0][-1]
        
        index_baseline_end = index_baseline_end - baseline_offset_days
        index_baseline_start = index_baseline_end - baseline_length_days

        if index_baseline_start < 0:
            out[date] = -1*np.ones_like(data_list[nd])
            continue

        
        baseline_delta = np.vstack([data_list[index] for index in range(index_baseline_start, index_baseline_end)])

        
        td = TimeDeltaDensity(save_baseline_array=True, sample=True, sample_size=10000,
                                seed=None, verbose=False)

        td.fit(baseline_delta)
        out[date] = td.transform(data_list[nd])
        
    return out


def rp_location_delta(data, columns = {'time': 'start_date', 'location': 'location_name'}, baseline_length_days = 7,
                           baseline_offset_days = 0, all_loc_as_baseline = False):
    ''' 
    This funciton allows you to calculate the reverse percentage of the delta for each of the locations based on a baseline.
    This function allows you to specify whether to calculate the rp values based on the deltas to the same location or 
    whether to calculate them using all locations.

    Arguments
    ---------

    - data: pandas dataframe:
        This is the dataframe containing the time and locations that will be used to calculate the reverse 
        percentage deltas
    
    - columns: dictionary:
        This is the dictionary with the column names in ```input_df``` for each of the values of data that we need 
        in our calculations.
        This dictionary should be of the form:
        ```
        {'time':      column containing the times of sensor triggers,
         'location':  column containing the locations of the sensor triggers}
         ```

    - baseline_length_days: integer:
        This is the length of the baseline in days that will be used. This value is used when finding
        the ```baseline_length_days``` complete days of ```single_location``` data to use as a baseline.
    
    - baseline_offset_days: integer:
        This is the offset to the baseline period. ```0``` corresponds to a time period ending the morning of the
        current date being calculated on.

    - all_loc_as_baseline: bool:
        This argument dictates whether all the locations are used as part of the calculationg for the reverse
        percentage or if only the values from the ```to``` locations are used.

    Returns
    ---------
        
    - out: pandas dataframe:
        This is the outputted data frame, complete with rp values.
        
    
    '''
    
    import time
    
    time_col = columns['time']
    location_col = columns['location']

    data[time_col] = pd.to_datetime(data[time_col])

    data = data.sort_values(time_col)

    if all_loc_as_baseline:



        times = data[time_col].values
        raw_delta = raw_delta_calc(times)

        locations = data[location_col].values

        df_dict = {'from': locations[:-1], 'to': locations[1:], 'delta': raw_delta, time_col: times}

        out = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in df_dict.items()]))
        out['date'] = out[time_col].dt.date

        baseline_df = out.groupby(by='date')['delta'].apply(list).reset_index()

        dates = baseline_df['date'].values
        deltas = baseline_df['delta'].values

        rp_col = []


        for nd in range(dates.shape[0]):
            date = dates[nd]
            this_delta = deltas[nd]
            index_baseline_end = np.where(dates <= date)[0][-1]
            index_baseline_end = index_baseline_end - baseline_offset_days
            index_baseline_start = index_baseline_end - baseline_length_days

            if index_baseline_start < 0:
                rp_col.extend([np.NAN]*len(this_delta))
            
            else:
                X_fit = np.hstack(deltas[index_baseline_start:index_baseline_end]).reshape(-1,1)
                X_transform = np.asarray(this_delta).reshape(-1,1)
                td = TimeDeltaDensity(sample = True, sample_size = 10000, seed = nd, verbose = False)
                td.fit(X_fit)
                rp_col.extend(td.transform(X_transform).reshape(-1,))

        out['rp'] = rp_col


        return out
    
    else:


        unique_locations = data[location_col].unique()
        data['date'] = pd.to_datetime(data[time_col].dt.date)

        rp_col = -1*np.ones(data.shape[0])


        for location in unique_locations:
            start_func = time.time()
            delta_dict = rp_single_location_delta(input_df=data, 
                                single_location=location, 
                                baseline_length_days=7, 
                                baseline_offset_days=0, 
                                columns=columns, 
                                recall_value=1)
            end_func = time.time()
            location_index = np.where(data[location_col] == location)[0]
            for date in delta_dict:
                deltas = delta_dict[pd.Timestamp(date)]
                
                start_search = time.time()
                index_add = location_index[np.where(data['date'].iloc[location_index] == pd.Timestamp(date))[0]]
                end_search = time.time()
                
                # This accounts for rp_single_location_delta function not calculating 
                index_add = index_add[-deltas.shape[0]:]
                rp_col[index_add] = deltas.reshape(-1,)

        data['rp'] = rp_col

        df_dict = {'from': data[location_col].values[:-1], 'to': data[location_col].values[1:],
                    'rp': data['rp'].values[1:], time_col: data[time_col].values[1:]}

        out = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in df_dict.items()]))
        
        return out




def datetime_to_clock(times):
    '''
    This function converts each date time in an array to a vector
    of size 2 which represents the time in a continuous way.  The vector represents 
    the co-ordinates of a cirlce for which the time would represent on a 24-hour analogue clock.
    
    Arguments
    ---------
    
    - times: array:
        This is an array of times to be converted. The shape of this 
        array should be (N,) or (N,1).
    
    Returns
    ---------
    
    - out: array:
        This is an array containing the transformed times. This array
        will be of size (N,2). The vector represents the co-ordinates of 
        a cirlce for which the time would represent on a 24-hour analogue clock.
    
    '''

    times = pd.to_datetime(times, utc=True)

    total_seconds = times.hour * 3600 \
                    + times.minute * 60 \
                    + times.second \
                    + 1e-6 * times.microsecond
    total_seconds = np.asarray(total_seconds)

    C = 24 * 3600
    x = (np.sin(2 * np.pi * total_seconds / C) + 1e-12).reshape(-1, 1)
    y = (np.cos(2 * np.pi * total_seconds / C) + 1e-12).reshape(-1, 1)

    out = np.hstack([x, y])

    return out

Functions

def datetime_to_clock(times)

This function converts each date time in an array to a vector of size 2 which represents the time in a continuous way. The vector represents the co-ordinates of a cirlce for which the time would represent on a 24-hour analogue clock.

Arguments

times: array: This is an array of times to be converted. The shape of this array should be (N,) or (N,1).

Returns

out: array: This is an array containing the transformed times. This array will be of size (N,2). The vector represents the co-ordinates of a cirlce for which the time would represent on a 24-hour analogue clock.

Expand source code

def datetime_to_clock(times):
    '''
    This function converts each date time in an array to a vector
    of size 2 which represents the time in a continuous way.  The vector represents 
    the co-ordinates of a cirlce for which the time would represent on a 24-hour analogue clock.
    
    Arguments
    ---------
    
    - times: array:
        This is an array of times to be converted. The shape of this 
        array should be (N,) or (N,1).
    
    Returns
    ---------
    
    - out: array:
        This is an array containing the transformed times. This array
        will be of size (N,2). The vector represents the co-ordinates of 
        a cirlce for which the time would represent on a 24-hour analogue clock.
    
    '''

    times = pd.to_datetime(times, utc=True)

    total_seconds = times.hour * 3600 \
                    + times.minute * 60 \
                    + times.second \
                    + 1e-6 * times.microsecond
    total_seconds = np.asarray(total_seconds)

    C = 24 * 3600
    x = (np.sin(2 * np.pi * total_seconds / C) + 1e-12).reshape(-1, 1)
    y = (np.cos(2 * np.pi * total_seconds / C) + 1e-12).reshape(-1, 1)

    out = np.hstack([x, y])

    return out

def raw_delta_calc(times)

Given an array of times, this function calculates the deltas between them.

Arguments

times: array: This is an array of times that will be used to calculate the deltas.

Returns

out: array: This is an array of deltas.

Expand source code

def raw_delta_calc(times):
    '''
    Given an array of times, this function calculates the deltas between them.

    Arguments
    ---------
    
    - times: array:
        This is an array of times that will be used to calculate the deltas.

    Returns
    ---------

    - out: array:
        This is an array of deltas.

    '''

    out = (times[1:] - times[:-1])*1e-9

    out = out.astype(float)


    return out

def rp_location_delta(data, columns={'time': 'start_date', 'location': 'location_name'}, baseline_length_days=7, baseline_offset_days=0, all_loc_as_baseline=False)

This funciton allows you to calculate the reverse percentage of the delta for each of the locations based on a baseline. This function allows you to specify whether to calculate the rp values based on the deltas to the same location or whether to calculate them using all locations.

Arguments

data: pandas dataframe: This is the dataframe containing the time and locations that will be used to calculate the reverse percentage deltas
columns: dictionary: This is the dictionary with the column names in input_df for each of the values of data that we need in our calculations. This dictionary should be of the form: {'time': column containing the times of sensor triggers, 'location': column containing the locations of the sensor triggers}
baseline_length_days: integer: This is the length of the baseline in days that will be used. This value is used when finding the baseline_length_days complete days of single_location data to use as a baseline.
baseline_offset_days: integer: This is the offset to the baseline period. 0 corresponds to a time period ending the morning of the current date being calculated on.
all_loc_as_baseline: bool: This argument dictates whether all the locations are used as part of the calculationg for the reverse percentage or if only the values from the to locations are used.

Returns

out: pandas dataframe: This is the outputted data frame, complete with rp values.

Expand source code

def rp_location_delta(data, columns = {'time': 'start_date', 'location': 'location_name'}, baseline_length_days = 7,
                           baseline_offset_days = 0, all_loc_as_baseline = False):
    ''' 
    This funciton allows you to calculate the reverse percentage of the delta for each of the locations based on a baseline.
    This function allows you to specify whether to calculate the rp values based on the deltas to the same location or 
    whether to calculate them using all locations.

    Arguments
    ---------

    - data: pandas dataframe:
        This is the dataframe containing the time and locations that will be used to calculate the reverse 
        percentage deltas
    
    - columns: dictionary:
        This is the dictionary with the column names in ```input_df``` for each of the values of data that we need 
        in our calculations.
        This dictionary should be of the form:
        ```
        {'time':      column containing the times of sensor triggers,
         'location':  column containing the locations of the sensor triggers}
         ```

    - baseline_length_days: integer:
        This is the length of the baseline in days that will be used. This value is used when finding
        the ```baseline_length_days``` complete days of ```single_location``` data to use as a baseline.
    
    - baseline_offset_days: integer:
        This is the offset to the baseline period. ```0``` corresponds to a time period ending the morning of the
        current date being calculated on.

    - all_loc_as_baseline: bool:
        This argument dictates whether all the locations are used as part of the calculationg for the reverse
        percentage or if only the values from the ```to``` locations are used.

    Returns
    ---------
        
    - out: pandas dataframe:
        This is the outputted data frame, complete with rp values.
        
    
    '''
    
    import time
    
    time_col = columns['time']
    location_col = columns['location']

    data[time_col] = pd.to_datetime(data[time_col])

    data = data.sort_values(time_col)

    if all_loc_as_baseline:



        times = data[time_col].values
        raw_delta = raw_delta_calc(times)

        locations = data[location_col].values

        df_dict = {'from': locations[:-1], 'to': locations[1:], 'delta': raw_delta, time_col: times}

        out = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in df_dict.items()]))
        out['date'] = out[time_col].dt.date

        baseline_df = out.groupby(by='date')['delta'].apply(list).reset_index()

        dates = baseline_df['date'].values
        deltas = baseline_df['delta'].values

        rp_col = []


        for nd in range(dates.shape[0]):
            date = dates[nd]
            this_delta = deltas[nd]
            index_baseline_end = np.where(dates <= date)[0][-1]
            index_baseline_end = index_baseline_end - baseline_offset_days
            index_baseline_start = index_baseline_end - baseline_length_days

            if index_baseline_start < 0:
                rp_col.extend([np.NAN]*len(this_delta))
            
            else:
                X_fit = np.hstack(deltas[index_baseline_start:index_baseline_end]).reshape(-1,1)
                X_transform = np.asarray(this_delta).reshape(-1,1)
                td = TimeDeltaDensity(sample = True, sample_size = 10000, seed = nd, verbose = False)
                td.fit(X_fit)
                rp_col.extend(td.transform(X_transform).reshape(-1,))

        out['rp'] = rp_col


        return out
    
    else:


        unique_locations = data[location_col].unique()
        data['date'] = pd.to_datetime(data[time_col].dt.date)

        rp_col = -1*np.ones(data.shape[0])


        for location in unique_locations:
            start_func = time.time()
            delta_dict = rp_single_location_delta(input_df=data, 
                                single_location=location, 
                                baseline_length_days=7, 
                                baseline_offset_days=0, 
                                columns=columns, 
                                recall_value=1)
            end_func = time.time()
            location_index = np.where(data[location_col] == location)[0]
            for date in delta_dict:
                deltas = delta_dict[pd.Timestamp(date)]
                
                start_search = time.time()
                index_add = location_index[np.where(data['date'].iloc[location_index] == pd.Timestamp(date))[0]]
                end_search = time.time()
                
                # This accounts for rp_single_location_delta function not calculating 
                index_add = index_add[-deltas.shape[0]:]
                rp_col[index_add] = deltas.reshape(-1,)

        data['rp'] = rp_col

        df_dict = {'from': data[location_col].values[:-1], 'to': data[location_col].values[1:],
                    'rp': data['rp'].values[1:], time_col: data[time_col].values[1:]}

        out = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in df_dict.items()]))
        
        return out

def rp_single_location_delta(input_df, single_location, baseline_length_days=7, baseline_offset_days=0, columns={'time': 'time', 'location': 'location'}, recall_value=5)

This function takes the input_df and calculates the reverse percentage time delta between the single_location location time and the time of the recall_value number of locations immediately before the single_location. The baseline for the reverse percentage calculation is defined by baseline_length_days and baseline_offset_days.

For example: With baseline_length_days = 7 and baseline_offset_days = 1, the rp deltas on the day pd.Timestamp('2021-06-29') are calculated using the deltas from pd.Timestamp('2021-06-21 00:00:00') to pd.Timestamp('2021-06-28 00:00:00').

This does not separate on subject. Please pass data from a single subject into this function.

NOTE: The reverse percentage is calculated based on all of the deltas coming into a location! This means that the delta is agnostic to the "from" location.

Arguments

input_df: pandas dataframe: This is a dataframe that contains columns relating to the time and location of sensor trigger.
single_location: string: This is the location value that you wish to calculate the delta to.
baseline_length_days: integer: This is the length of the baseline in days that will be used. This value is used when finding the baseline_length_days complete days of single_location data to use as a baseline.
baseline_offset_days: integer: This is the offset to the baseline period. 0 corresponds to a time period ending the morning of the current date being calculated on.
columns: dictionary: This is the dictionary with the column names in input_df for each of the values of data that we need in our calculations. This dictionary should be of the form: {'time': column containing the times of sensor triggers, 'location': column containing the locations of the sensor triggers}
recall_value: integer: This is the number of previous locations to the single_location trigger

Returns

out: dictionary: This has the Timestamps of the dates as keys (for example: Timestamp('2021-05-05 00:00:00')) and the arrays of deltas as values. The arrays of deltas are of shape (Nt, recall_value) where Nt is the number of visits to single_location on a given day.

Expand source code

def rp_single_location_delta(input_df, single_location, baseline_length_days = 7, baseline_offset_days = 0,
                             columns={'time': 'time', 'location': 'location'}, recall_value=5):
    '''
    This function takes the ```input_df``` and calculates the reverse percentage time delta between the ```single_location``` location time
    and the time of the ```recall_value``` number of locations immediately before the ```single_location```. The baseline
    for the reverse percentage calculation is defined by ```baseline_length_days``` and ```baseline_offset_days```. 

    For example:
    With ```baseline_length_days = 7``` and ```baseline_offset_days = 1```, the rp deltas on the day
    ```pd.Timestamp('2021-06-29')``` are calculated using the deltas from 
    ```pd.Timestamp('2021-06-21 00:00:00')``` to ```pd.Timestamp('2021-06-28 00:00:00')```.

    This does not separate on subject. Please pass data from a single subject into this function.

    NOTE: The reverse percentage is calculated based on all of the deltas coming into a location!
    This means that the delta is agnostic to the "from" location.

    Arguments
    ---------

    - input_df: pandas dataframe:
        This is a dataframe that contains columns relating to the time and location of sensor trigger.

    - single_location: string:
        This is the location value that you wish to calculate the delta to.
    
    - baseline_length_days: integer:
        This is the length of the baseline in days that will be used. This value is used when finding
        the ```baseline_length_days``` complete days of ```single_location``` data to use as a baseline.
    
    - baseline_offset_days: integer:
        This is the offset to the baseline period. ```0``` corresponds to a time period ending the morning of the
        current date being calculated on.
    
    - columns: dictionary:
        This is the dictionary with the column names in ```input_df``` for each of the values of data that we need 
        in our calculations.
        This dictionary should be of the form:
        ```
        {'time':      column containing the times of sensor triggers,
         'location':  column containing the locations of the sensor triggers}
         ```

    - recall_value: integer:
        This is the number of previous locations to the single_location trigger
    
    
    Returns
    ---------

    - out: dictionary:
        This has the Timestamps of the dates as keys (for example: Timestamp('2021-05-05 00:00:00')) and the 
        arrays of deltas as values. The arrays of deltas are of shape ```(Nt, recall_value)``` where Nt is the 
        number of visits to ```single_location``` on a given day.

    '''


    # column names
    time_column = columns['time']
    location_column = columns['location']

    out = {}

    # format the incoming data to ensure assumptions about structure are met
    input_df[time_column] = pd.to_datetime(input_df[time_column], utc=True)
    input_df = input_df.sort_values(time_column)

    # getting the single location raw delta
    date_list, data_list = single_location_delta(input_df, single_location, columns, recall_value, return_as_list=True)

    # for each date
    for nd, date in enumerate(date_list):
        date = pd.to_datetime(date)


        '''
        if len(baseline_offset)>0:
            baseline_start_tp = pd.to_datetime(date - pd.Timedelta(**baseline_length) - pd.Timedelta(**baseline_offset))
            baseline_end_tp = pd.to_datetime(date - pd.Timedelta(**baseline_offset))
        else:
            baseline_start_tp = pd.to_datetime(date - pd.Timedelta(**baseline_length))
            baseline_end_tp = pd.to_datetime(date)
        '''

        

        index_baseline_end = np.where(date_list <= date)[0][-1]
        
        index_baseline_end = index_baseline_end - baseline_offset_days
        index_baseline_start = index_baseline_end - baseline_length_days

        if index_baseline_start < 0:
            out[date] = -1*np.ones_like(data_list[nd])
            continue

        
        baseline_delta = np.vstack([data_list[index] for index in range(index_baseline_start, index_baseline_end)])

        
        td = TimeDeltaDensity(save_baseline_array=True, sample=True, sample_size=10000,
                                seed=None, verbose=False)

        td.fit(baseline_delta)
        out[date] = td.transform(data_list[nd])
        
    return out

def single_location_delta(input_df, single_location, columns={'time': 'time', 'location': 'location'}, recall_value=5, return_as_list=False)

This function takes the input_df and calculates the raw time delta between the single_location location time and the time of the recall_value number of locations immediately before the single_location.

This does not separate on subject. Please pass data from a single subject into this function.

Arguments

input_df: pandas dataframe: This is a dataframe that contains columns relating to the subject, time and location of sensor trigger.
single_location: string: This is the location value that you wish to calculate the delta to.
columns: dictionary: This is the dictionary with the column names in input_df for each of the values of data that we need in our calculations. This dictionary should be of the form: {'time': column containing the times of sensor triggers, 'location': column containing the locations of the sensor triggers}
recall_value: integer: This is the number of previous locations to the single_location trigger
return_as_list: bool: This option allows the user to return a list of the dates and data if True. This is used internally by other functions.

Returns

out: dictionary: This has the Timestamps of the dates as keys (for example: Timestamp('2021-05-05 00:00:00')) and the arrays of deltas as values. The arrays of deltas are of shape (Nt, recall_value) where Nt is the number of visits to single_location on a given day. If there are no single_location visits found in the data, then an empty dictionary will be returned.

Expand source code

def single_location_delta(input_df, single_location, 
                            columns={'time': 'time', 'location': 'location'}, recall_value=5, 
                            return_as_list = False):
    '''
    This function takes the ```input_df``` and calculates the raw time delta between the single_location location time
    and the time of the ```recall_value``` number of locations immediately before the single_location.

    This does not separate on subject. Please pass data from a single subject into this function.

    Arguments
    ---------

    - input_df: pandas dataframe:
        This is a dataframe that contains columns relating to the subject, time and location of sensor trigger.

    - single_location: string:
        This is the location value that you wish to calculate the delta to.
    
    - columns: dictionary:
        This is the dictionary with the column names in ```input_df``` for each of the values of data that we need 
        in our calculations.
        This dictionary should be of the form:
        ```
        {'time':      column containing the times of sensor triggers,
         'location':  column containing the locations of the sensor triggers}
         ```

    - recall_value: integer:
        This is the number of previous locations to the single_location trigger

    - return_as_list: bool:
        This option allows the user to return a list of the dates and data if ```True```. This is 
        used internally by other functions.

    
    Returns
    ---------

    - out: dictionary:
        This has the Timestamps of the dates as keys (for example: Timestamp('2021-05-05 00:00:00')) and the 
        arrays of deltas as values. The arrays of deltas are of shape ```(Nt, recall_value)``` where Nt is the 
        number of visits to ```single_location``` on a given day. If there are no ```single_location``` visits
        found in the data, then an empty dictionary will be returned.

    '''
    time_column = columns['time']
    location_column = columns['location']

    # format the incoming data to ensure assumptions about structure are met
    input_df[time_column] = pd.to_datetime(input_df[time_column], utc=True)
    input_df = input_df.sort_values(time_column)

    # find the indices of the data that match with the location we want to find the delta to
    single_location_indices = np.where(input_df[location_column] == single_location)[0].reshape(-1, 1)
    # making sure that the recall value is not more than the number of sensor triggers before the
    # first single_location sensor trigger
    if len(single_location_indices) ==  0:
        
        if return_as_list: return [], []
        else: return {}

    single_location_indices = single_location_indices[np.argmax(recall_value < single_location_indices):]

    # indices of the sensor triggers that we need in our calculations
    recall_indices = np.hstack([single_location_indices - i for i in range(recall_value + 1)])

    # the times of the sensor triggers
    recall_times = input_df[time_column].values[recall_indices]

    # the delta between the times for each of the previous sensors to recall_value
    recall_delta = (recall_times[:, 0, None] - recall_times[:, 1:]) * 1e-9

    # the times of the single_location triggers
    single_location_times = input_df[time_column].iloc[single_location_indices.reshape(-1, )]
    # dates of the single_location triggers
    single_location_dates = single_location_times.dt.date

    # out dictionary
    out = {}


    if return_as_list:
        date_list = []
        data_list = []
        for nd, date in enumerate(single_location_dates.unique()):
            date_list.append(date)
            data_to_add = recall_delta[single_location_dates.values == date].astype(float)
            data_list.append(data_to_add)
        
        return pd.to_datetime(date_list), data_list


    else:
        # creating the output dictionary
        for date in single_location_dates.unique():
            # saving the delta values for this date to the dictionary
            out[pd.to_datetime(date)] = recall_delta[single_location_dates.values == date].astype(float)

        return out

Classes

class TimeDeltaDensity (save_baseline_array=True, sample=False, sample_size=10000, seed=None, verbose=True)

This function allows the user to calculate reverse percentiles on some data, given another dataset.

Expand source code

class TimeDeltaDensity(BaseDensityCalc):
    '''
    This function allows the user to calculate reverse percentiles on some data, given another
    dataset.

    '''

    def __init__(self, save_baseline_array=True, sample=False, sample_size=10000,
                 seed=None, verbose=True):
        BaseDensityCalc.__init__(self, save_baseline_array=save_baseline_array,
                                 sample=sample, sample_size=sample_size, seed=seed, verbose=verbose)

        return

Ancestors

BaseDensityCalc

Inherited members

BaseDensityCalc:
- fit
- predict
- threshold_counter
- transform