Module minder_utils.dataloader.dataloader

Expand source code
import pandas as pd
import numpy as np
import warnings
import datetime
from minder_utils.util import load_save
from minder_utils.formatting.format_util import normalise as normalized
from minder_utils.formatting.label import label_dataframe
from minder_utils.formatting.standardisation import standardise_physiological_environmental, standardise_activity_data
from minder_utils.configurations import config


class Dataloader:
    """
    Categorise the data into labelled & unlabelled data.
    This dataloader should be used combined with minder_utils.formatting.Formatting.

    After initialising ```formater = Formatting()```
    Parameters:
        - activity: activity data, ```formater.activity```. A string is treated as a path and read with
            ```pd.read_csv```; if None, a warning is issued and no attribute is initialised.
        - physiological: physiological data, ```formater.physiological```
        - environmental: environmental data, ```formater.environmental```
        - max_days: Default 3. How many consecutive days to extend as UTI, if ```max_days = n```, ```n``` days before & after
            the validated date will be labelled as UTI
        - label_data: Default False. label the data or not. If False, ```get_labelled_data()``` cannot be used.
    """

    def __init__(self, activity, physiological=None, environmental=None, max_days=3, label_data=False):
        if activity is None:
            # Nothing below runs, so attributes such as self.activity are never set on this instance.
            warnings.warn('Activity data is None, this class can be only used to load the processed data')
            return
        # isinstance (rather than an exact type comparison) also accepts str subclasses.
        activity = pd.read_csv(activity) if isinstance(activity, str) else activity

        # Keep only the ids that appear in every data source that was provided.
        shared_id = None
        for data in [activity, physiological, environmental]:
            if data is None:
                continue
            ids = set(data.id.unique())
            shared_id = ids if shared_id is None else shared_id.intersection(ids)
        activity = activity[activity.id.isin(shared_id)]
        activity = standardise_activity_data(activity)
        activity.time = pd.to_datetime(activity.time)
        # 'Date' holds datetime.date objects (not Timestamps).
        activity.loc[:, 'Date'] = activity.time.dt.date
        date_range = pd.date_range(activity.Date.min(), activity.Date.max())

        self.physiological = standardise_physiological_environmental(physiological, date_range, shared_id) \
            if physiological is not None else physiological
        self.environmental = standardise_physiological_environmental(environmental, date_range, shared_id) \
            if environmental is not None else environmental

        # Map every configured sensor name to its position; get_data() sorts by this mapping.
        # Note that this writes into the shared config object.
        for datatype in ['environmental', 'physiological']:
            sensors = config[datatype]['sensors']
            config[datatype]['sort_dict'] = dict(zip(sensors, range(len(sensors))))

        if label_data:
            activity = label_dataframe(activity)
            # Chain set_index on the selection instead of modifying the selected frame in place.
            self.labelled_df = activity[activity.valid.notna()].set_index(['id', 'valid', 'Date'])
            if len(self.labelled_df) > 0:
                # Derive the id sets from the index levels so that a missing class
                # (no True or no False rows) produces an empty index instead of a lookup error.
                labelled_index = self.labelled_df.index
                labelled_ids = labelled_index.get_level_values(0)
                is_positive = labelled_index.get_level_values(1).to_numpy().astype(bool)
                self.true_p_ids = labelled_ids[is_positive].unique()
                self.false_p_ids = labelled_ids[~is_positive].unique()
            else:
                print('no data is labelled')

        self.activity = activity.set_index(['id', 'Date'])
        self.max_days = max_days

        self.transfer_sensors = ['back door', 'bathroom1', 'bedroom1', 'dining room',
                                 'fridge door', 'front door', 'hallway', 'kettle', 'kitchen',
                                 'living room', 'lounge', 'microwave', 'study', 'toaster']
        self.select_sensors = config['activity']['sensors']

    def __len__(self):
        # Number of labelled rows divided by 24 -- presumably 24 hourly rows per labelled day; TODO confirm.
        # Requires label_data=True at construction, otherwise labelled_df does not exist.
        return len(self.labelled_df) // 24

    @property
    @load_save(**config['labelled_data']['save'])
    def labelled_data(self):
        """Return the un-normalised output of ```get_labelled_data``` as a dictionary."""
        activity_data, physiological_data, environmental_data, patient_ids, uti_labels, labelled_dates = \
            self.get_labelled_data(normalise=False)
        return {
            'activity': activity_data,
            'phy': physiological_data,
            'env': environmental_data,
            'p_ids': patient_ids,
            'uti_labels': uti_labels,
            'dates': labelled_dates
        }

    @property
    @load_save(**config['unlabelled_data']['save'])
    def unlabelled_data(self):
        """Return the un-normalised output of ```get_unlabelled_data``` as a dictionary."""
        # get_unlabelled_data returns six values; the fifth is a None placeholder
        # sitting where get_labelled_data returns the labels.
        activity_data, physiological_data, environmental_data, patient_ids, _, dates = \
            self.get_unlabelled_data(normalise=False)
        return {
            'activity': activity_data,
            'phy': physiological_data,
            'env': environmental_data,
            'p_ids': patient_ids,
            'dates': dates
        }

    def get_labelled_data(self, normalise=False):
        """
        Collect, for every labelled (id, date), the data of that date and of up to
        ```max_days``` days on either side of it.

        Parameters
        ----------
        normalise: bool, normalise each day of activity data and reshape it to (3, 8, -1)

        Returns activity, physiological, environmental data, patient ids, labels, labelled dates
        -------
        Each of the first five outputs holds one list per labelled date; the first entry of
        every list belongs to the labelled date itself, followed by its neighbouring days.
        """
        # get p ids
        p_ids = self.labelled_df.index.get_level_values(0).unique()
        activity_data, uti_labels, patient_ids = [], [], []
        physiological_data, environmental_data, labelled_dates = [], [], []
        for idx in range(len(p_ids)):
            p_id = p_ids[idx]
            # get data of patient
            data = self.labelled_df.loc[p_id]
            for valid in data.index.get_level_values(0).unique():
                dates = data.loc[valid].index.get_level_values(0).unique()
                for p_date in dates:
                    # validated data
                    act_data, labels, patient, phy_data, env_data = [], [], [], [], []
                    p_data = data.loc[(valid, p_date), self.select_sensors].to_numpy()
                    if normalise:
                        p_data = normalized(np.array(p_data)).reshape(3, 8, -1)
                    act_data.append(p_data)
                    phy_data.append(self.get_data(self.physiological, p_id, p_date, 'physiological'))
                    env_data.append(self.get_data(self.environmental, p_id, p_date, 'environmental'))
                    # A truthy label is stored as int(valid), a falsy one as -1.
                    labels.append(int(valid) if valid else -1)
                    patient.append(p_id)
                    labelled_dates.append(p_date)
                    for i in range(1, self.max_days + 1):
                        # symbol = -1 addresses the day i days AFTER p_date (negative label),
                        # symbol = 1 the day i days BEFORE it (positive label).
                        for symbol in [-1, 1]:
                            f_date = p_date - datetime.timedelta(i) * symbol
                            try:
                                p_data = self.activity.loc[(p_id, f_date), self.select_sensors].to_numpy()
                                if normalise:
                                    p_data = normalized(np.array(p_data)).reshape(3, 8, -1)
                                act_data.append(p_data)
                                phy_data.append(self.get_data(self.physiological, p_id, f_date, 'physiological'))
                                env_data.append(self.get_data(self.environmental, p_id, f_date, 'environmental'))
                                labels.append(self.laplace_smooth(i) * symbol)
                                patient.append(p_id)
                            except KeyError:
                                # NOTE(review): a missing later day also skips the earlier day of the
                                # same offset; confirm `break` (rather than `continue`) is intended.
                                break
                    activity_data.append(act_data)
                    uti_labels.append(labels)
                    patient_ids.append(patient)
                    physiological_data.append(phy_data)
                    environmental_data.append(env_data)

        # NOTE(review): the per-date lists differ in length whenever a neighbouring day is missing;
        # recent NumPy releases reject such ragged input unless dtype=object is given -- confirm.
        activity_data = np.array(activity_data)
        uti_labels = np.array(uti_labels)
        patient_ids = np.array(patient_ids)
        physiological_data = np.array(physiological_data)
        environmental_data = np.array(environmental_data)
        labelled_dates = np.array(labelled_dates)

        return activity_data, physiological_data, environmental_data, patient_ids, uti_labels, labelled_dates

    def get_unlabelled_data(self, normalise=False, date='2021-03-01'):
        '''
        Get the unlabelled data,
        Parameters
        ----------
        normalise: bool, normalise the data or not
        date: str (or any value accepted by ```pd.to_datetime```), only return the data
            later than the date provided. By default, it will not return the tihm unlabelled
            data. Use None to return every date.

        Returns activity, physiological, environmental data, patient ids, None, dates
        -------
        The fifth value is always None.

        '''
        df = self.activity.reset_index().set_index(['id', 'Date'])
        if date is not None:
            # 'Date' is built from datetime.date objects, which cannot be ordered against a str.
            # Converting both sides to pandas datetimes makes the comparison well defined.
            day_level = pd.to_datetime(df.index.get_level_values(1))
            df = df[day_level > pd.to_datetime(date)]

        # get p ids
        p_ids = df.index.get_level_values(0).unique()
        outputs = []
        phy_data, env_data = [], []
        outputs_p_ids = []
        outputs_dates = []
        for idx in range(len(p_ids)):
            p_id = p_ids[idx]
            # get data of patient
            data = df.loc[p_id]
            # `day` (not `date`) so the threshold parameter is not shadowed.
            for day in data.index.get_level_values(0).unique():
                p_data = data.loc[day, self.select_sensors].to_numpy()
                if normalise:
                    p_data = normalized(np.array(p_data)).reshape(3, 8, -1)
                outputs.append(p_data)
                phy_data.append(self.get_data(self.physiological, p_id, day, 'physiological'))
                env_data.append(self.get_data(self.environmental, p_id, day, 'environmental'))
                outputs_p_ids.append(p_id)
                outputs_dates.append(day)
        return np.array(outputs), np.array(phy_data), np.array(env_data), \
               np.array(outputs_p_ids), None, np.array(outputs_dates)

    @staticmethod
    def laplace_smooth(i, lam=3, denominator=1):
        """Return ```exp(-|i| / lam) / denominator```."""
        return np.exp(- np.abs(i) / lam) / denominator

    @staticmethod
    def get_data(df, p_id, date, datatype):
        """
        Look up the values of the configured ```datatype``` sensors for one id and date.

        Returns None when ```df``` is None, the 'value' column ordered by the configured
        sensor order when the lookup succeeds, and a list of zeros (one per configured
        sensor) when the lookup raises KeyError.
        """
        if df is None:
            return None
        try:
            return df.loc[(p_id, date, config[datatype]['sensors'])] \
                .sort_values('location', key=lambda x: x.map(config[datatype]['sort_dict']))['value'].to_numpy()
        except KeyError:
            return [0.] * len(config[datatype]['sensors'])

Classes

class Dataloader (activity, physiological=None, environmental=None, max_days=3, label_data=False)

Categorise the data into labelled & unlabelled data. This dataloader should be used combined with minder_utils.formatting.Formatting.

After initialising formater = Formatting()

Parameters

  • activity: activity data, formater.activity
  • physiological: physiological data, formater.physiological
  • environmental: environmental data, formater.environmental
  • max_days: Default 3. How many consecutive days to extend as UTI, if max_days = n, n days before & after the validated date will be labelled as UTI
  • label_data: Default False. label the data or not. If False, get_labelled_data() cannot be used.
Expand source code
class Dataloader:
    """
    Categorise the data into labelled & unlabelled data.
    This dataloader should be used combined with minder_utils.formatting.Formatting.

    After initialising ```formater = Formatting()```
    Parameters:
        - activity: activity data, ```formater.activity```. A string is treated as a path and read with
            ```pd.read_csv```; if None, a warning is issued and no attribute is initialised.
        - physiological: physiological data, ```formater.physiological```
        - environmental: environmental data, ```formater.environmental```
        - max_days: Default 3. How many consecutive days to extend as UTI, if ```max_days = n```, ```n``` days before & after
            the validated date will be labelled as UTI
        - label_data: Default False. label the data or not. If False, ```get_labelled_data()``` cannot be used.
    """

    def __init__(self, activity, physiological=None, environmental=None, max_days=3, label_data=False):
        if activity is None:
            # Nothing below runs, so attributes such as self.activity are never set on this instance.
            warnings.warn('Activity data is None, this class can be only used to load the processed data')
            return
        # isinstance (rather than an exact type comparison) also accepts str subclasses.
        activity = pd.read_csv(activity) if isinstance(activity, str) else activity

        # Keep only the ids that appear in every data source that was provided.
        shared_id = None
        for data in [activity, physiological, environmental]:
            if data is None:
                continue
            ids = set(data.id.unique())
            shared_id = ids if shared_id is None else shared_id.intersection(ids)
        activity = activity[activity.id.isin(shared_id)]
        activity = standardise_activity_data(activity)
        activity.time = pd.to_datetime(activity.time)
        # 'Date' holds datetime.date objects (not Timestamps).
        activity.loc[:, 'Date'] = activity.time.dt.date
        date_range = pd.date_range(activity.Date.min(), activity.Date.max())

        self.physiological = standardise_physiological_environmental(physiological, date_range, shared_id) \
            if physiological is not None else physiological
        self.environmental = standardise_physiological_environmental(environmental, date_range, shared_id) \
            if environmental is not None else environmental

        # Map every configured sensor name to its position; get_data() sorts by this mapping.
        # Note that this writes into the shared config object.
        for datatype in ['environmental', 'physiological']:
            sensors = config[datatype]['sensors']
            config[datatype]['sort_dict'] = dict(zip(sensors, range(len(sensors))))

        if label_data:
            activity = label_dataframe(activity)
            # Chain set_index on the selection instead of modifying the selected frame in place.
            self.labelled_df = activity[activity.valid.notna()].set_index(['id', 'valid', 'Date'])
            if len(self.labelled_df) > 0:
                # Derive the id sets from the index levels so that a missing class
                # (no True or no False rows) produces an empty index instead of a lookup error.
                labelled_index = self.labelled_df.index
                labelled_ids = labelled_index.get_level_values(0)
                is_positive = labelled_index.get_level_values(1).to_numpy().astype(bool)
                self.true_p_ids = labelled_ids[is_positive].unique()
                self.false_p_ids = labelled_ids[~is_positive].unique()
            else:
                print('no data is labelled')

        self.activity = activity.set_index(['id', 'Date'])
        self.max_days = max_days

        self.transfer_sensors = ['back door', 'bathroom1', 'bedroom1', 'dining room',
                                 'fridge door', 'front door', 'hallway', 'kettle', 'kitchen',
                                 'living room', 'lounge', 'microwave', 'study', 'toaster']
        self.select_sensors = config['activity']['sensors']

    def __len__(self):
        # Number of labelled rows divided by 24 -- presumably 24 hourly rows per labelled day; TODO confirm.
        # Requires label_data=True at construction, otherwise labelled_df does not exist.
        return len(self.labelled_df) // 24

    @property
    @load_save(**config['labelled_data']['save'])
    def labelled_data(self):
        """Return the un-normalised output of ```get_labelled_data``` as a dictionary."""
        activity_data, physiological_data, environmental_data, patient_ids, uti_labels, labelled_dates = \
            self.get_labelled_data(normalise=False)
        return {
            'activity': activity_data,
            'phy': physiological_data,
            'env': environmental_data,
            'p_ids': patient_ids,
            'uti_labels': uti_labels,
            'dates': labelled_dates
        }

    @property
    @load_save(**config['unlabelled_data']['save'])
    def unlabelled_data(self):
        """Return the un-normalised output of ```get_unlabelled_data``` as a dictionary."""
        # get_unlabelled_data returns six values; the fifth is a None placeholder
        # sitting where get_labelled_data returns the labels.
        activity_data, physiological_data, environmental_data, patient_ids, _, dates = \
            self.get_unlabelled_data(normalise=False)
        return {
            'activity': activity_data,
            'phy': physiological_data,
            'env': environmental_data,
            'p_ids': patient_ids,
            'dates': dates
        }

    def get_labelled_data(self, normalise=False):
        """
        Collect, for every labelled (id, date), the data of that date and of up to
        ```max_days``` days on either side of it.

        Parameters
        ----------
        normalise: bool, normalise each day of activity data and reshape it to (3, 8, -1)

        Returns activity, physiological, environmental data, patient ids, labels, labelled dates
        -------
        Each of the first five outputs holds one list per labelled date; the first entry of
        every list belongs to the labelled date itself, followed by its neighbouring days.
        """
        # get p ids
        p_ids = self.labelled_df.index.get_level_values(0).unique()
        activity_data, uti_labels, patient_ids = [], [], []
        physiological_data, environmental_data, labelled_dates = [], [], []
        for idx in range(len(p_ids)):
            p_id = p_ids[idx]
            # get data of patient
            data = self.labelled_df.loc[p_id]
            for valid in data.index.get_level_values(0).unique():
                dates = data.loc[valid].index.get_level_values(0).unique()
                for p_date in dates:
                    # validated data
                    act_data, labels, patient, phy_data, env_data = [], [], [], [], []
                    p_data = data.loc[(valid, p_date), self.select_sensors].to_numpy()
                    if normalise:
                        p_data = normalized(np.array(p_data)).reshape(3, 8, -1)
                    act_data.append(p_data)
                    phy_data.append(self.get_data(self.physiological, p_id, p_date, 'physiological'))
                    env_data.append(self.get_data(self.environmental, p_id, p_date, 'environmental'))
                    # A truthy label is stored as int(valid), a falsy one as -1.
                    labels.append(int(valid) if valid else -1)
                    patient.append(p_id)
                    labelled_dates.append(p_date)
                    for i in range(1, self.max_days + 1):
                        # symbol = -1 addresses the day i days AFTER p_date (negative label),
                        # symbol = 1 the day i days BEFORE it (positive label).
                        for symbol in [-1, 1]:
                            f_date = p_date - datetime.timedelta(i) * symbol
                            try:
                                p_data = self.activity.loc[(p_id, f_date), self.select_sensors].to_numpy()
                                if normalise:
                                    p_data = normalized(np.array(p_data)).reshape(3, 8, -1)
                                act_data.append(p_data)
                                phy_data.append(self.get_data(self.physiological, p_id, f_date, 'physiological'))
                                env_data.append(self.get_data(self.environmental, p_id, f_date, 'environmental'))
                                labels.append(self.laplace_smooth(i) * symbol)
                                patient.append(p_id)
                            except KeyError:
                                # NOTE(review): a missing later day also skips the earlier day of the
                                # same offset; confirm `break` (rather than `continue`) is intended.
                                break
                    activity_data.append(act_data)
                    uti_labels.append(labels)
                    patient_ids.append(patient)
                    physiological_data.append(phy_data)
                    environmental_data.append(env_data)

        # NOTE(review): the per-date lists differ in length whenever a neighbouring day is missing;
        # recent NumPy releases reject such ragged input unless dtype=object is given -- confirm.
        activity_data = np.array(activity_data)
        uti_labels = np.array(uti_labels)
        patient_ids = np.array(patient_ids)
        physiological_data = np.array(physiological_data)
        environmental_data = np.array(environmental_data)
        labelled_dates = np.array(labelled_dates)

        return activity_data, physiological_data, environmental_data, patient_ids, uti_labels, labelled_dates

    def get_unlabelled_data(self, normalise=False, date='2021-03-01'):
        '''
        Get the unlabelled data,
        Parameters
        ----------
        normalise: bool, normalise the data or not
        date: str (or any value accepted by ```pd.to_datetime```), only return the data
            later than the date provided. By default, it will not return the tihm unlabelled
            data. Use None to return every date.

        Returns activity, physiological, environmental data, patient ids, None, dates
        -------
        The fifth value is always None.

        '''
        df = self.activity.reset_index().set_index(['id', 'Date'])
        if date is not None:
            # 'Date' is built from datetime.date objects, which cannot be ordered against a str.
            # Converting both sides to pandas datetimes makes the comparison well defined.
            day_level = pd.to_datetime(df.index.get_level_values(1))
            df = df[day_level > pd.to_datetime(date)]

        # get p ids
        p_ids = df.index.get_level_values(0).unique()
        outputs = []
        phy_data, env_data = [], []
        outputs_p_ids = []
        outputs_dates = []
        for idx in range(len(p_ids)):
            p_id = p_ids[idx]
            # get data of patient
            data = df.loc[p_id]
            # `day` (not `date`) so the threshold parameter is not shadowed.
            for day in data.index.get_level_values(0).unique():
                p_data = data.loc[day, self.select_sensors].to_numpy()
                if normalise:
                    p_data = normalized(np.array(p_data)).reshape(3, 8, -1)
                outputs.append(p_data)
                phy_data.append(self.get_data(self.physiological, p_id, day, 'physiological'))
                env_data.append(self.get_data(self.environmental, p_id, day, 'environmental'))
                outputs_p_ids.append(p_id)
                outputs_dates.append(day)
        return np.array(outputs), np.array(phy_data), np.array(env_data), \
               np.array(outputs_p_ids), None, np.array(outputs_dates)

    @staticmethod
    def laplace_smooth(i, lam=3, denominator=1):
        """Return ```exp(-|i| / lam) / denominator```."""
        return np.exp(- np.abs(i) / lam) / denominator

    @staticmethod
    def get_data(df, p_id, date, datatype):
        """
        Look up the values of the configured ```datatype``` sensors for one id and date.

        Returns None when ```df``` is None, the 'value' column ordered by the configured
        sensor order when the lookup succeeds, and a list of zeros (one per configured
        sensor) when the lookup raises KeyError.
        """
        if df is None:
            return None
        try:
            return df.loc[(p_id, date, config[datatype]['sensors'])] \
                .sort_values('location', key=lambda x: x.map(config[datatype]['sort_dict']))['value'].to_numpy()
        except KeyError:
            return [0.] * len(config[datatype]['sensors'])

Static methods

def get_data(df, p_id, date, datatype)
Expand source code
@staticmethod
def get_data(df, p_id, date, datatype):
    if df is None:
        return
    try:
        return df.loc[(p_id, date, config[datatype]['sensors'])] \
            .sort_values('location', key=lambda x: x.map(config[datatype]['sort_dict']))['value'].to_numpy()
    except KeyError:
        return [0.] * len(config[datatype]['sensors'])
def laplace_smooth(i, lam=3, denominator=1)
Expand source code
@staticmethod
def laplace_smooth(i, lam=3, denominator=1):
    return np.exp(- np.abs(i) / lam) / denominator

Instance variables

var labelled_data
Expand source code
@property
@load_save(**config['labelled_data']['save'])
def labelled_data(self):
    """
    Labelled data as a dictionary.

    Calls ``get_labelled_data(normalise=False)`` and stores its six outputs
    under the keys 'activity', 'phy', 'env', 'p_ids', 'uti_labels' and 'dates'.
    The ``load_save`` decorator is configured from config['labelled_data']['save'];
    presumably it loads/saves the result on disk -- verify in minder_utils.util.
    """
    activity, phy, env, p_ids, labels, dates = self.get_labelled_data(normalise=False)
    return dict(
        activity=activity,
        phy=phy,
        env=env,
        p_ids=p_ids,
        uti_labels=labels,
        dates=dates,
    )
var unlabelled_data
Expand source code
@property
@load_save(**config['unlabelled_data']['save'])
def unlabelled_data(self):
    """
    Unlabelled data as a dictionary.

    Calls ``get_unlabelled_data(normalise=False)`` and stores its outputs under
    the keys 'activity', 'phy', 'env', 'p_ids' and 'dates'.
    The ``load_save`` decorator is configured from config['unlabelled_data']['save'];
    presumably it loads/saves the result on disk -- verify in minder_utils.util.
    """
    # get_unlabelled_data returns SIX values (the fifth is a None placeholder);
    # unpacking into five names raised "too many values to unpack".
    activity_data, physiological_data, environmental_data, patient_ids, _, dates = \
        self.get_unlabelled_data(normalise=False)
    return {
        'activity': activity_data,
        'phy': physiological_data,
        'env': environmental_data,
        'p_ids': patient_ids,
        'dates': dates
    }

Methods

def get_labelled_data(self, normalise=False)
Expand source code
def get_labelled_data(self, normalise=False):
    """
    Build one window of data around every labelled (id, date).

    For each labelled date the window starts with the data of that date and is
    followed by the data of up to ``max_days`` days on either side, taken from
    ``self.activity``. The label of the labelled date is ``int(valid)`` for a
    truthy ``valid`` and -1 otherwise; a neighbouring day at offset ``i`` gets
    ``laplace_smooth(i)``, negated for days after the labelled date.

    Parameters
    ----------
    normalise: bool, normalise each day of activity data and reshape it to (3, 8, -1)

    Returns activity, physiological, environmental data, patient ids, labels, labelled dates
    -------
    """
    labelled = self.labelled_df
    participants = labelled.index.get_level_values(0).unique()

    all_activity, all_labels, all_patients = [], [], []
    all_phy, all_env, anchor_dates = [], [], []

    for pos in range(len(participants)):
        participant = participants[pos]
        participant_df = labelled.loc[participant]
        for valid in participant_df.index.get_level_values(0).unique():
            for anchor in participant_df.loc[valid].index.get_level_values(0).unique():
                win_act, win_labels, win_ids, win_phy, win_env = [], [], [], [], []

                # The labelled day itself always opens the window.
                readings = participant_df.loc[(valid, anchor), self.select_sensors].to_numpy()
                if normalise:
                    readings = normalized(np.array(readings)).reshape(3, 8, -1)
                win_act.append(readings)
                win_phy.append(self.get_data(self.physiological, participant, anchor, 'physiological'))
                win_env.append(self.get_data(self.environmental, participant, anchor, 'environmental'))
                win_labels.append(int(valid) if valid else -1)
                win_ids.append(participant)
                anchor_dates.append(anchor)

                for offset in range(1, self.max_days + 1):
                    # sign = -1 -> the day `offset` days after the anchor, sign = 1 -> before it.
                    for sign in [-1, 1]:
                        neighbour = anchor - datetime.timedelta(offset) * sign
                        try:
                            readings = self.activity.loc[(participant, neighbour), self.select_sensors].to_numpy()
                            if normalise:
                                readings = normalized(np.array(readings)).reshape(3, 8, -1)
                            win_act.append(readings)
                            win_phy.append(self.get_data(self.physiological, participant, neighbour, 'physiological'))
                            win_env.append(self.get_data(self.environmental, participant, neighbour, 'environmental'))
                            win_labels.append(self.laplace_smooth(offset) * sign)
                            win_ids.append(participant)
                        except KeyError:
                            # NOTE(review): leaves the sign loop, so a missing later day also
                            # skips the earlier day of the same offset -- confirm this is intended.
                            break

                all_activity.append(win_act)
                all_labels.append(win_labels)
                all_patients.append(win_ids)
                all_phy.append(win_phy)
                all_env.append(win_env)

    return (
        np.array(all_activity),
        np.array(all_phy),
        np.array(all_env),
        np.array(all_patients),
        np.array(all_labels),
        np.array(anchor_dates),
    )
def get_unlabelled_data(self, normalise=False, date='2021-03-01')

Get the unlabelled data.

Parameters

normalise : bool, normalise the data or not

date : str, only return the data later than the date provided. By default,
it will not return the tihm unlabelled data

Returns

activity, physiological, environmental data, patient ids, None, dates

Expand source code
def get_unlabelled_data(self, normalise=False, date='2021-03-01'):
    '''
    Get the unlabelled data,
    Parameters
    ----------
    normalise: bool, normalise the data or not
    date: str (or any value accepted by ``pd.to_datetime``), only return the data
        later than the date provided. By default, it will not return the tihm
        unlabelled data. Use None to return every date.

    Returns activity, physiological, environmental data, patient ids, None, dates
    -------
    The fifth value is always None.

    '''
    df = self.activity.reset_index().set_index(['id', 'Date'])
    if date is not None:
        # The 'Date' level may hold datetime.date objects, which cannot be ordered
        # against a str threshold. Converting both sides to pandas datetimes keeps
        # the filter working for str, date and Timestamp thresholds alike.
        day_level = pd.to_datetime(df.index.get_level_values(1))
        df = df[day_level > pd.to_datetime(date)]

    # get p ids
    p_ids = df.index.get_level_values(0).unique()
    outputs = []
    phy_data, env_data = [], []
    outputs_p_ids = []
    outputs_dates = []
    for idx in range(len(p_ids)):
        p_id = p_ids[idx]
        # get data of patient
        data = df.loc[p_id]
        # `day` (not `date`) so the threshold parameter is not shadowed.
        for day in data.index.get_level_values(0).unique():
            p_data = data.loc[day, self.select_sensors].to_numpy()
            if normalise:
                p_data = normalized(np.array(p_data)).reshape(3, 8, -1)
            outputs.append(p_data)
            phy_data.append(self.get_data(self.physiological, p_id, day, 'physiological'))
            env_data.append(self.get_data(self.environmental, p_id, day, 'environmental'))
            outputs_p_ids.append(p_id)
            outputs_dates.append(day)
    return np.array(outputs), np.array(phy_data), np.array(env_data), \
           np.array(outputs_p_ids), None, np.array(outputs_dates)