Module minder_utils.formatting.standardisation
Expand source code
import pandas as pd
import numpy as np
import os
import importlib.resources as pkg_resources
from importlib.machinery import SourceFileLoader
from minder_utils.configurations import data_path, config
def standardise_activity_data(df):
    df = df.drop_duplicates()
    df.time = pd.to_datetime(df.time)
    df.time = pd.to_datetime(df.time.dt.strftime("%Y-%m-%d %H:%M:%S"))

    # Add a 00:00:00 and a 23:00:00 border row for every (id, date, location)
    # so that the hourly resample below always covers the full day.
    df_start = df[['id', 'time', 'location']].drop_duplicates()
    df_end = df_start.copy()
    df_start['hour'] = '00:00:00'
    df_end['hour'] = '23:00:00'
    df_borders = pd.concat([df_start, df_end])
    df_borders['time'] = pd.to_datetime(df_borders.time.dt.strftime('%Y-%m-%d')
                                        + ' ' + df_borders.hour)
    df_borders.drop('hour', inplace=True, axis=1)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, df_borders], sort=False, ignore_index=True) \
        .drop_duplicates(subset=['id', 'time', 'location'])

    # Hourly sums of 'value' per participant and location, pivoted to one
    # column per location.
    df = df.fillna(0).groupby(['id', 'location']).apply(lambda x: x.set_index('time')
                                                        .resample('H').sum()).reset_index()
    table_df = df.pivot_table(index=['id', 'time'], columns='location',
                              values='value').reset_index()
    table_df = table_df.replace(np.nan, 0)

    # Make sure every configured sensor appears as a column, even if it never fired.
    for sensor in config['activity']['sensors']:
        if sensor not in table_df.columns:
            table_df[sensor] = 0

    # with open(path_dir, 'r') as file_read:
    #     uti_folder_path = file_read.read()
    # patient_data = pd.read_csv(uti_folder_path + 'UTIs-TP-TN.csv')
    # patient_data = patient_data[['subject', 'datetimeCreateddf', 'valid']].dropna().drop_duplicates()
    # patient_data.columns = ['id', 'time', 'valid']
    # patient_data.id = map_raw_ids(patient_data.id, True)
    # p_data = []
    # d = validated_date()
    # for idx, p_id in enumerate(d.keys()):
    #     for data in d[p_id]:
    #         p_data.append([p_id, data[0], data[1]])
    # p_data = pd.DataFrame(p_data, columns=['id', 'time', 'valid'])
    #
    # patient_data = pd.concat([patient_data, p_data])
    #
    # patient_data.time = pd.to_datetime(pd.to_datetime(patient_data.time).dt.date)
    # patient_data['time'] = patient_data.time.dt.strftime('%Y-%m-%d') + patient_data['id'].astype(str)
    # table_df.id = map_raw_ids(table_df.id, True)
    # table_df['valid'] = table_df.time.dt.strftime('%Y-%m-%d') + table_df['id'].astype(str)
    # table_df['valid'] = table_df['valid'].map(
    #     patient_data.loc[:, ['valid', 'time']].set_index('time')['valid'].to_dict())

    table_df = table_df.dropna()
    return table_df

def standardise_physiological_environmental(df, date_range, shared_id=None):
    if shared_id is not None:
        df = df[df.id.isin(shared_id)]
    df.time = pd.to_datetime(df.time)
    df = df.groupby(['id', 'location']).apply(lambda x: x.set_index('time')
                                              .resample('D').mean()).reset_index().fillna(0)
    df.time = pd.to_datetime(df.time).dt.date
    idx = pd.MultiIndex.from_product((df.id.unique(), date_range, df.location.unique()),
                                     names=['id', 'time', 'location'])
    return df.set_index(['id', 'time', 'location']).reindex(idx, fill_value=0) \
        .reset_index().pivot_table(index=['id', 'time', 'location'], values='value')

def normalized(a, axis=-1, order=2):
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)
Functions
def normalized(a, axis=-1, order=2)
Normalise the array a along the given axis using the vector norm of the given order (L2 by default). Slices whose norm is zero are returned unchanged.
Expand source code
def normalized(a, axis=-1, order=2):
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)
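A minimal usage sketch (the array values below are illustrative only): each row is scaled to unit L2 length, and an all-zero row passes through unchanged because its norm is treated as 1.

import numpy as np
from minder_utils.formatting.standardisation import normalized

a = np.array([[3.0, 4.0, 0.0],
              [0.0, 0.0, 0.0]])
print(normalized(a))
# [[0.6 0.8 0. ]
#  [0.  0.  0. ]]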
def standardise_activity_data(df)
Standardise raw activity data: deduplicate the records, pad each day with 00:00:00 and 23:00:00 border rows per id and location, resample the values to hourly sums, pivot the locations into columns, and add a zero-filled column for every sensor listed in config['activity']['sensors'] that is missing from the data.
Expand source code
def standardise_activity_data(df):
    df = df.drop_duplicates()
    df.time = pd.to_datetime(df.time)
    df.time = pd.to_datetime(df.time.dt.strftime("%Y-%m-%d %H:%M:%S"))

    # Add a 00:00:00 and a 23:00:00 border row for every (id, date, location)
    # so that the hourly resample below always covers the full day.
    df_start = df[['id', 'time', 'location']].drop_duplicates()
    df_end = df_start.copy()
    df_start['hour'] = '00:00:00'
    df_end['hour'] = '23:00:00'
    df_borders = pd.concat([df_start, df_end])
    df_borders['time'] = pd.to_datetime(df_borders.time.dt.strftime('%Y-%m-%d')
                                        + ' ' + df_borders.hour)
    df_borders.drop('hour', inplace=True, axis=1)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, df_borders], sort=False, ignore_index=True) \
        .drop_duplicates(subset=['id', 'time', 'location'])

    # Hourly sums of 'value' per participant and location, pivoted to one
    # column per location.
    df = df.fillna(0).groupby(['id', 'location']).apply(lambda x: x.set_index('time')
                                                        .resample('H').sum()).reset_index()
    table_df = df.pivot_table(index=['id', 'time'], columns='location',
                              values='value').reset_index()
    table_df = table_df.replace(np.nan, 0)

    # Make sure every configured sensor appears as a column, even if it never fired.
    for sensor in config['activity']['sensors']:
        if sensor not in table_df.columns:
            table_df[sensor] = 0

    # with open(path_dir, 'r') as file_read:
    #     uti_folder_path = file_read.read()
    # patient_data = pd.read_csv(uti_folder_path + 'UTIs-TP-TN.csv')
    # patient_data = patient_data[['subject', 'datetimeCreateddf', 'valid']].dropna().drop_duplicates()
    # patient_data.columns = ['id', 'time', 'valid']
    # patient_data.id = map_raw_ids(patient_data.id, True)
    # p_data = []
    # d = validated_date()
    # for idx, p_id in enumerate(d.keys()):
    #     for data in d[p_id]:
    #         p_data.append([p_id, data[0], data[1]])
    # p_data = pd.DataFrame(p_data, columns=['id', 'time', 'valid'])
    #
    # patient_data = pd.concat([patient_data, p_data])
    #
    # patient_data.time = pd.to_datetime(pd.to_datetime(patient_data.time).dt.date)
    # patient_data['time'] = patient_data.time.dt.strftime('%Y-%m-%d') + patient_data['id'].astype(str)
    # table_df.id = map_raw_ids(table_df.id, True)
    # table_df['valid'] = table_df.time.dt.strftime('%Y-%m-%d') + table_df['id'].astype(str)
    # table_df['valid'] = table_df['valid'].map(
    #     patient_data.loc[:, ['valid', 'time']].set_index('time')['valid'].to_dict())

    table_df = table_df.dropna()
    return table_df
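A rough usage sketch, assuming minder_utils is installed and its activity sensor configuration is loadable. The frame uses the id/time/location/value columns the function expects, but the participant IDs, locations and timestamps are made up; the columns of the result depend on config['activity']['sensors'].

import pandas as pd
from minder_utils.formatting.standardisation import standardise_activity_data

raw = pd.DataFrame({
    'id': ['p1', 'p1', 'p2'],
    'time': ['2021-01-01 08:12:30', '2021-01-01 09:45:10', '2021-01-01 20:05:00'],
    'location': ['kitchen', 'lounge', 'bathroom'],
    'value': [1, 1, 1],
})
hourly = standardise_activity_data(raw)
# One row per (id, hour), one column per location/configured sensor,
# zero-filled where no events were recorded.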
def standardise_physiological_environmental(df, date_range, shared_id=None)
Standardise physiological and environmental data: optionally keep only the ids in shared_id, resample the readings to daily means per id and location, then reindex onto the full id × date_range × location grid, filling missing days with 0. Returns a frame indexed by (id, time, location) with a single value column.
Expand source code
def standardise_physiological_environmental(df, date_range, shared_id=None):
    if shared_id is not None:
        df = df[df.id.isin(shared_id)]
    df.time = pd.to_datetime(df.time)
    df = df.groupby(['id', 'location']).apply(lambda x: x.set_index('time')
                                              .resample('D').mean()).reset_index().fillna(0)
    df.time = pd.to_datetime(df.time).dt.date
    idx = pd.MultiIndex.from_product((df.id.unique(), date_range, df.location.unique()),
                                     names=['id', 'time', 'location'])
    return df.set_index(['id', 'time', 'location']).reindex(idx, fill_value=0) \
        .reset_index().pivot_table(index=['id', 'time', 'location'], values='value')
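A rough usage sketch with made-up readings. The date_range is passed as plain dates so it lines up with the .dt.date conversion the function applies before reindexing.

import pandas as pd
from minder_utils.formatting.standardisation import standardise_physiological_environmental

raw = pd.DataFrame({
    'id': ['p1', 'p1', 'p2'],
    'time': ['2021-01-01 08:00:00', '2021-01-01 20:00:00', '2021-01-02 09:30:00'],
    'location': ['body_temperature', 'body_temperature', 'heart_rate'],
    'value': [36.5, 36.9, 72.0],
})
date_range = pd.date_range('2021-01-01', '2021-01-03').date
daily = standardise_physiological_environmental(raw, date_range)
# DataFrame indexed by (id, time, location) holding the daily mean 'value',
# with 0 for days on which an id/location pair has no readings.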