subject plots

d9ec44fe · Raymond Chia · cf1ea305 · d9ec44fe · d9ec44fe · d9ec44fe
Commit d9ec44fe authored 6 months ago by Raymond Chia
--- a/S01cal.png
+++ b/S01cal.png
--- a/regress.sh
+++ b/regress.sh
+python regress_rr.py -s 2 -f tsfresh -d imu+bvp
+python regress_rr.py -s 3 -f tsfresh -d imu
+python regress_rr.py -s 3 -f tsfresh -d imu -ts 0
+# python regress_rr.py -s 4 -f tsfresh -d imu+bvp
+python regress_rr.py -s 5 -f tsfresh -d imu+bvp
+python regress_rr.py -s 6 -f tsfresh -d imu+bvp
+python regress_rr.py -s -1 -f tsfresh -d bvp
+python regress_rr.py -s -1 -f tsfresh -d imu
+python regress_rr.py -s -1 -f tsfresh -d imu+bvp -ts 0
+python regress_rr.py -s -1 -f tsfresh -d bvp -ts 0
+python regress_rr.py -s -1 -f tsfresh -d imu -ts 0
--- a/regress_rr.py
+++ b/regress_rr.py
@@ -31,6 +31,7 @@ from sklearn.metrics import accuracy_score
 from tsfresh.feature_extraction import extract_features
 from tsfresh.feature_extraction import settings as tsfresh_settings
+from tsfresh.feature_selection import relevance
 from tsfresh.utilities.string_manipulation import get_config_from_string
 from modules.datapipeline import get_file_list, load_and_snip, load_data, \
@@ -295,7 +296,7 @@ def load_e4_file(e4_file:str):
    dfs = {csv_file.filename: pd.read_csv(zip_file.open(csv_file.filename)
                                          ,header=None)
           for csv_file in zip_file.infolist()
-           if csv_file.filename.endswith('.csv')}
+           if csv_file.filename.endswith('.csv') and csv_file.file_size>0}
    bvp = dfs["BVP.csv"]
    # First row is the initial time of the session as unix time.
    # Second row is the sample rate in Hz
@@ -303,8 +304,10 @@ def load_e4_file(e4_file:str):
    fs = bvp.iloc[1].values[0]
    nsamples = len(bvp) - 2
-    t0_datetime = datetime.utcfromtimestamp(t0)
+    # t0_datetime = datetime.utcfromtimestamp(t0)
-    t0_local = datetime_from_utc_to_local(t0_datetime)
+    # t0_local = datetime_from_utc_to_local(t0_datetime)
+    t0_datetime = datetime.fromtimestamp(t0, tz=timezone.utc)
+    t0_local = t0_datetime.astimezone(pytz.timezone('Australia/Sydney'))
    time = [t0_local.timestamp() + ind*(1/fs) for ind in
            range(nsamples)]
    tmp = [np.nan, np.nan]
@@ -341,7 +344,7 @@ def load_e4_files(f_list:list):
    for d, h in tmp:
        data.append(d)
        hdr.append(h)
-    data_df = pd.concat(data, axis=0)
+    data_df = pd.concat(data, axis=0).sort_values(by='sec')
    return data_df, hdr
 # Synchronising data
@@ -416,7 +419,7 @@ def df_win_task(w_inds, df, i, cols):
    for col in cols:
        data = w_df[col].values
        # DSP
-        if sum(np.abs(data)) > 0:
+        if sum(np.abs(data)) > 0 and np.std(data) > 0:
            sd_data = (data - np.mean(data, axis=0))/np.std(data, axis=0)
        else:
            sd_data = data.copy()
@@ -562,7 +565,7 @@ def load_and_sync_xsens(subject, sens_list:list=['imu', 'bvp']):
        if 'imu' in sens_list and 'bvp' in sens_list:
            br_df, imu_df = sync_to_ref(br_df, imu_df_all.copy())
            pss_df, _ = sync_to_ref(pss_df, imu_df_all.copy())
-            bvp_df, _ = sync_to_ref(bvp_df_all.copy(), pss_df.copy())
+            bvp_df, _ = sync_to_ref(bvp_df_all.copy(), imu_df_all.copy())
        elif 'imu' in sens_list and not 'bvp' in sens_list:
            br_df, imu_df = sync_to_ref(br_df, imu_df_all.copy())
            pss_df, _ = sync_to_ref(pss_df, imu_df_all.copy())
@@ -615,6 +618,7 @@ def load_and_sync_xsens(subject, sens_list:list=['imu', 'bvp']):
    if len(xsens_list) > 1:
        xsens_df = pd.concat(xsens_list, axis=0, ignore_index=True)
+        xsens_df.sort_values(by='sec', inplace=True)
        xsens_df.reset_index(drop=True, inplace=True)
    else:
        xsens_df = xsens_list[0]
@@ -623,7 +627,7 @@ def load_and_sync_xsens(subject, sens_list:list=['imu', 'bvp']):
 def load_tsfresh(xsens_df, home_dir,
                 window_size=12, window_shift=0.2, fs=IMU_FS,
-                 overwrite=False, data_cols=None, prefix=None):
+                 overwrite=False, data_cols=None, prefix=None, **kwargs):
    """
    Loads the tsfresh pickle file, or generates if it does not exist for the
    given configuration
@@ -654,7 +658,6 @@ def load_tsfresh(xsens_df, home_dir,
    if exists(pkl_file) and not overwrite:
        return pd.read_pickle(pkl_file)
    assert 'acc_x' in xsens_df.columns.tolist() and \
            'gyro_x' in xsens_df.columns.tolist() and \
            'bvp' in xsens_df.columns.tolist(), \
@@ -671,6 +674,7 @@ def load_tsfresh(xsens_df, home_dir,
        x_df, column_sort='sec',
        column_id='id',
        # default_fc_parameters=tsfresh_settings.MinimalFCParameters(),
+        **kwargs
    )
    x_features_df.fillna(0, inplace=True)
    x_features_df.reset_index(drop=True, inplace=True)
@@ -755,7 +759,7 @@ def get_cal_data(event_df, xsens_df):
            start_sec = sec
            continue
        elif event == 'Stop':
-            stop_sec = sec
+            stop_sec = min(sec, xsens_df['sec'].values[-1])
            dsync = DataSynchronizer()
            dsync.set_bounds(xsens_df['sec'].values, start_sec, stop_sec)
@@ -967,7 +971,7 @@ class EvalHandler():
            if len(index_list) == 0:
                print("adding new entry")
                eval_hist = eval_hist._append(self.entry, ignore_index=True)
-            elif index_list is not None and self.overwrite:
+            elif index_list is not None:
                eval_hist.loc[index_list[0]] = self.entry
        self.eval_hist = eval_hist
@@ -1184,6 +1188,7 @@ def sens_rr_model(subject,
                  train_len:int=3,
                  test_standing=False,
                  data_input:str='imu+bvp',
+                  ntop_features:int=5
                 ):
    """Loads, preprocesses, and trains a select model using the configured
    settings.
@@ -1281,17 +1286,10 @@ def sens_rr_model(subject,
    test_df_tmp = get_test_data(cal_df, activity_df, xsens_df, test_standing)
    test_df = pd.concat([df for df in test_df_tmp['data']], axis=0)
+    y_cols = ['sec', 'br', 'pss', 'cpm']
    if use_tsfresh:
        cal_df_list = []
-        test_df = load_tsfresh(test_df,
+        top_cpm_features = {}
-                               pfh.home_directory,
-                               window_size=window_size,
-                               window_shift=window_shift,
-                               fs=fs,
-                               overwrite=overwrite_tsfresh,
-                               data_cols=data_cols,
-                               prefix='test',
-                              )
        for index, row in cal_df.iterrows():
            data = load_tsfresh(row['data'],
                                pfh.home_directory,
@@ -1302,9 +1300,47 @@ def sens_rr_model(subject,
                                data_cols=data_cols,
                                prefix=f"calcpm_{row['cpm']}"
                               )
+            m_data_cols = data.columns.values[3:]
+            rel_df = relevance.calculate_relevance_table(
+                data[m_data_cols], data['pss'])
+            params = []
+            if 'imu' in sens_list:
+                tmp = [
+                    val for val in rel_df['feature'] if 'acc' in val or 
+                    'gyr' in val
+                ]
+                params += tmp[:ntop_features]
+            if 'bvp' in sens_list:
+                tmp = [val for val in rel_df['feature'] if 'bvp' in val]
+                params += tmp[:ntop_features]
+            assert params is not None, "Invalid selection of parameters"
+            top_cpm_features[row['cpm']] = params
            cal_df_list.append({'cpm': row['cpm'], 'data': data})
+        param_df = pd.DataFrame.from_dict(top_cpm_features, orient='index')
+        top_cpm_params = param_df.columns.values
+        arr = param_df[top_cpm_params].values.flatten()
+        hist_df = pd.DataFrame.from_dict(Counter(arr), orient='index')
+        feature_strings = get_parameters_from_feature_string(
+            hist_df.index.values.tolist())
        cal_df = pd.DataFrame(cal_df_list)
+        # Get the relevance for tables cal and extract top 20
+        test_df = load_tsfresh(test_df,
+                               pfh.home_directory,
+                               window_size=window_size,
+                               window_shift=window_shift,
+                               fs=fs,
+                               overwrite=overwrite_tsfresh,
+                               data_cols=data_cols,
+                               prefix='test',
+                               kind_to_fc_parameters=feature_strings
+                              )
+        chosen_cols = [col for col in test_df.columns if col in
+                       hist_df.index.tolist()+y_cols]
+        test_df = test_df[chosen_cols]
    else:
        x_test_df, y_test_df = get_df_windows(
            test_df, df_win_task, window_size=window_size,
@@ -1374,12 +1410,12 @@ def sens_rr_model(subject,
        elif use_tsfresh:
            y_cols = ['sec', 'br', 'pss', 'cpm']
-            x_cols = [col for col in train_df.columns.values if col not in y_cols]
+            x_cols = [col for col in test_df.columns.values if col not in y_cols]
-            x_train = train_df[x_cols].values
-            y_train = train_df['cpm'].values.reshape(-1, 1)
            x_test  = test_df[x_cols].values
            y_test  = test_df[lbl_str].values.reshape(-1, 1)
            y_test_df = test_df[y_cols[:-1]]
+            x_train = train_df[x_cols].values
+            y_train = train_df['cpm'].values.reshape(-1, 1)
        else:
            x_train_df, y_train_df = get_df_windows(train_df,
                                                    df_win_task,
@@ -1555,10 +1591,10 @@ def arg_parser():
                        default=0,
                       )
    parser.add_argument('--win_size', type=int,
-                        default=12,
+                        default=20,
                       )
    parser.add_argument('--win_shift', type=float,
-                        default=0.2,
+                        default=0.05,
                       )
    parser.add_argument('-l', '--lbl_str', type=str,
                        default='pss',
@@ -1664,6 +1700,9 @@ if __name__ == '__main__':
    elif subject <= 0 and method == 'ml':
        subjects = [subject_pre_string+str(i).zfill(2) for i in \
                    range(1, N_SUBJECT_MAX+1)]
+        if 'bvp' in data_input:
+            # Missing bvp for S03, S04
+            subjects = [sbf for sbj in subjects if sbj in ['S03']]
        rr_func = partial(sens_rr_model,
                          window_size=window_size,

--- a/single_subject_cal_plot.py
+++ b/single_subject_cal_plot.py
+from datetime import datetime
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from regress_rr import load_tsfresh
+from os.path import join
+from os import makedirs
+import ipdb
+import joblib
+import pandas as pd
+from config import *
+from regress_rr import get_activity_log
+from modules.evaluations import Evaluation
+from modules.digitalsignalprocessing import (
+    movingaverage, butter_lowpass_filter)
+from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
+plt.close('all')
+plt.rcParams.update({'figure.titlesize'              : 8,
+                     'axes.titlesize'                : 7,
+                     'axes.labelsize'                : 7,
+                     'xtick.labelsize'               : 6,
+                     'ytick.labelsize'               : 6,
+                     'legend.fontsize'               : 6,
+                     'legend.title_fontsize'         : 6,
+                    })
+window_size = 20
+window_shift = 0.05
+# get the test data for the subject
+# load imu, bvp, and imu+bvp data for the subject
+# get the lin reg data for the subject
+# plot the pss vs the linear regression data for the subject
+# highlight standing periods
+sbj = 'S01'
+sens_list = ['imu', 'bvp', 'imu-bvp']
+# cfg_ids = [4, 0, 0]
+cfg_ids = [5, 11, 4]
+lbl_str = 'pss'
+mdl_dir = join(DATA_DIR, 'subject_specific', sbj)
+mdl_file = 'linr_model.joblib'
+tsfresh_dir = join(
+    mdl_dir,
+    'tsfresh__winsize_{0}__winshift_{1}'.format(window_size, window_shift)
+)
+combi_strs = ['combi7.0-10.0-12.0-15.0-17.0']*3
+def get_model(sens, sbj, cfg_id, combi):
+    m_dir = f'linreg_{sens}_rr_{sbj}_id{cfg_id}_{combi}'
+    mdl_fname = join(mdl_dir, sens+'_rr', str(cfg_id).zfill(2), m_dir, mdl_file)
+    return joblib.load(mdl_fname)
+def get_test_data():
+    test_fname = 'test__winsize_{0}__winshift_{1}__tsfresh.pkl'.format(
+        window_size, window_shift)
+    test_df = pd.read_pickle(join(tsfresh_dir, test_fname))
+    y_cols = ['sec', 'br', 'pss', 'cpm']
+    x_cols = [col for col in test_df.columns.values if col not in y_cols]
+    x_test  = test_df[x_cols]
+    y_test  = test_df[lbl_str].values.reshape(-1, 1)
+    time = test_df['sec']
+    return x_test, y_test, time
+if __name__ == '__main__':
+    activity_log = get_activity_log(sbj)
+    standing_mask = activity_log['Activity'] == 'standing'
+    standing_log = activity_log[standing_mask]
+    x_test, y_test, time = get_test_data()
+    y_out = {}
+    metrics_dict = {}
+    x_test_cols = x_test.columns
+    y_test = movingaverage(y_test, 8)
+    cm = 1/2.54
+    fig, axs = plt.subplots(figsize=(14*cm, 7.5*cm), dpi=300)
+    axs.plot(y_test, label='ground-truth', linewidth=1, c='k');
+    standing_wins = 10*60 
+    for sens, cfg_id, combi_str in zip(sens_list, cfg_ids, combi_strs):
+        model = get_model(sens, sbj, cfg_id, combi_str)
+        if sens == 'imu':
+            cols = [col for col in x_test_cols if 'acc' in col or 'gyr' in col]
+        elif sens == 'bvp':
+            cols = [col for col in x_test_cols if 'bvp' in col]
+        else:
+            cols = x_test.columns
+        poly = PolynomialFeatures(1)
+        x_test_sens = x_test[cols]
+        x_test_sens = poly.fit_transform(x_test_sens)
+        y_pred = model.predict(x_test_sens)
+        y_pred = movingaverage(y_pred, 64)
+        evals = Evaluation(y_test.flatten(), y_pred.flatten())
+        metrics_dict[sens]=evals.get_evals()
+        axs.plot(y_pred, label=sens.upper(), linewidth=1)
+        # axs.plot(np.abs(y_test - y_pred), label=sens.upper())
+    y0 = axs.get_ylim()[-1]
+    # y1 = axs[1].get_ylim()[-1]
+    fmt = "%d/%m/%Y %H:%M:%S"
+    x = np.arange(len(y_test))
+    for start_, end_ in zip(
+        standing_log.iloc[::2].iterrows(), standing_log.iloc[1::2].iterrows()
+    ):
+        start = start_[1]
+        end = end_[1]
+        start_sec = datetime.strptime(start['Timestamps'], fmt).timestamp()
+        end_sec = datetime.strptime(end['Timestamps'], fmt).timestamp()
+        mask = (time >= start_sec) & (time <= end_sec)
+        axs.fill_between(x, y0, where=mask, facecolor='k', alpha=0.2)
+        # axs[1].fill_between(x, y1, where=mask, facecolor='k', alpha=0.2)
+    diff = np.diff(time)
+    diff = np.insert(diff, 0, 0)
+    new_day_idx = diff.argmax()
+    axs.axvline(new_day_idx, c='r', alpha=0.6, linewidth=1)
+    # axs[1].axvline(new_day_idx, c='r')
+    print(metrics_dict)
+    axs.legend(ncols=4, prop={'size': 6})
+    # fig.suptitle("RR Predictions during Rest Standing and Sitting")
+    axs.set_title("Multi-Modal RR Predictions during Rest Standing and Sitting")
+    # axs[1].set_title("Absolute Error over Time")
+    axs.set_xlabel("Time (indices)")
+    axs.set_ylabel("Respiration Rate (CPM)")
+    # axs[1].set_ylabel("Abs. Error")
+    fig.savefig('S01cal.png')