import sys
from collections import Counter
from functools import partial
from itertools import repeat
from multiprocessing import Pool, cpu_count
from os.path import join, exists

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.model_selection import train_test_split

from modules.datapipeline import load_and_snip
from modules.digitalsignalprocessing import vectorized_slide_win as vsw
from models.ardregression import ARDRegressionClass
from models.knn import KNNClass
from models.svm import SVMClass
from models.lda import LDAClass
from models.svr import SVRClass
from models.logisticregression import LogisticRegressionClass
from models.linearregression import LinearRegressionClass
from models.neuralnet import FNN_HyperModel, LSTM_HyperModel, TunerClass,\
        CNN1D_HyperModel
from models.ridgeclass import RidgeClass
from models.elasticnet import ElasticNetClass
from models.resnet import Regressor_RESNET, Classifier_RESNET
from models.xgboostclass import XGBoostClass

from tsfresh.feature_selection import relevance as tsfresh_relevance
from tsfresh.utilities.string_manipulation import get_config_from_string

from sktime.transformations.panel.rocket import (
    MiniRocket,
    MiniRocketMultivariate,
    MiniRocketMultivariateVariable,
)

from config import WINDOW_SIZE, WINDOW_SHIFT, IMU_FS

# NOTE: get_windowed_data is assumed to be provided by a project module;
# it is used by get_win_conds and create_windows below.


def reshape_array(data):
    # Reshape windows from (n_windows, win_len, n_channels) to
    # (n_windows, n_channels, win_len). Note: this is a plain reshape,
    # not a transpose of the last two axes.
    shape = data.shape
    return data.reshape((shape[0], shape[2], shape[1]))


def get_win_conds(time, lbls, wins):
    '''Get the median condition label of each window.'''
    le = LabelEncoder()
    c_enc = le.fit_transform(lbls)
    c_enc_win = get_windowed_data(time, c_enc, wins)
    c_enc_win = np.median(c_enc_win, axis=-1)
    return le.inverse_transform(c_enc_win.astype(int))


# Perform sliding window operation
def create_windows(time, x, y, window_size=WINDOW_SIZE,
                   window_shift=WINDOW_SHIFT, fs=IMU_FS):
    inds = np.arange(0, len(time))
    wins = vsw(inds, len(inds), sub_window_size=window_size*fs,
               stride_size=window_shift*fs)

    x_win = get_windowed_data(time, x, wins)
    x_win = reshape_array(x_win)

    y_win = get_windowed_data(time, y, wins)
    # Take the median of each window as its label
    y_win = np.median(y_win, axis=-1)
    return x_win, y_win


# Choose the n most relevant feature parameters from the tsfresh library
def get_top_tsfresh_params(x_train_df, y_train_df,
                           lbl_str='br', ntop_features=5):
    x_train_df = x_train_df.fillna(0)
    rel_df = tsfresh_relevance.calculate_relevance_table(
        x_train_df, y_train_df[lbl_str])
    params = rel_df['feature'].iloc[:ntop_features].values
    return params


def get_data_cols(df):
    cols = df.columns.values
    # Assumes the first five columns hold metadata, with signal channels
    # from column 5 onwards
    data_cols = cols[5:]
    return data_cols


def get_label_cols(df):
    cols = df.columns.values
    br_str = [f for f in cols if f.lower() == 'br'][0]
    lbl_cols = [br_str, 'condition']
    return lbl_cols


def get_conditions_from_glob(glob_pattern):
    if glob_pattern == '[!M]*':
        conditions = ['R', 'L0', 'L1', 'L2', 'L3']
    elif glob_pattern == 'L*':
        conditions = ['L0', 'L1', 'L2', 'L3']
    else:
        sys.exit("Unmatched glob pattern")
    return conditions
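# Usage sketch for the windowing helpers above. The dataframe layout
# ('sec' time column, data columns, 'br' label) is an assumption drawn
# from how get_data_cols/get_label_cols are written, not a guaranteed API:
#
#   time = df['sec'].values                   # (n_samples,)
#   x = df[get_data_cols(df)].values          # (n_samples, n_channels)
#   y = df['br'].values                       # (n_samples,)
#   x_win, y_win = create_windows(time, x, y)
#   # x_win: (n_windows, n_channels, WINDOW_SIZE*IMU_FS)
#   # y_win: (n_windows,) median label per window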
# Returns intra-subject relevant features
def get_intra_feature_hist(df_list, lbl_str='br', ntop_features=5):
    df = df_list[0].copy()
    data_cols = get_data_cols(df)
    lbl_cols = get_label_cols(df)
    sbj_param_dict = {}
    for df in df_list:
        df.dropna(inplace=True)
        x = df[data_cols]
        y = df[lbl_cols]
        sbj = int(df['subject'].values[0])
        params = get_top_tsfresh_params(x, y, lbl_str=lbl_str,
                                        ntop_features=ntop_features)
        sbj_param_dict[sbj] = params
    sbj_param_df = pd.DataFrame.from_dict(sbj_param_dict, orient='index')
    cols = sbj_param_df.columns.values
    arr = sbj_param_df[cols].values.flatten()
    # Histogram of how often each feature is selected across subjects
    hist_df = pd.DataFrame.from_dict(Counter(arr), orient='index')
    return hist_df


# Returns inter-subject relevant features
def get_inter_feature_hist(df, lbl_str='br', ntop_features=5, nsbjs=30):
    data_cols = get_data_cols(df)
    lbl_cols = get_label_cols(df)
    df.dropna(inplace=True)
    sbj_param_dict = {}
    x = df[data_cols]
    y = df[lbl_cols]
    params = get_top_tsfresh_params(x, y, lbl_str=lbl_str,
                                    ntop_features=ntop_features)
    sbj_param_dict[0] = params
    sbj_param_df = pd.DataFrame.from_dict(sbj_param_dict, orient='index')
    cols = sbj_param_df.columns.values
    arr = sbj_param_df[cols].values.flatten()
    hist_df = pd.DataFrame.from_dict(Counter(arr), orient='index')
    return hist_df
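# Usage sketch (hypothetical df_list of per-subject dataframes): rank the
# tsfresh features most frequently selected across subjects. Column 0 of
# hist_df holds the selection counts produced by Counter above:
#
#   hist_df = get_intra_feature_hist(df_list, lbl_str='br', ntop_features=5)
#   top_features = hist_df.sort_values(by=0, ascending=False).index.tolist()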
# Perform generic model training
def model_training(mdl_str, x_train, y_train, marker,
                   validation_data=None, overwrite=False,
                   is_regression=False, project_directory=None,
                   window_size=50, extra_train=20, poly_deg=1):
    directory = join(project_directory, '_'.join([mdl_str, marker]))
    if validation_data is not None:
        x_val, y_val = validation_data[0], validation_data[1]
    # Non-Keras models do not take a validation set, so fold it back into
    # the training data
    if mdl_str not in ['fnn', 'lstm', 'cnn1d'] and validation_data is not None:
        x_train = np.concatenate((x_train, x_val), axis=0)
        y_train = np.concatenate((y_train, y_val), axis=0)

    if mdl_str == 'fnn':
        print("---FNN---")
        fnn_hypermodel = FNN_HyperModel()
        fnn_hypermodel.n_features = x_train.shape[-1]
        fnn_hypermodel.window_size = window_size
        fnn_hypermodel.batch_size = 32
        if is_regression:
            fnn_hypermodel.n_labels = 1
            fnn_hypermodel.loss_fn = tf.keras.losses.MeanAbsoluteError()
        else:
            fnn_hypermodel.n_labels = len(np.unique(y_train))
            fnn_hypermodel.loss_fn = \
                tf.keras.losses.SparseCategoricalCrossentropy()
        tuner = TunerClass(fnn_hypermodel, marker=marker,
                           tuner_type='bayesianoptimization',
                           overwrite=overwrite, directory=directory)
        if validation_data is None:
            tuner.search(x_train, y_train, None, validation_split=0.2)
        else:
            tuner.search(x_train, y_train, (x_val, y_val))

        if overwrite or not exists(tuner.best_model_path+'.index'):
            fnn_hypermodel.verbose = True
            callbacks = tuner.get_callbacks(epochs=extra_train)
            fnn_mdl = tuner.load_model(is_training=True)
            history = fnn_hypermodel.fit(
                None,
                fnn_mdl,
                x_train, y_train,
                validation_data=validation_data,
                epochs=extra_train,
                callbacks=callbacks,
            )
            tuner.save_weights_to_path()
        tuner.load_model(is_training=False)
        tuner.load_weights_from_path()
        fnn_mdl = tuner.tuner.hypermodel.model
        return None, fnn_mdl
    elif mdl_str == 'lstm':
        print("---LSTM---")
        lstm_hypermodel = LSTM_HyperModel()
        lstm_hypermodel.n_features = x_train.shape[-1]
        lstm_hypermodel.window_size = window_size
        lstm_hypermodel.batch_size = 32
        if is_regression:
            lstm_hypermodel.n_labels = 1
            lstm_hypermodel.loss_fn = tf.keras.losses.MeanAbsoluteError()
        else:
            lstm_hypermodel.n_labels = len(np.unique(y_train))
            lstm_hypermodel.loss_fn = \
                tf.keras.losses.SparseCategoricalCrossentropy()
            lstm_hypermodel.metrics = [
                tf.keras.metrics.SparseCategoricalAccuracy()]
        print("input shape: ", (lstm_hypermodel.window_size,
                                lstm_hypermodel.n_features))
        print("x shape: ", x_train.shape)
        tuner = TunerClass(lstm_hypermodel, marker=marker,
                           tuner_type='bayesianoptimization',
                           overwrite=overwrite, directory=directory)
        if validation_data is None:
            tuner.search(x_train, y_train, None, validation_split=0.2)
        else:
            tuner.search(x_train, y_train, (x_val, y_val))

        if overwrite or not exists(tuner.best_model_path+'.index'):
            lstm_mdl = tuner.load_model(is_training=True)
            lstm_hypermodel.verbose = True
            callbacks = tuner.get_callbacks(epochs=extra_train)
            history = lstm_hypermodel.fit(
                None,
                lstm_mdl,
                x_train, y_train,
                validation_data=validation_data,
                epochs=extra_train,
                callbacks=callbacks,
            )
            tuner.save_weights_to_path()
        tuner.load_model(is_training=False)
        tuner.load_weights_from_path()
        lstm_mdl = tuner.tuner.hypermodel.model
        return None, lstm_mdl
    elif mdl_str == 'cnn1d':
        print("---CNN1D---")
        n_features = x_train.shape[-1]
        hypermodel = CNN1D_HyperModel()
        hypermodel.n_features = n_features
        hypermodel.window_size = window_size
        hypermodel.input_shape = (window_size, n_features)
        hypermodel.batch_size = 32
        if is_regression:
            hypermodel.n_labels = 1
            hypermodel.loss_fn = tf.keras.losses.MeanAbsoluteError()
            hypermodel.metrics = None
        else:
            hypermodel.n_labels = len(np.unique(y_train))
        print("input shape: ", hypermodel.input_shape)
        print("x shape: ", x_train.shape)
        tuner = TunerClass(hypermodel, marker=marker,
                           tuner_type='bayesianoptimization',
                           overwrite=overwrite, directory=directory)
        if validation_data is None:
            tuner.search(x_train, y_train, validation_data,
                         validation_split=0.2)
        else:
            tuner.search(x_train, y_train, validation_data)

        if overwrite or not exists(tuner.best_model_path+'.index'):
            mdl = tuner.load_model(is_training=True)
            hypermodel.verbose = True
            callbacks = tuner.get_callbacks(epochs=extra_train)
            history = hypermodel.fit(
                None,
                mdl,
                x_train, y_train,
                validation_data=validation_data,
                epochs=extra_train,
                callbacks=callbacks,
            )
            tuner.save_weights_to_path()
        tuner.load_model(is_training=False)
        tuner.load_weights_from_path()
        mdl = tuner.tuner.hypermodel.model
        return None, mdl
    elif mdl_str == 'xgboost':
        mdl_cls = XGBoostClass(marker=marker, directory=directory)
        if is_regression:
            mdl_cls.mdl_type = 'regressor'
        else:
            mdl_cls.mdl_type = 'classifier'
    elif mdl_str == 'knn':
        print("---KNN---")
        mdl_cls = KNNClass(marker=marker, directory=directory)
        mdl_cls.is_regression = is_regression
        if not is_regression:
            mdl_cls.n_neighbors = len(np.unique(y_train))
    elif mdl_str == 'linreg':
        print("---LinearRegression---")
        # poly_deg: 2 if full condition set, 1 if M and R only
        poly = PolynomialFeatures(poly_deg)
        x_train_poly = poly.fit_transform(x_train)
        mdl_cls = LinearRegressionClass(marker=marker, directory=directory)
        if overwrite:
            mdl_cls.build()
            mdl_cls.model.fit(x_train_poly, y_train)
            mdl_cls.save_model()
        else:
            try:
                mdl_cls.load_model()
            except Exception:
                mdl_cls.build()
                mdl_cls.model.fit(x_train_poly, y_train)
                mdl_cls.save_model()
        return poly, mdl_cls.model
    elif mdl_str == 'svm':
        print("---SVM---")
        mdl_cls = SVMClass(marker=marker, directory=directory)
    elif mdl_str == 'svr':
        print("---SVR---")
        mdl_cls = SVRClass(marker=marker, directory=directory)
    elif mdl_str == 'elastic':
        print("---ElasticNet---")
        mdl_cls = ElasticNetClass(marker=marker, directory=directory)
    elif mdl_str == 'logreg':
        print("---LogisticRegression---")
        mdl_cls = LogisticRegressionClass(marker=marker, directory=directory)
    elif mdl_str == 'lda':
        print("---Linear Discriminant Analysis---")
        mdl_cls = LDAClass(marker=marker, directory=directory)
    elif mdl_str == 'ard':
        print("---ARD---")
        mdl_cls = ARDRegressionClass(marker=marker, directory=directory)
    elif mdl_str == 'ridge':
        print("---Ridge---")
        mdl_cls = RidgeClass(marker=marker, directory=directory)

    # Common fit/load path for the scikit-learn style models above
    if overwrite:
        mdl = mdl_cls.build()
        mdl.fit(x_train, y_train)
        mdl_cls.model = mdl
        mdl_cls.save_model()
    else:
        try:
            mdl_cls.load_model()
            mdl = mdl_cls.model
        except Exception:
            mdl = mdl_cls.build()
            mdl.fit(x_train, y_train)
            mdl_cls.model = mdl
            mdl_cls.save_model()
    return None, mdl


def check_if_none(data):
    '''Unpack an (x, y) pair, returning (None, None) when data is None.'''
    if data is not None:
        return data[0], data[1]
    return None, None
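# Usage sketch for model_training (shapes and paths are hypothetical; the
# windowed arrays would come from create_windows above). It returns an
# optional input transform (only PolynomialFeatures, for 'linreg') plus
# the fitted model:
#
#   transform, mdl = model_training(
#       'xgboost', x_train, y_train, marker='S01',
#       validation_data=(x_val, y_val), is_regression=True,
#       project_directory='saved_models', window_size=WINDOW_SIZE)
#   y_pred = mdl.predict(x_test)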
def get_df_windows(df, func, window_size=15, window_shift=0.2,
                   fs=IMU_FS, cols=None):
    time = df['sec'].values
    inds = np.arange(len(df))
    # window_shift is given as a fraction of the window size
    window_shift *= window_size

    wins = vsw(inds, len(inds), sub_window_size=int(window_size*fs),
               stride_size=int(window_shift*fs))

    # Multiprocessing path kept for reference; the serial loop below is used
    # N = len(wins)
    # args = zip(wins.tolist(), repeat(df, N), range(N), [cols]*N)
    # with Pool(cpu_count()) as p:
    #     out_data = p.starmap(func, args)
    out_data = []
    for i, win in enumerate(wins):
        out_data.append(func(win, df, i, cols))

    x, y = [], []
    for out in out_data:
        if out is not None:
            x.append(out[0])
            y.append(out[1])

    x_df_out = pd.concat(x).reset_index(drop=True)
    y_df_out = pd.concat(y).reset_index(drop=True)
    x_df_out.sort_values(by='sec', inplace=True)
    y_df_out.sort_values(by='sec', inplace=True)
    return x_df_out, y_df_out


def make_windows_from_id(x_df, cols):
    def make_wins(df):
        ids = df.id.unique()
        wins = []
        for i in ids:
            mask = df.id == i
            wins.append(df[mask][cols])
        return wins
    x = make_wins(x_df)
    x_win = np.array(x)
    return x_win


def get_parameters_from_feature_string(feature_names):
    '''Rebuild a tsfresh kind_to_fc_parameters mapping from
    "<sensor>__<feature>__<params>" style feature names.'''
    kind_to_fc_parameters = {}
    for feature_name in feature_names:
        split_name = feature_name.split("__")
        sensor_var = split_name[0]
        feature_var = split_name[1]
        feature_cfg = get_config_from_string(split_name)
        if feature_cfg is not None:
            feature_cfg = [feature_cfg]
        tmp = {feature_var: feature_cfg}
        if sensor_var in kind_to_fc_parameters.keys():
            params = kind_to_fc_parameters[sensor_var]
            if feature_var in params.keys():
                feature_param = params[feature_var]
                if isinstance(feature_param, list):
                    params[feature_var] = feature_param + feature_cfg
                else:
                    params[feature_var] = [feature_param] + feature_cfg
            else:
                params[feature_var] = feature_cfg
            kind_to_fc_parameters[sensor_var] = params
        else:
            kind_to_fc_parameters[sensor_var] = tmp
    return kind_to_fc_parameters


def split_timeseries_train_test_df(data_list, test_size=0.2, **kwargs):
    # In each of the files: keep the last portion (test_size) as the test
    # split; shuffle=False preserves temporal order
    df_list = load_and_snip(data_list, **kwargs)

    func = partial(train_test_split, test_size=test_size, shuffle=False)
    with Pool(cpu_count()) as p:
        tmp = p.map(func, df_list)
    train_data_df, test_data_df = zip(*tmp)

    train_data_df = pd.concat(train_data_df, ignore_index=True)
    test_data_df = pd.concat(test_data_df, ignore_index=True)
    train_data_df.sort_values(by='ms', inplace=True)
    test_data_df.sort_values(by='ms', inplace=True)

    # Guard against timestamps appearing in both splits
    no_overlap = not np.isin(train_data_df.ms, test_data_df.ms).any()
    assert no_overlap, "overlapping test and train data"
    return train_data_df, test_data_df


def map_condition_to_tlx(df, tlx_df):
    indexes = tlx_df.index.values.tolist() + ['R']
    for index in indexes:
        mask = df['condition'].values == index
        df_inds = df.index[mask]
        # Rest ('R') and baseline ('M') conditions carry zero task load
        if index in ['R', 'M']:
            df.loc[df_inds, 'tlx'] = 0
        else:
            df.loc[df_inds, 'tlx'] = tlx_df[index]
    return df
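# Usage sketch for the train/test split helper (the file list and any
# load_and_snip keyword arguments are project-specific assumptions):
#
#   train_df, test_df = split_timeseries_train_test_df(csv_files,
#                                                      test_size=0.2)
#   # Each recording keeps its first 80% for training and its final 20%
#   # for testing, since train_test_split runs with shuffle=False.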