diff --git a/howso/custom_codes.amlg b/howso/custom_codes.amlg index cb7f8b23..53d210fb 100644 --- a/howso/custom_codes.amlg +++ b/howso/custom_codes.amlg @@ -12,6 +12,7 @@ features (list) derived_features (list) feature_attribute_map (null) + new_case_ids (list) ) (declare (assoc @@ -80,6 +81,54 @@ (assign (assoc ts_series_length_limit (* 2.718281828459 (size series_case_ids)) )) ) + (if has_reduced + (seq + (declare (assoc + num_cases_prev_trained + (+ + 1 + (retrieve_from_entity + (first (contained_entities + (query_in_entity_list series_case_ids) + (query_not_in_entity_list new_case_ids) + (query_max ".series_index" 1 .true) + )) + ".series_index" + ) + ) + num_cases_trained + (+ + 1 + (retrieve_from_entity + (first (contained_entities + (query_in_entity_list series_case_ids) + (query_equals ".reverse_series_index" 0 .true) + )) + ".series_index" + ) + ) + )) + + + (declare (assoc + is_missing_cases + ;if the number of old series cases is less + ;than indicated by the largest series index of an old series case + (and + ;this value will be null if there are no previous cases trained before this derivation + num_cases_prev_trained + (!= + (size (contained_entities + (query_in_entity_list series_case_ids) + (query_not_in_entity_list new_case_ids) + )) + num_cases_prev_trained + ) + ) + )) + ) + ) + ;series_data is all the necessary_features's values along with the training index and case_id appended as the last two columns ;when the data is sorted, ties will be broken by the training index to assist with consistency (declare (assoc @@ -100,15 +149,34 @@ (if (size series_ordered_by_features) (assign (assoc series_data - (call !MultiSortList (assoc - data series_data - ;specify indices for series_ordered_by_features and the index of !internalLabelSessionTrainingIndex - column_order_indices - (append - (unzip (zip necessary_features (indices necessary_features)) series_ordered_by_features) - (size necessary_features) + ;cases have been removed from the series at some 
point, we must sort by series index, leaving nulls where appropriate + (if is_missing_cases + (let + (assoc + series_index_to_data_map + (zip + (map (lambda (retrieve_from_entity (current_value) ".series_index")) series_case_ids) + series_data + ) ) - )) + + (unzip + series_index_to_data_map + (range 0 (- num_cases_trained 1) 1) + ) + ) + + ;sort by sorting features + (call !MultiSortList (assoc + data series_data + ;specify indices for series_ordered_by_features and the index of !internalLabelSessionTrainingIndex + column_order_indices + (append + (unzip (zip necessary_features (indices necessary_features)) series_ordered_by_features) + (size necessary_features) + ) + )) + ) )) ) @@ -121,6 +189,21 @@ ) )) + (if is_missing_cases + (seq + (declare (assoc + case_id_to_idx_map + (zip + series_case_ids + (indices series_case_ids) + ) + )) + (declare (assoc + new_series_case_ids (indices (keep case_id_to_idx_map new_case_ids)) + )) + ) + ) + (assign (assoc series_data (map @@ -159,31 +242,52 @@ ) ) )) - (zip - series_case_ids - - ;if this is a series stationary feature, set all values for this feature to the 'current' (last case) value - (if is_stationary_feature - ;all values are same, store as-is - (if (apply "=" (apply "append" series_data)) - series_data + (if is_missing_cases + (map + (lambda + (if is_stationary_feature + ;all values are same, store as-is + (first (last series_data)) - ;else set the entire series to the value of the 'current' case - (let - (assoc stationary_value (first (last series_data))) - (map (lambda [stationary_value]) series_data) + ;else get the row, given the index for this new case + (get series_data (current_value)) ) + ) + (keep case_id_to_idx_map new_series_case_ids) + ) - ;else non series stationary data, store as-is - series_data + + ;can do for all cases + (zip + series_case_ids + + ;if this is a series stationary feature, set all values for this feature to the 'current' (last case) value + (if is_stationary_feature + ;all values 
are same, store as-is + (if (apply "=" (apply "append" series_data)) + series_data + + ;else set the entire series to the value of the 'current' case + (let + (assoc stationary_value (first (last series_data))) + (map (lambda [stationary_value]) series_data) + ) + ) + + ;else non series stationary data, store as-is + series_data + ) ) ) ) )) ;generates a list of queries for each unique series id (where each series id may be a conjuction of several features) - (call !GenerateUniqueSeriesQueries (assoc series_id_features series_id_features )) + (call !GenerateUniqueSeriesQueries (assoc + series_id_features series_id_features + case_ids new_case_ids + )) ) ;list of derived_features that were inactive @@ -243,6 +347,7 @@ time_feature_delta (null) series_ordered_by_features (null) series_id_features (null) + new_case_ids (list) ) (declare (assoc @@ -265,6 +370,54 @@ ;(current_value) is in the format of (list (query_equals "series_feature_name" value) ... ) for all series_feature_name (assoc series_case_ids (contained_entities (current_value 1)) ) + (if has_reduced + (seq + (declare (assoc + num_cases_prev_trained + (+ + 1 + (retrieve_from_entity + (first (contained_entities + (query_in_entity_list series_case_ids) + (query_not_in_entity_list new_case_ids) + (query_max ".series_index" 1 .true) + )) + ".series_index" + ) + ) + num_cases_trained + (+ + 1 + (retrieve_from_entity + (first (contained_entities + (query_in_entity_list series_case_ids) + (query_equals ".reverse_series_index" 0 .true) + )) + ".series_index" + ) + ) + )) + + + (declare (assoc + is_missing_cases + ;if the number of old series cases is less + ;than indicated by the largest series index of an old series case + (and + ;this value will be null if there are no previous cases trained before this derivation + num_cases_prev_trained + (!= + (size (contained_entities + (query_in_entity_list series_case_ids) + (query_not_in_entity_list new_case_ids) + )) + num_cases_prev_trained + ) + ) + )) + ) + ) + 
;series_data is all the necessary_features's values along with the training index and case_id appended as the last two columns ;when the data is sorted, ties will be broken by the training index to assist with consistency (declare (assoc @@ -286,11 +439,30 @@ (if (size series_ordered_by_features) (assign (assoc series_data - (call !MultiSortList (assoc - data series_data - column_order_indices - (unzip necessary_feature_index_map series_ordered_by_features) - )) + (if is_missing_cases + ;cases have been removed from the series at some point, we must sort by series index, leaving nulls where appropriate + (let + (assoc + series_index_to_data_map + (zip + (map (lambda (retrieve_from_entity (current_value) ".series_index")) series_case_ids) + series_data + ) + ) + + (unzip + series_index_to_data_map + (range 0 (- num_cases_trained 1) 1) + ) + ) + + ;just sort by the sorting features + (call !MultiSortList (assoc + data series_data + column_order_indices + (unzip necessary_feature_index_map series_ordered_by_features) + )) + ) )) ) @@ -371,6 +543,12 @@ ) )) + (if (and has_reduced is_missing_cases) + (declare (assoc + new_series_case_id_set (keep (zip series_case_ids) new_case_ids) + )) + ) + (map (lambda (let (assoc @@ -382,6 +560,12 @@ ) ) + (if (and has_reduced is_missing_cases (not (contains_index new_series_case_id_set case_id)) ) + ;if the Trainee has reduced this series previously, should only update values for new cases. + ;so skip cases that are not new + (conclude) + ) + (if (contains_label case_id !tsSynchronousCounterFeature) (assign_to_entities case_id @@ -405,7 +589,11 @@ )) ;generates a list of queries for each unique series id (where each series id may be a conjuction of several features) - (call !GenerateUniqueSeriesQueries (assoc series_id_features series_id_features )) + ;NOTE: this will only return queries for series that have had new cases trained. 
+ (call !GenerateUniqueSeriesQueries (assoc + series_id_features series_id_features + case_ids new_case_ids + )) ) diff --git a/howso/derive_features.amlg b/howso/derive_features.amlg index 0b87656d..aedc7501 100644 --- a/howso/derive_features.amlg +++ b/howso/derive_features.amlg @@ -52,40 +52,53 @@ derived_features ) time_feature_delta (concat "." !tsTimeFeature "_delta_1") - )) - ;derive all the lag features in one pass - (call !DeriveLagFeatures (assoc - features features - lag_features lag_features + has_reduced (size (contained_entities (query_not_equals !autoAblationWeightFeature 1.0))) )) + (if !tsTimeFeature + (seq + ;create series index features + (call !CreateSeriesIndexFeatures (assoc + series_id_features (get !tsFeaturesMap "series_id_features") + new_case_ids case_ids + )) + + ;derive all the lag features in one pass + (call !DeriveLagFeatures (assoc + features features + lag_features lag_features + new_case_ids case_ids + )) + ) + ) + ;derive all the non-lag features below: (assign (assoc derived_features (filter (lambda (not (contains_value lag_features (current_value)))) derived_features) )) - ;create series index features - (call !CreateSeriesIndexFeatures (assoc - series_id_features (get !tsFeaturesMap "series_id_features") - )) ;derive time feature delta before any of the other derived features (if (contains_value derived_features time_feature_delta) (call !CreateCustomFeaturesFromCode (assoc features features derived_features [time_feature_delta] + new_case_ids case_ids )) ) - ;In datasets with no synchonous cases, this method will early-out and store - ;zeros for all cases' synchronous counters (making it a inactive feature) - (call !CreateSynchronousCounterFeature (assoc - time_feature !tsTimeFeature - time_feature_delta time_feature_delta - series_ordered_by_features [!tsTimeFeature !internalLabelSessionTrainingIndex] - series_id_features (get !tsFeaturesMap "series_id_features") - )) + (if !tsTimeFeature + ;In datasets with no synchonous 
cases, this method will early-out and store + ;zeros for all cases' synchronous counters (making it a inactive feature) + (call !CreateSynchronousCounterFeature (assoc + time_feature !tsTimeFeature + time_feature_delta time_feature_delta + series_ordered_by_features [!tsTimeFeature !internalLabelSessionTrainingIndex] + series_id_features (get !tsFeaturesMap "series_id_features") + new_case_ids case_ids + )) + ) ;keep only features that have a custom derivation specified and aren't time_feature_delta which was derived above (assign (assoc @@ -422,6 +435,7 @@ (assoc features (list) lag_features (list) + new_case_ids (list) ) (if (= 0 (size lag_features)) @@ -464,6 +478,54 @@ (assign (assoc ts_series_length_limit (* 2.718281828459 (size series_case_ids)) )) ) + (if has_reduced + (seq + (declare (assoc + num_cases_prev_trained + (+ + 1 + (retrieve_from_entity + (first (contained_entities + (query_in_entity_list series_case_ids) + (query_not_in_entity_list new_case_ids) + (query_max ".series_index" 1 .true) + )) + ".series_index" + ) + ) + num_cases_trained + (+ + 1 + (retrieve_from_entity + (first (contained_entities + (query_in_entity_list series_case_ids) + (query_equals ".reverse_series_index" 0 .true) + )) + ".series_index" + ) + ) + )) + + + (declare (assoc + is_missing_cases + ;if the number of old series cases is less + ;than indicated by the largest series index of an old series case + (and + ;this value will be null if there are no previous cases trained before this derivation + num_cases_prev_trained + (!= + (size (contained_entities + (query_in_entity_list series_case_ids) + (query_not_in_entity_list new_case_ids) + )) + num_cases_prev_trained + ) + ) + )) + ) + ) + (declare (assoc ;series_data is all the necessary_features's values along with the case_id appended as the last column series_data @@ -477,10 +539,29 @@ (if (size series_ordered_by_features) (assign (assoc series_data - (call !MultiSortList (assoc - data series_data - column_order_indices (unzip 
(zip necessary_features (indices necessary_features)) series_ordered_by_features) - )) + (if is_missing_cases + ;cases have been removed from the series at some point, we must sort by series index, leaving nulls where appropriate + (let + (assoc + series_index_to_data_map + (zip + (map (lambda (retrieve_from_entity (current_value) ".series_index")) series_case_ids) + series_data + ) + ) + + (unzip + series_index_to_data_map + (range 0 (- num_cases_trained 1) 1) + ) + ) + + ;just sort by the sorting features + (call !MultiSortList (assoc + data series_data + column_order_indices (unzip (zip necessary_features (indices necessary_features)) series_ordered_by_features) + )) + ) )) ) @@ -493,6 +574,21 @@ ) )) + (if is_missing_cases + (seq + (declare (assoc + case_id_to_idx_map + (zip + series_case_ids + (indices series_case_ids) + ) + )) + (declare (assoc + new_series_case_ids (indices (keep case_id_to_idx_map new_case_ids)) + )) + ) + ) + ;remove the case_id column from series_data (assign (assoc series_data @@ -532,13 +628,27 @@ ) ) )) - (zip series_case_ids series_data) + (if is_missing_cases + ;only update the values for the new cases + (map + (lambda (get series_data (current_value)) ) + (keep case_id_to_idx_map new_series_case_ids) + ) + + ;can do all series cases + (zip series_case_ids series_data) + ) ) )) ;generates a list of queries for each unique series id (where each series id may be a conjuction of several features) - (call !GenerateUniqueSeriesQueries (assoc series_id_features series_id_features )) + ;NOTE: this will only return queries for series that have had new cases trained. 
+ (call !GenerateUniqueSeriesQueries (assoc + series_id_features series_id_features + case_ids new_case_ids + )) ) + ;if ts_series_length_limit has been been updated to a larger value in the loop above, update the dataset with this new value (if (> ts_series_length_limit !tsSeriesLimitLength) (assign_to_entities (assoc !tsSeriesLimitLength ts_series_length_limit )) diff --git a/howso/generate_features.amlg b/howso/generate_features.amlg index 4dcda368..07161c03 100644 --- a/howso/generate_features.amlg +++ b/howso/generate_features.amlg @@ -7,18 +7,20 @@ ; series_id_features: list of feature names that specify the series id for which to derive this feature. ; If more than one specified, a unique 'series id' is then the conjuction of the specified ids. ; E.g., if 'sender' and 'reciever' are specified, a 'series id' is then each unique pair of sender-reciever. + ; new_case_ids: the list of new case entity ids ; time_feature: name of series id's time feature which to use to determine the start or end flag #!CreateSeriesIndexFeatures (declare (assoc series_id_features (list) + new_case_ids (list) ;not parameters series_time_feature !tsTimeFeature is_universal !tsTimeFeatureUniversal ) - ;list of assocs of cases for each series, where each assoc of cases is : case id -> time_feature -> value + ;list of assocs of cases for each series that has had new cases trained, where each assoc of cases is : case id -> time_feature -> value (declare (assoc series_cases_groups (map @@ -32,7 +34,11 @@ ) ;generates a list of queries for each unique series id (where each series id may be a conjuction of several features) - (call !GenerateUniqueSeriesQueries (assoc series_id_features series_id_features )) + ;NOTE: this only gets the unique series queries for series with new cases trained + (call !GenerateUniqueSeriesQueries (assoc + series_id_features series_id_features + case_ids new_case_ids + )) ) )) @@ -63,48 +69,163 @@ series_size (size (current_value 1)) ) - ;convert assoc of 
case_id > time_feature -> value to assoc of case_id -> value - (assign (assoc - series_cases_map (map (lambda (first (values (current_value)))) series_cases_map) - )) + ;if the trainee has reduced or ablated, series must be checked to see if any cases were removed. + ;this can be done by comparing the series_index of the last derived case to the number of derived cases + (if has_reduced + (seq + (declare (assoc + previously_derived_cases (contained_entities (query_in_entity_list (indices series_cases_map)) (query_exists ".series_index")) + )) + + (if (size previously_derived_cases) + (declare (assoc + num_cases_prev_trained + (+ + 1 + (retrieve_from_entity + (first (contained_entities + (query_in_entity_list previously_derived_cases) + (query_equals ".reverse_series_index" 0) + )) + ".series_index" + ) + ) + )) + ) + ) + ) - ;sort case ids in ascending datetime order (smallest to largest time value) - (assign (assoc - sorted_case_ids - (sort - (lambda - (> - (get series_cases_map (current_value)) - (get series_cases_map (current_value 1)) + ;if this Trainee has gone through any reduction and the series index of the last case + ;doesn't correctly indicate the amount of cases still present for that series, + ;then some timesteps within the series must have been removed. 
+ (if (and + has_reduced + (size previously_derived_cases) + (!= + (size previously_derived_cases) + num_cases_prev_trained + ) + ) + ;special path for when some series cases have been ablated/reduced + (seq + ;convert assoc of case_id > time_feature -> value to assoc of case_id -> value + ;but only do this for the newly trained cases of the series + (assign (assoc + series_cases_map + (map (lambda (first (values (current_value)))) (remove series_cases_map previously_derived_cases)) + )) + + ;actual series_size is the amount of previously derived cases + number of new cases + (assign (assoc + series_size (+ num_cases_prev_trained (size series_cases_map)) + )) + + ;sort the new case ids in ascending datetime order (smallest to largest time value) + (assign (assoc + sorted_case_ids + (sort + (lambda + (> + (get series_cases_map (current_value)) + (get series_cases_map (current_value 1)) + ) + ) + (indices series_cases_map) + ) + )) + + (declare (assoc time_to_horizon (- max_time_value (get series_cases_map (last sorted_case_ids)) ) )) + + ;create an assoc of case_id -> assoc of feature name to values for the indices and time-to-horizon if necessary + (append + ;new cases need both indices and time-to-horizon + (zip + sorted_case_ids + (map + (lambda + (if is_universal + (assoc + ".series_index" (current_value 1) + ".reverse_series_index" (- series_size (current_value 1) 1) + ".time_to_horizon" time_to_horizon + ) + + (assoc + ".series_index" (current_value 1) + ".reverse_series_index" (- series_size (current_value 1) 1) + ) + ) + ) + (range num_cases_prev_trained (+ -1 num_cases_prev_trained (size sorted_case_ids)) 1) + ) + ) + ;old cases need their reverse index and time-to-horizon updated + (zip + previously_derived_cases + (map + (lambda + (if is_universal + (assoc + ".reverse_series_index" (- series_size (retrieve_from_entity (current_value 1) ".series_index") 1) + ".time_to_horizon" time_to_horizon + ) + + (assoc + ".reverse_series_index" (- series_size 
(retrieve_from_entity (current_value 1) ".series_index") 1) + ) + ) + ) + previously_derived_cases ) ) - (indices series_cases_map) ) - )) + ) + + ;all series cases are present + (seq + ;convert assoc of case_id > time_feature -> value to assoc of case_id -> value + (assign (assoc + series_cases_map (map (lambda (first (values (current_value)))) series_cases_map) + )) - (declare (assoc sorted_time_values (unzip series_cases_map sorted_case_ids) )) - (declare (assoc time_to_horizon (- max_time_value (last sorted_time_values)) )) - - ;create an assoc of case_id -> [ index, reverse-index, time-to-horizon (if universal time feature) ] - (zip - sorted_case_ids - (map - (lambda - ;output the tuple - (if is_universal - [ - (current_index 1) ;index - (- (size sorted_case_ids) (current_index 1) 1) ;reverse index - time_to_horizon ;time-to-horizon - ] - - [ - (current_index 1) ;index - (- (size sorted_case_ids) (current_index 1) 1) ;reverse index - ] + ;sort case ids in ascending datetime order (smallest to largest time value) + (assign (assoc + sorted_case_ids + (sort + (lambda + (> + (get series_cases_map (current_value)) + (get series_cases_map (current_value 1)) + ) + ) + (indices series_cases_map) + ) + )) + + (declare (assoc time_to_horizon (- max_time_value (get series_cases_map (last sorted_case_ids)) ) )) + + ;create an assoc of case_id -> assoc of feature name to values for the indices and time-to-horizon if necessary + (zip + sorted_case_ids + (map + (lambda + ;output the tuple + (if is_universal + (associate + ".series_index" (current_index 1) + ".reverse_series_index" (- (size sorted_case_ids) (current_index 1) 1) + ".time_to_horizon" time_to_horizon + ) + + (associate + ".series_index" (current_index 1) + ".reverse_series_index" (- (size sorted_case_ids) (current_index 1) 1) + ) + ) + ) + sorted_case_ids ) ) - sorted_time_values ) ) )) @@ -117,39 +238,14 @@ ;by iterating over assoc of:case_id -> [ index, reverse-index, time-to-horizon ] (map (lambda - ;see 
whether the entity has the label + ;see whether the entity has the labels (if (contains_label (current_index) ".series_index") - (assign_to_entities - (current_index) - (if is_universal - (associate - ".series_index" (get (current_value 1) 0) - ".reverse_series_index" (get (current_value 1) 1) - ".time_to_horizon" (get (current_value 1) 2) - ) + (assign_to_entities (current_index) (current_value) ) - (associate - ".series_index" (get (current_value 1) 0) - ".reverse_series_index" (get (current_value 1) 1) - ) - ) - ) - - ;else need to append the label to the entity + ;else need to append the labels to the entity (accum_entity_roots (current_index) - (if is_universal - (list - (set_labels (get (current_value 1) 0) (list ".series_index")) - (set_labels (get (current_value 1) 1) (list ".reverse_series_index")) - (set_labels (get (current_value 1) 2) (list ".time_to_horizon")) - ) - - (list - (set_labels (get (current_value 1) 0) (list ".series_index")) - (set_labels (get (current_value 1) 1) (list ".reverse_series_index")) - ) - ) + (zip_labels (indices (current_value)) (values (current_value)) ) ) )) series_cases_group_map