diff --git a/howso/ablation.amlg b/howso/ablation.amlg index e8bdf90e..85a3cc73 100644 --- a/howso/ablation.amlg +++ b/howso/ablation.amlg @@ -563,7 +563,6 @@ ;Declare variables for internal use. (declare (assoc - max_influence_weight_entropy_to_keep .infinity cases (list) prev_prediction_stats_map (assoc) thresholds_enabled (or (size abs_threshold_map) (size delta_threshold_map) (size rel_threshold_map) ) @@ -576,21 +575,7 @@ weight_feature distribute_weight_feature )) ) - - (declare (assoc - hyperparam_map - (call !GetHyperparameters (assoc - context_features features - weight_feature distribute_weight_feature - )) - )) (declare (assoc - closest_k (get hyperparam_map "k") - p_parameter (get hyperparam_map "p") - dt_parameter (get hyperparam_map "dt") - feature_weights (get hyperparam_map "featureWeights") - feature_deviations (get hyperparam_map "featureDeviations") - query_feature_attributes_map (get hyperparam_map "featureDomainAttributes") num_cases (call !GetNumTrainingCases) ;reduction will stop within batch_size of reduce_max_cases, so if the gap between @@ -599,6 +584,13 @@ approximate_num_cases_to_keep (max (- reduce_max_cases batch_size) !autoAblationMinNumCases) )) + ;nothing needed to reduce since the dataset is already small enough + (if (>= approximate_num_cases_to_keep num_cases) + (conclude + (call !Return (assoc payload output)) + ) + ) + (if thresholds_enabled (assign (assoc prev_prediction_stats_map @@ -613,184 +605,377 @@ )) ) - ;pair of cases and associated sorted popularities (total normalized influence of all neighbors that referenced it) (declare (assoc - case_popularity_pair - (compute_on_contained_entities - (query_exists !internalLabelSession) - ||(query_entity_cumulative_nearest_entity_weights - closest_k - features - (null) ;all cases - p_parameter - feature_weights - !queryDistanceTypeMap - query_feature_attributes_map - feature_deviations - (null) - dt_parameter - distribute_weight_feature - (rand) - (null) ;radius - !numericalPrecision - .true - ) - ) + hyperparam_map (call !GetHyperparameters (assoc weight_feature distribute_weight_feature)) )) - ;all the cases that were not returned in the pair above have 0 popularity (no other cases reference them) (declare (assoc - zero_popularity_neighbors - (contained_entities - (query_exists !internalLabelSession) - (query_not_in_entity_list (first case_popularity_pair)) - ) + k_parameter (get hyperparam_map "k") + p_parameter (get hyperparam_map "p") + feature_weights (get hyperparam_map "featureWeights") + dt_parameter (get hyperparam_map "dt") + feature_deviations (get hyperparam_map "featureDeviations") + query_feature_attributes_map (get hyperparam_map "featureDomainAttributes") )) - ;determine the cutoff value of the popularity at which all cases with a value less than that should be removed - ;e.g., if there needs to be a quarter of cases left, this would compute the 0.75 quantile of popularity values, - ;so that those bottom 75% are removed - (declare (assoc - reduction_popularity_cutoff - (quantile - (append - (last case_popularity_pair) - (range 0 1 (size zero_popularity_neighbors) 1) - ) - ;add one percent to account for enough cases selected to match the amount needed to be removed due to rounding - ;e.g., if the quantile value was 0.75 from the example above, this bumps it up to 0.76 - (+ - (/ (- num_cases approximate_num_cases_to_keep) num_cases) - 0.01 - ) - ) - )) - ;plan to only remove cases whose popularity is less than reduction_popularity_cutoff - ;i.e., only remove the non-popular cases that aren't referenced by others as much (declare (assoc - num_removal_eligible_cases - (size (filter - (lambda (< (current_value) reduction_popularity_cutoff)) - (last case_popularity_pair) - )) + all_case_ids (call !AllCases) + done .false )) + (declare (assoc - ;case ids in order from highest to lowest popularity, lowest popularity at end of list - removable_cases - (append - ;only keep the necessary number of lowest popularity eligible cases as well as all zero popularity ones - (tail (first case_popularity_pair) num_removal_eligible_cases) - zero_popularity_neighbors + neighbors_map + ||(map + (lambda + (compute_on_contained_entities + (query_not_in_entity_list [(current_index 1)]) + (query_nearest_generalized_distance + k_parameter + features + (current_index) + p_parameter + feature_weights + !queryDistanceTypeMap + query_feature_attributes_map + feature_deviations + (null) + dt_parameter + distribute_weight_feature + (rand) + (null) ;radius + !numericalPrecision + ) + ) + ) + (zip all_case_ids) ) )) + ;blur weights among neighbors (declare (assoc - ;list will be sorted from highest to lowest, thus cases removed from the end of the list - end_index (- (size removable_cases) 1) - random_cases .false - num_removed_this_batch 0 + duplicates + (call !DistributeCaseInfluenceWeights (assoc + case_ids all_case_ids + redistribute_weights_map neighbors_map + has_rebalance_features .false + )) )) - ;Begin looping on data removal. The ultimate end condition is if the dataset gets too small to continue removing cases. - (while (< !autoAblationMinNumCases (call !GetNumTrainingCases)) - (assign (assoc - num_removed_this_batch (min batch_size (- (call !GetNumTrainingCases) !autoAblationMinNumCases)) + ;merge duplicates if any exist + (if (size duplicates) + (call !ReduceMergeDuplicateCases (assoc + all_duplicate_cases_map (zip duplicates) )) - (assign (assoc - cases - (if (>= end_index 0) - ;grab the cases from the end, with the smallest values - (unzip - removable_cases - (range - (max 0 (- end_index num_removed_this_batch -1)) - end_index + ) + + (declare (assoc zero_weight_cases (contained_entities (query_equals distribute_weight_feature 0)) )) + ;remove zero-weight cases + (if (size zero_weight_cases) + (call !RemoveCases (assoc + cases zero_weight_cases + ;weight has already been distributed above during the first blur stop, don't do it again + distribute_weight_feature (null) + )) + ) + + ;dataset has been modified due to removal of dupes and zero weight cases + (if (or (size zero_weight_cases) (size duplicates)) + (seq + (assign (assoc all_case_ids (call !AllCases) )) + + (assign (assoc + neighbors_map + ||(map + (lambda + (compute_on_contained_entities + (query_not_in_entity_list [(current_index 1)]) + (query_nearest_generalized_distance + k_parameter + features + (current_index) + p_parameter + feature_weights + !queryDistanceTypeMap + query_feature_attributes_map + feature_deviations + (null) + dt_parameter + distribute_weight_feature ;TODO: should this be taken into account? + (rand) + (null) ;radius + !numericalPrecision + ) + ) ) + (zip all_case_ids) ) + )) + ) + ) + + ;second pass + (call !DistributeCaseInfluenceWeights (assoc + case_ids all_case_ids + redistribute_weights_map neighbors_map + has_rebalance_features .false + )) + + ;mark each case as not being kept at first + (map + (lambda + (accum_entity_roots (current_value) (zip_labels + ["keeping"] [.false] + )) + ) + all_case_ids + ) - ;else select random cases - (contained_entities - (query_exists distribute_weight_feature) - (query_select num_removed_this_batch (null) (rand) ) + #!ReduceComputeNeighborSurprisals + (let + (assoc + case_neighbor_surprisal_map + (compute_on_contained_entities + ||(query_entity_distance_contributions + 1 + features + all_case_ids + p_parameter + feature_weights + !queryDistanceTypeMap + query_feature_attributes_map + feature_deviations + (null) + (if (= dt_parameter "surprisal_to_prob") "surprisal" dt_parameter ) + distribute_weight_feature + ;use a fixed random seed to guarantee deterministic behavior for reacts (named "fixed rand seed") + "fixed rand seed" + (null) ;radius + !numericalPrecision ) ) + ) + + (call !StoreCaseValues (assoc + case_values_map case_neighbor_surprisal_map + label_name ".neighbor_surprisal" )) + ) - (if (>= end_index 0) - ;update end index to account for the cases about to be removed - (assign (assoc end_index (- end_index (size cases)) )) + (declare (assoc + ;map of case to its core-set surprisal (the max surprisal to any case in the coreset for all cases) + case_to_css_map (map (lambda (+ .infinity)) (zip all_case_ids)) + )) - ;else no more removable cases left, remove random cases - (assign (assoc random_cases .true)) - ) + ;experimental params + (declare (assoc + ;the amount of lowest neighbor-surprisal cases to consider for keeping + ;CAN BE NULLED + lowest_ns_cases_trunc_n (null) + + ;number of cases to select for keeping per iteration + cases_to_keep_per_iter 5 + )) + + (while (not done) + (let + (assoc + cases_to_add + (if (= (current_index 1) 0) + ;on first iteration, just take lowest DC case + (contained_entities + (query_exists !internalLabelSession) + (query_equals "keeping" .false) + (query_min ".neighbor_surprisal" 1 .true) + ) + + ;otherwise need cases with low neighbor surprisal (ns) that is far from its most similar case in current_cases_to_keep + (let + (assoc + lowest_ns_cases + (contained_entities + (query_exists !internalLabelSession) + (query_equals "keeping" .false) + (if lowest_ns_cases_trunc_n + (query_min ".neighbor_surprisal" lowest_ns_cases_trunc_n .true) + ) + ) + ) + + (declare (assoc + low_ns_case_scores + (map + (lambda + ;divide neighbor surprisal by coreset surprisal + + ;I think the LLM desc was wrong, so I flipped it to be coreset surprisal / neighbor surprisal + (/ + (get case_to_css_map (current_index)) + (retrieve_from_entity (current_index) ".neighbor_surprisal") + ) + ) + (zip lowest_ns_cases) + ) + )) + + + ;sorting low dc cases by *decreasing* "score" and return the right amount + (trunc + (if (= 1 cases_to_keep_per_iter) + (index_max low_ns_case_scores) + + (sort + (lambda + (- + (get low_ns_case_scores (current_value 1)) + (get low_ns_case_scores (current_value)) + ) + ) + lowest_ns_cases + ) + ) + cases_to_keep_per_iter + ) + ) + ) + ) + + ;mark new cases to keep + (map + (lambda + (assign_to_entities (current_value) (assoc + keeping .true + )) + ) + cases_to_add + ) + + (declare (assoc + new_case_css_map + ||(map + (lambda + ;get their min surprisal to any of the cases_to_add + (apply "min" (values + (compute_on_contained_entities + (query_in_entity_list cases_to_add) + (query_within_generalized_distance + .infinity ;distance + features + (retrieve_from_entity (current_index) features) + p_parameter + feature_weights + !queryDistanceTypeMap + query_feature_attributes_map + feature_deviations + (null) + (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + distribute_weight_feature + "fixed rand seed" + (null) ;radius + !numericalPrecision + ) + ) + )) + ) + ;all non-coreset cases + (zip (contained_entities (query_exists !internalLabelSession) (query_equals "keeping" .false))) + ) + )) - (if !tsTimeFeature - ;do not remove first (.series_index == 0) or last (.reverse_series_index == 0) cases for any series (assign (assoc - cases - (contained_entities - (query_in_entity_list cases) - (query_not_equals ".reverse_series_index" 0) - (query_not_equals ".series_index" 0) + case_to_css_map + ;take min of new min css and old min css + (map + (lambda + (min (current_value) (get case_to_css_map (current_index))) + ) + new_case_css_map ) )) + + (if (>= + (size (contained_entities + (query_exists !internalLabelSession) + (query_equals "keeping" .true) + )) + !autoAblationMinNumCases + ) + (assign (assoc done .true)) + + ; (call !ReduceComputeSmallestSurprisals) + ) ) + ) - (if (size cases) - (seq - (call !RemoveCases (assoc - cases cases - distribute_weight_feature distribute_weight_feature - )) - (if thresholds_enabled - (let - (assoc - batch_threshold_info (null) - new_prediction_stats_map - (get - (call !CalculateFeatureResiduals (assoc - weight_feature distribute_weight_feature - use_case_weights .true - compute_all_statistics .true - )) - "prediction_stats" - ) - ) - (assign (assoc - batch_threshold_info - (call !CheckThresholds (assoc - abs_threshold_map abs_threshold_map - delta_threshold_map delta_threshold_map - rel_threshold_map rel_threshold_map - prev_prediction_stats_map prev_prediction_stats_map - new_prediction_stats_map new_prediction_stats_map + ;END facility something algo + + (assign (assoc + ;the list of case ids to be removed + cases + (contained_entities + (query_exists !internalLabelSession) + (query_equals "keeping" .false) + ) + )) + + (if !tsTimeFeature + ;do not remove first (.series_index == 0) or last (.reverse_series_index == 0) cases for any series + (assign (assoc + cases + (contained_entities + (query_in_entity_list cases) + (query_not_equals ".reverse_series_index" 0) + (query_not_equals ".series_index" 0) + ) + )) + ) + + (if (size cases) + (seq + (call !RemoveCases (assoc + cases cases + distribute_weight_feature distribute_weight_feature + )) + + (if thresholds_enabled + (let + (assoc + batch_threshold_info (null) + new_prediction_stats_map + (get + (call !CalculateFeatureResiduals (assoc + weight_feature distribute_weight_feature + use_case_weights .true + compute_all_statistics .true )) - )) - (if (apply "or" (values batch_threshold_info)) - (seq - (accum "output" ["threshold_info"] batch_threshold_info) - (conclude) + "prediction_stats" ) - (assign (assoc - prev_prediction_stats_map new_prediction_stats_map + ) + (assign (assoc + batch_threshold_info + (call !CheckThresholds (assoc + abs_threshold_map abs_threshold_map + delta_threshold_map delta_threshold_map + rel_threshold_map rel_threshold_map + prev_prediction_stats_map prev_prediction_stats_map + new_prediction_stats_map new_prediction_stats_map )) + )) + (if (apply "or" (values batch_threshold_info)) + (seq + (accum "output" ["threshold_info"] batch_threshold_info) + (conclude) ) + (assign (assoc + prev_prediction_stats_map new_prediction_stats_map + )) ) ) ) - - ;else couldn't select any from random cases, stop - (and random_cases (< end_index 0)) - (conclude) - ) - - ;enough cases have been removed, can stop removing - (if (<= (call !GetNumTrainingCases) reduce_max_cases) - (conclude) ) ) + ;if the number of cases has been reduced by 'e' or more, auto analyze if needed (if (< (call !GetNumTrainingCases) (/ num_cases 2.718281828459)) (call !AutoAnalyzeIfNeeded (assoc @@ -869,22 +1054,6 @@ )) duplicate_neighbors_map ) - - ;recompute influence weight entropy for the remaining no-longer duplicates - (declare (assoc - cases_too_far_map - (call !ComputeAndStoreInfluenceWeightEntropies (assoc - features features - weight_feature distribute_weight_feature - use_case_weights .true - compute_all .true - specific_case_ids (indices duplicate_neighbors_map) - )) - )) - - (if (size cases_too_far_map) - (accum (assoc case_duplicate_or_far_map cases_too_far_map)) - ) ) diff --git a/howso/update_cases.amlg b/howso/update_cases.amlg index 83bcda29..07dffecc 100644 --- a/howso/update_cases.amlg +++ b/howso/update_cases.amlg @@ -829,6 +829,7 @@ case_ids (list) distribute_weight_feature ".case_weight" has_rebalance_features .false + redistribute_weights_map (null) ) (declare (assoc original_distribute_weight_feature distribute_weight_feature)) @@ -845,6 +846,7 @@ )) ;default value of 1 for the accumulate_weight_feature new_weight_label_and_value (zip_labels (list distribute_weight_feature) (list 1)) + duplicates [] )) ;ensure the weight feature isn't among the features being used to find cases for distribution @@ -856,32 +858,45 @@ (lambda (let (assoc ;case weight value that needs to be distributed among the neighbors - case_weight (or (get (current_value 1) distribute_weight_feature) 1) + case_weight (get (current_value 1) distribute_weight_feature) + ) + + ;if case_weight is undefined, default it to 1 + (if (= (null) case_weight) + (assign (assoc case_weight 1)) + + ;if case has a weight of zero, skip it + (= 0 case_weight) + (conclude [0 {}]) ) (declare (assoc ;map of case_id -> weight closest_cases_map - (compute_on_contained_entities - ;don't consider cases whose weights should be distributed, since they are all about to be removed - (query_not_in_entity_list case_ids) - (query_nearest_generalized_distance - (get hyperparam_map "k") - (replace features) - ;case id - (current_index 1) - (get hyperparam_map "p") - (get hyperparam_map "featureWeights") - !queryDistanceTypeMap - (get hyperparam_map "featureDomainAttributes") - (get hyperparam_map "featureDeviations") - (null) - (get hyperparam_map "dt") - original_distribute_weight_feature - ;use a fixed random seed to guarantee deterministic behavior for reacts (named "fixed rand seed") - "fixed rand seed" - (null) ;radius - !numericalPrecision + (if redistribute_weights_map + (get redistribute_weights_map (current_index 1)) + + (compute_on_contained_entities + ;don't consider cases whose weights should be distributed, since they are all about to be removed + (query_not_in_entity_list case_ids) + (query_nearest_generalized_distance + (get hyperparam_map "k") + (replace features) + ;case id + (current_index 1) + (get hyperparam_map "p") + (get hyperparam_map "featureWeights") + !queryDistanceTypeMap + (get hyperparam_map "featureDomainAttributes") + (get hyperparam_map "featureDeviations") + (null) + (get hyperparam_map "dt") + original_distribute_weight_feature + ;use a fixed random seed to guarantee deterministic behavior for reacts (named "fixed rand seed") + "fixed rand seed" + (null) ;radius + !numericalPrecision + ) ) ) )) @@ -895,6 +910,10 @@ closest_cases_map (map 1 (filter (lambda (= (current_value) .infinity)) closest_cases_map) ) )) (assign (assoc total_influence (apply "+" (values closest_cases_map)) )) + + (if redistribute_weights_map + (accum (assoc duplicates (current_index 1))) + ) ) ;all cases are equally too distant, set their influence to be same @@ -903,6 +922,15 @@ closest_cases_map (map 1 closest_cases_map) total_influence (size closest_cases_map) )) + + ;if redistributing weights and this case is a duplicate, add it to the list of duplicates + (!= (null) redistribute_weights_map) + (if (and + (= "surprisal_to_prob" (get hyperparam_map "dt")) + (contains_value closest_cases_map 1) + ) + (accum (assoc duplicates (current_index 1))) + ) ) ;output pairs of: [ case_weight, distributed weight closest_cases_map] @@ -985,6 +1013,22 @@ )) ) + ;else redistributing weights to neighbors, by setting the weight directly (not accumulating) + ;and set weight 0 if none is to be redistributed + (size redistribute_weights_map) + ||(map + (lambda + (assign_to_entities (current_index) (associate + distribute_weight_feature (+ (or (last (current_value 1)))) + )) + ) + (zip case_ids) + ;reduce all the closest cases maps into one map of individual case -> total accumulated weight + (call !ReduceAssocsAddValues (assoc + list_of_assocs (map (lambda (last (current_value))) (values distributed_cases_maps)) + )) + ) + ;else no rebalance features, distribute the corresponding portion of this case's weight based on the neighbor's influence ||(map (lambda @@ -997,17 +1041,22 @@ ) ) - ;add the weight accumulated to each case to !dataMassChangeSinceLastAnalyze to ensure that cases trained as - ; only weights (whether through auto-ablation or otherwise) contribute to the progress towards the next auto-analyze, - ; if enabled. - (accum_to_entities (assoc - !dataMassChangeSinceLastAnalyze - ;sum of all case_weight values - (apply "+" (map - (lambda (first (current_value))) - (values distributed_cases_maps) - )) - )) + (if (= (null) redistribute_weights_map) + ;add the weight accumulated to each case to !dataMassChangeSinceLastAnalyze to ensure that cases trained as + ; only weights (whether through auto-ablation or otherwise) contribute to the progress towards the next auto-analyze, + ; if enabled. + (accum_to_entities (assoc + !dataMassChangeSinceLastAnalyze + ;sum of all case_weight values + (apply "+" (map + (lambda (first (current_value))) + (values distributed_cases_maps) + )) + )) + ) + + ;output list of duplicates + duplicates ) ;Helper method to reduce a list of assocs into one assoc with all the values summed up.