diff --git a/howso.amlg b/howso.amlg index b8088939..a51702eb 100644 --- a/howso.amlg +++ b/howso.amlg @@ -83,7 +83,7 @@ #!cyclicFeaturesMap (null) #!numericNominalFeaturesMap (null) #!editDistanceFeatureTypesMap (null) - #!codeFeatureRecursiveMatchingMap (null) + #!codeFeatureDomainAttributesMap (null) #!stringNominalFeaturesSet (null) #!userSpecifiedFeatureErrorsMap (null) #!averageCaseEntropyAddition (null) @@ -467,8 +467,10 @@ ;assoc of all string continuous or any json or amalgam features for fast lookup, feature -> data_type !editDistanceFeatureTypesMap (assoc) - ;assoc of code (json/yaml/amalgam) feature -> boolean (whether they are recursive_matching or not) - !codeFeatureRecursiveMatchingMap (assoc) + ;assoc of code (json/yaml/amalgam) feature -> assoc of feature domain properties with the following keys: + ; "types_must_match", "nominal_numbers", "nominal_strings", "recursive_matching"; + ;string/string_mixable features will have only one key of "recursive_matching" defined + !codeFeatureDomainAttributesMap (assoc) ;assoc of nominal features names whose values are all uniques !uniqueNominalsSet (assoc) diff --git a/howso/attribute_maps.amlg b/howso/attribute_maps.amlg index 7d8c805e..57b064ca 100644 --- a/howso/attribute_maps.amlg +++ b/howso/attribute_maps.amlg @@ -571,20 +571,45 @@ ) ) - #!ComposeCodeFeatureRecursiveMatchingMap + ;helper method executed on any non number continuous feature, ie code features, string and string_mixable + #!ComposeCodeFeatureDomainAttributesMap (map (lambda + ;code features have an explicit feature domain assoc specified, for use in queries and opcodes (if (contains_value ["json" "yaml" "amalgam"] (current_value)) - ;parameter wasn't specified, default to .false for json/yaml and .true for amalgam - (if (= (null) (get feature_attributes [(current_index 1) "recursive_matching"])) - (= "amalgam" (current_value)) + { + "types_must_match" + ;default to true + (if (= (null) (get feature_attributes [(current_index 2) 
"types_must_match"])) + .true + (get feature_attributes [(current_index 2) "types_must_match"]) + ) + "nominal_numbers" + ;default to false + (if (= (null) (get feature_attributes [(current_index 2) "nominal_numbers"])) + .false + (get feature_attributes [(current_index 2) "nominal_numbers"]) + ) + "nominal_strings" + ;default to true + (if (= (null) (get feature_attributes [(current_index 2) "nominal_strings"])) + .true + (get feature_attributes [(current_index 2) "nominal_strings"]) + ) + "recursive_matching" + ;parameter wasn't specified, default to .false for json/yaml and .true for amalgam + (if (= (null) (get feature_attributes [(current_index 2) "recursive_matching"])) + (= "amalgam" (current_value 1)) - ;else use the explicitly specified value - (get feature_attributes [(current_index 1) "recursive_matching"]) - ) + ;else use the explicitly specified value + (get feature_attributes [(current_index 2) "recursive_matching"]) + ) + } - ;false for any non-code features (string or string_mixable) - .false + ;else any non-code features (string or string_mixable) set recursive_matching to false + { + "recursive_matching" .false + } ) ) code_features_map @@ -792,7 +817,7 @@ ) ;Helper method to creata map of feature -> limits, based on the type of feature it is and limits specified in boundaries map - #!ComposeFeatureLimitsMap + #!ComposeFeatureDomainAttributesMap (map (lambda (let (assoc feature (current_index 1)) @@ -808,15 +833,20 @@ ;max string length (and (= "continuous" (get (current_value) "type")) - (= "string" (get (current_value) "data_type")) + (or + (= "string" (get (current_value) "data_type")) + (= "string_mixable" (get (current_value) "data_type")) + ) ) (replace (get !featureBoundsMap (list feature "max"))) - ;max code size - (= "code" (get (current_value) "type")) - (replace (get !featureBoundsMap (list feature "max"))) + ;else "continuous": + + ;code features use an assoc of properties as their feature domain attributes/limits + (contains_value 
["yaml" "json" "amalgam"] (get (current_value) "data_type")) + (get !codeFeatureDomainAttributesMap feature) - ;else "continuous", and min or max are specified, provide as a delta of max - min + ;if min or max are specified, provide as a delta of max - min (if (and (!= (null) (get !featureBoundsMap (list feature "min")) ) diff --git a/howso/attributes.amlg b/howso/attributes.amlg index a884bf59..a9677e61 100644 --- a/howso/attributes.amlg +++ b/howso/attributes.amlg @@ -55,6 +55,16 @@ ; matches without considering recursion, which will yield better and faster results if the schema of the ; semistructured data is not recursive. ; + ; 'types_must_match': boolean, defaults to true, applicable to code features (when 'data_type' is one of json/yaml/amalgam). + ; If true, only considers nodes common if their types match. + ; + ; 'nominal_numbers': boolean, defaults to false, applicable to code features (when 'data_type' is one of json/yaml/amalgam). + ; If true, will assume that all numbers will match only if identical; if false, it will compare similarity of values. + ; + ; 'nominal_strings': boolean, defaults to true, applicable to code features (when 'data_type' is one of json/yaml/amalgam). + ; If true, will assume that all strings will match only if identical; + ; if false uses string edit distance to compare similarity. + ; ; 'id_feature': boolean, Set to true only for nominal features containing nominal IDs to specify that this ; feature should be used to compute case weights for id based privacy. For time series, ; this feature will be used as the id for each time series generation. 
Default is false @@ -217,7 +227,7 @@ ordinal_string_to_ordinal_map (assoc) ordinal_ordinal_to_string_map (assoc) non_number_continuous_features_map (assoc) - code_feature_recursive_matching_map (assoc) + code_feature_domain_attributes_map (assoc) numeric_nominal_features_map (assoc) string_nominal_features_set (assoc) feature_rounding_map (assoc) @@ -311,8 +321,8 @@ (if (size non_number_continuous_features_map) (assign (assoc - code_feature_recursive_matching_map - (call !ComposeCodeFeatureRecursiveMatchingMap (assoc code_features_map non_number_continuous_features_map)) + code_feature_domain_attributes_map + (call !ComposeCodeFeatureDomainAttributesMap (assoc code_features_map non_number_continuous_features_map)) )) ) @@ -600,7 +610,7 @@ !ordinalStringToOrdinalMap ordinal_string_to_ordinal_map !ordinalOrdinalToStringMap ordinal_ordinal_to_string_map !editDistanceFeatureTypesMap non_number_continuous_features_map - !codeFeatureRecursiveMatchingMap code_feature_recursive_matching_map + !codeFeatureDomainAttributesMap code_feature_domain_attributes_map !numericNominalFeaturesMap numeric_nominal_features_map !stringNominalFeaturesSet string_nominal_features_set !novelSubstitionFeatureSet novel_substition_feature_set @@ -625,7 +635,7 @@ (call !SetNominalFeatures (assoc nominal_features nominals)) (call !SetCyclicFeatures (assoc feature_attributes cyclics_map)) - (declare (assoc feature_limits_map (call !ComposeFeatureLimitsMap) )) + (declare (assoc feature_limits_map (call !ComposeFeatureDomainAttributesMap) )) (declare (assoc updated_hp_map (call !UpdateHyperparametersWithFeatureDomainAttributes (assoc hp_map !hyperparameterMetadataMap)) updated_default_hp_map (call !UpdateHyperparametersWithFeatureDomainAttributes (assoc hp_map !defaultHyperparameters)) diff --git a/howso/contributions.amlg b/howso/contributions.amlg index 5a226ed3..c905b19a 100644 --- a/howso/contributions.amlg +++ b/howso/contributions.amlg @@ -58,8 +58,12 @@ feature_contributions_map (assoc) 
num_training_cases (call !GetNumTrainingCases) edit_distance_action_feature (contains_index !editDistanceFeatureTypesMap action_feature) - is_string_mixable (= "string_mixable" (get !editDistanceFeatureTypesMap action_feature)) - is_recursive_matching (get !codeFeatureRecursiveMatchingMap action_feature) + code_feature_attributes_map (get !codeFeatureDomainAttributesMap action_feature) + edit_distance_feature_attributes_map + (append + { "use_string_edit_distance" (= "string_mixable" (get !editDistanceFeatureTypesMap action_feature)) } + (get !codeFeatureDomainAttributesMap action_feature) + ) ;store an assoc of lag/rate/delta feature -> lag/order amount for time series flows ts_feature_lag_amount_map (if !tsTimeFeature (call !BuildTSFeatureLagAmountMap)) max_lag_index_value (null) @@ -455,17 +459,17 @@ (call !CombineCode (assoc sources feature_reactions_with weights weights_with - is_recursive_matching is_recursive_matching + code_feature_attributes_map code_feature_attributes_map )) mixed_without (call !CombineCode (assoc sources feature_reactions_without weights weights_without - is_recursive_matching is_recursive_matching + code_feature_attributes_map code_feature_attributes_map )) ) (list - (edit_distance mixed_with mixed_without is_string_mixable is_recursive_matching) + (edit_distance mixed_with mixed_without edit_distance_feature_attributes_map) (difference mixed_with mixed_without) ) )) @@ -639,8 +643,7 @@ (edit_distance (get reaction_with (list "action_values" 0)) (get reaction_without (list "action_values" 0)) - is_string_mixable - is_recursive_matching + edit_distance_feature_attributes_map ) ;TODO: 17356, deal with averaging out differences (difference diff --git a/howso/details.amlg b/howso/details.amlg index d7cc100a..5acf84d5 100644 --- a/howso/details.amlg +++ b/howso/details.amlg @@ -602,7 +602,7 @@ (call !CombineCode (assoc sources neighbor_feature_values weights weights - is_recursive_matching (get !codeFeatureRecursiveMatchingMap (current_value 
1)) + code_feature_attributes_map (get !codeFeatureDomainAttributesMap (current_value 1)) )) )) diff --git a/howso/influences.amlg b/howso/influences.amlg index 27258cca..8d52d579 100644 --- a/howso/influences.amlg +++ b/howso/influences.amlg @@ -624,11 +624,16 @@ (edit_distance (get expected_values index) (get reaction_values ["action_values" index]) - (or - (= "string" (get !editDistanceFeatureTypesMap action_feature)) - (= "string_mixable" (get !editDistanceFeatureTypesMap action_feature)) + (append + { + "use_string_edit_distance" + (or + (= "string" (get !editDistanceFeatureTypesMap action_feature)) + (= "string_mixable" (get !editDistanceFeatureTypesMap action_feature)) + ) + } + (get !codeFeatureDomainAttributesMap action_feature) ) - (get !codeFeatureRecursiveMatchingMap action_feature) ) (abs (- diff --git a/howso/react_discriminative.amlg b/howso/react_discriminative.amlg index e6516e1e..c1e93a65 100644 --- a/howso/react_discriminative.amlg +++ b/howso/react_discriminative.amlg @@ -706,7 +706,7 @@ (call !CombineCode (assoc sources candidate_case_values weights candidate_case_weights - is_recursive_matching (get !codeFeatureRecursiveMatchingMap action_feature) + code_feature_attributes_map (get !codeFeatureDomainAttributesMap action_feature) )) ;divide the dot product by the total weight @@ -768,7 +768,7 @@ sources (list) weights (list) similar_mix_chance 0 - is_recursive_matching .false + code_feature_attributes_map {} ) ;compute accumed_weights by adding up the total probability mass seen so far @@ -791,6 +791,11 @@ ) weights ) + domain_attributes_map + (append + {"similar_mix_chance" similar_mix_chance} + code_feature_attributes_map + ) )) (reduce @@ -809,7 +814,7 @@ frac_b (/ (get weights (current_index 1)) prob_mass) )) - (mix (previous_result) (current_value) frac_a frac_b similar_mix_chance is_recursive_matching) + (mix (previous_result) (current_value) frac_a frac_b domain_attributes_map) )) sources ) diff --git a/howso/residuals.amlg 
b/howso/residuals.amlg index af79b849..0dc1ff7c 100644 --- a/howso/residuals.amlg +++ b/howso/residuals.amlg @@ -1338,11 +1338,16 @@ (edit_distance case_feature_value interpolated_value - (or - (= "string" (get !editDistanceFeatureTypesMap feature)) - (= "string_mixable" (get !editDistanceFeatureTypesMap feature)) + (append + { + "use_string_edit_distance" + (or + (= "string" (get !editDistanceFeatureTypesMap feature)) + (= "string_mixable" (get !editDistanceFeatureTypesMap feature)) + ) + } + (get !codeFeatureDomainAttributesMap feature) ) - (get !codeFeatureRecursiveMatchingMap feature) ) (abs (- case_feature_value interpolated_value)) diff --git a/howso/synthesis_utilities.amlg b/howso/synthesis_utilities.amlg index b14037e6..24976109 100644 --- a/howso/synthesis_utilities.amlg +++ b/howso/synthesis_utilities.amlg @@ -905,7 +905,6 @@ case_index 1 original_regional_feature_values_map (assoc) is_string_mixable .false - is_recursive_matching (get !codeFeatureRecursiveMatchingMap feature) ) ;explode all the strings to treat them as lists @@ -975,11 +974,21 @@ ) ) + (declare (assoc + string_feature_domain_attributes_map + (append + {"use_string_edit_distance" is_string_mixable} + (get !codeFeatureDomainAttributesMap feature) + ) + )) + ;create assoc of case id -> edit distance (declare (assoc regional_edit_distances_map (map - (lambda (edit_distance (current_value) intersected_regional_value is_string_mixable is_recursive_matching)) + (lambda + (edit_distance (current_value) intersected_regional_value string_feature_domain_attributes_map) + ) regional_feature_values_map ) )) @@ -997,7 +1006,9 @@ (assoc local_edit_distances (map - (lambda (edit_distance (current_value) intersected_local_value is_string_mixable is_recursive_matching)) + (lambda + (edit_distance (current_value) intersected_local_value string_feature_domain_attributes_map) + ) (unzip regional_feature_values_map local_case_ids) ) ) diff --git a/unit_tests/ut_h_clustering.amlg 
b/unit_tests/ut_h_clustering.amlg index f8b1473d..a63788d3 100644 --- a/unit_tests/ut_h_clustering.amlg +++ b/unit_tests/ut_h_clustering.amlg @@ -196,6 +196,7 @@ (commonality cluster_1_expected_indices_map cluster_1_clustered_indices_map + { "recursive_matching" .false } ) (+ 1 (max (size cluster_1_clustered_indices_map) (size cluster_1_expected_indices_map)) ) ) diff --git a/unit_tests/ut_h_edit_dist_features.amlg b/unit_tests/ut_h_edit_dist_features.amlg index c7099e78..496ce81c 100644 --- a/unit_tests/ut_h_edit_dist_features.amlg +++ b/unit_tests/ut_h_edit_dist_features.amlg @@ -18,13 +18,30 @@ (print "Set 'recursive_matching' flag correctly for each feature: ") (call assert_same (assoc - obs (call_entity "howso" "debug_label" (assoc label "!codeFeatureRecursiveMatchingMap")) + obs (call_entity "howso" "debug_label" (assoc label "!codeFeatureDomainAttributesMap")) exp { - amalgam .true - json .false - string .false - yaml .false + amalgam { + nominal_numbers .false + nominal_strings .true + ;should be true by default, unlike json and yaml + recursive_matching .true + types_must_match .true + } + json { + nominal_numbers .false + nominal_strings .true + recursive_matching .false + types_must_match .true + } + ;only recursive_matching should be set + string {recursive_matching .false} + yaml { + nominal_numbers .false + nominal_strings .true + recursive_matching .false + types_must_match .true + } } )) diff --git a/version.json b/version.json index ba6391a9..d9d46c32 100644 --- a/version.json +++ b/version.json @@ -1,6 +1,6 @@ { "version": "0.0.0", "dependencies": { - "amalgam": "71.1.0" + "amalgam": "72.0.0" } }