diff --git a/howso/client/base.py b/howso/client/base.py index 9a16c10a..ae9cffc9 100644 --- a/howso/client/base.py +++ b/howso/client/base.py @@ -1681,6 +1681,7 @@ def react( # noqa: C901 feature_bounds_map: t.Optional[Mapping] = None, feature_pre_process_code_map: t.Optional[Mapping] = None, feature_post_process_code_map: t.Optional[Mapping] = None, + filter_fanout_values: bool = False, generate_new_cases: GenerateNewCases = "no", goal_features_map: t.Optional[Mapping] = None, initial_batch_size: t.Optional[int] = None, @@ -2202,7 +2203,10 @@ def react( # noqa: C901 resulting value will be used as part of the context for following action features. The custom code will have access to all context feature values and previously generated action feature values. - + filter_fanout_values : bool, default False When true, predictions of features with fanned out values will be made while holding out other cases that have the same duplicated values. generate_new_cases : {"always", "attempt", "no"}, default "no" (Optional) Whether to generate new cases. @@ -2411,6 +2415,7 @@ def react( # noqa: C901 "derived_action_features": derived_action_features, "feature_pre_process_code_map": feature_pre_process_code_map, "feature_post_process_code_map": feature_post_process_code_map, + "filter_fanout_values": filter_fanout_values, "goal_features_map": goal_features_map, "post_process_features": post_process_features, "post_process_values": post_process_values, @@ -2457,6 +2462,7 @@ def react( # noqa: C901 "derived_action_features": derived_action_features, "feature_pre_process_code_map": feature_pre_process_code_map, "feature_post_process_code_map": feature_post_process_code_map, + "filter_fanout_values": filter_fanout_values, "post_process_features": post_process_features, "post_process_values": post_process_values, "use_differential_privacy": use_differential_privacy, @@ -2764,6 +2770,7 @@ def react_series( # noqa: C901 exclude_novel_nominals_from_uniqueness_check: bool = False, feature_bounds_map: t.Optional[Mapping[str, Mapping[str, t.Any]]] = None, feature_post_process_code_map: t.Optional[Mapping] = None, + filter_fanout_values: bool = False, final_time_steps: t.Optional[list[t.Any]] = None, generate_new_cases: GenerateNewCases = "no", goal_features_map: t.Optional[Mapping] = None, @@ -2897,6 +2904,10 @@ def react_series( # noqa: C901 feature values and previously generated action feature values of the timestep being generated, as well as the feature values of all previously generated timesteps. + filter_fanout_values : bool, default False When true, predictions of features with fanned out values will be made while holding out other cases that have the same duplicated values. series_context_features : iterable of str, optional List of context features corresponding to ``series_context_values``.
series_context_values : list of list of list of object or list of DataFrame, optional @@ -3130,6 +3141,7 @@ def react_series( # noqa: C901 "constraints": constraints, "continue_series": continue_series, "feature_post_process_code_map": feature_post_process_code_map, + "filter_fanout_values": filter_fanout_values, "final_time_steps": final_time_steps, "init_time_steps": init_time_steps, "series_stop_maps": series_stop_maps, @@ -3183,6 +3195,7 @@ def react_series( # noqa: C901 "constraints": constraints, "continue_series": continue_series, "feature_post_process_code_map": feature_post_process_code_map, + "filter_fanout_values": filter_fanout_values, "final_time_steps": final_time_steps, "init_time_steps": init_time_steps, "series_stop_maps": series_stop_maps, @@ -3689,6 +3702,7 @@ def react_aggregate( # noqa: C901 convergence_threshold: t.Optional[float] = None, features_to_derive: t.Optional[Collection[str]] = None, feature_influences_action_feature: t.Optional[str] = None, + filter_fanout_values: bool = False, forecast_window_length: t.Optional[float] = None, goal_dependent_features: t.Optional[Collection[str]] = None, goal_features_map: t.Optional[Mapping] = None, @@ -3711,7 +3725,7 @@ def react_aggregate( # noqa: C901 value_robust_contributions_features: t.Optional[Collection[str]] = None, value_robust_contributions_num_buckets: int = 30, value_robust_contributions_min_samples: int = 15, - value_robust_contributions_min_cases: int = 15, + value_robust_contributions_min_cases: int | dict[str, int] = 15, weight_feature: t.Optional[str] = None, ) -> dict[str, dict[str, t.Any]]: """ @@ -3914,6 +3928,10 @@ def react_aggregate( # noqa: C901 feature_influences_action_feature : str, optional When computing feature influences such as accuracy and prediction contributions, use this feature as the action feature. If feature influences ``details`` are selected, this feature must be provided. + filter_fanout_values : bool, default False When true, predictions of features with fanned out values will be made while holding out other cases that have the same duplicated values. forecast_window_length : float, optional A value specifying a length of time over which to measure the accuracy of forecasts. When specified, returned prediction statistics and full residuals will be measuring the accuracy @@ -4044,11 +4062,12 @@ def react_aggregate( # noqa: C901 The minimum number of samples required for a combination of feature values for its aggregated measure to be returned when computing the "value_robust_accuracy_contributions", "value_robust_prediction_contributions" or "value_robust_surprisal_asymmetry" details. - value_robust_contributions_min_cases: int, default 15 + value_robust_contributions_min_cases: int or map of str to int, default 15 The minimum number of unique cases for a given nominal class or continuous bucket to be used as a possible feature value when collecting all combinations of feature values in - the data to report metrics over. If unspecified, there is no filtering based on number - of unique cases. + the data to report metrics over. May be specified as a single value or a mapping of feature names to + values defining individual thresholds for each feature. If defined as a mapping, then any features without + defined thresholds will use a default value of 15. weight_feature : str, optional The name of feature whose values to use as case weights. When left unspecified uses the internally managed case weight.
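For illustration, a minimal sketch of how the per-feature form of value_robust_contributions_min_cases and the new filter_fanout_values flag could be passed, using the engine-level Trainee.react_aggregate wrapper rather than the abstract client above; the feature names and the already-trained trainee object are hypothetical:

    # Assumes `trainee` is an already-trained howso.engine.Trainee whose data
    # includes "age", "region", and "outcome" features (hypothetical names).
    results = trainee.react_aggregate(
        details={"value_robust_prediction_contributions": True},
        feature_influences_action_feature="outcome",
        value_robust_contributions_features=["age", "region"],
        # Per-feature thresholds; any feature not listed falls back to the default of 15.
        value_robust_contributions_min_cases={"age": 25, "region": 10},
        filter_fanout_values=True,
    )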
@@ -4107,6 +4126,7 @@ def react_aggregate( # noqa: C901 "convergence_threshold": convergence_threshold, "features_to_derive": features_to_derive, "feature_influences_action_feature": feature_influences_action_feature, + "filter_fanout_values": filter_fanout_values, "forecast_window_length": forecast_window_length, "goal_dependent_features": goal_dependent_features, "goal_features_map": goal_features_map, diff --git a/howso/client/typing.py b/howso/client/typing.py index 57052e68..0e1fb90f 100644 --- a/howso/client/typing.py +++ b/howso/client/typing.py @@ -242,7 +242,6 @@ class FeatureTimeSeries(TypedDict, total=False): the default. """ - class FeatureAttributes(TypedDict): """ Attributes for a single feature. @@ -344,6 +343,14 @@ class FeatureAttributes(TypedDict): on values based on other multi-type value features. """ + fanout_on: NotRequired[list[str]] """ Features whose values can be used to select other cases that have the same duplicated value for this fan-out feature. Should be used when this is a fan-out feature. """ + derived_feature_code: NotRequired[str] """ Code defining how to derive this feature's value. diff --git a/howso/engine/trainee.py b/howso/engine/trainee.py index a413d05e..3d9ed479 100644 --- a/howso/engine/trainee.py +++ b/howso/engine/trainee.py @@ -1255,6 +1255,7 @@ def react( feature_bounds_map: t.Optional[Mapping[str, Mapping[str, t.Any]]] = None, feature_pre_process_code_map: t.Optional[Mapping] = None, feature_post_process_code_map: t.Optional[Mapping] = None, + filter_fanout_values: bool = False, generate_new_cases: GenerateNewCases = "no", goal_features_map: t.Optional[Mapping] = None, initial_batch_size: t.Optional[int] = None, @@ -1723,7 +1724,10 @@ def react( resulting value will be used as part of the context for following action features. The custom code will have access to all context feature values and previously generated action feature values. - + filter_fanout_values : bool, default False When true, predictions of features with fanned out values will be made while holding out other cases that have the same duplicated values. generate_new_cases : {"always", "attempt", "no"}, default "no" This parameter takes in a string that may be one of the following: @@ -1868,6 +1872,7 @@ def react( feature_bounds_map=feature_bounds_map, feature_pre_process_code_map=feature_pre_process_code_map, feature_post_process_code_map=feature_post_process_code_map, + filter_fanout_values=filter_fanout_values, generate_new_cases=generate_new_cases, goal_features_map=goal_features_map, initial_batch_size=initial_batch_size, @@ -1906,6 +1911,7 @@ def react_series( exclude_novel_nominals_from_uniqueness_check: bool = False, feature_bounds_map: t.Optional[Mapping[str, Mapping[str, t.Any]]] = None, feature_post_process_code_map: t.Optional[Mapping] = None, + filter_fanout_values: bool = False, final_time_steps: t.Optional[list[t.Any]] = None, generate_new_cases: GenerateNewCases = "no", goal_features_map: t.Optional[Mapping] = None, @@ -2017,6 +2023,10 @@ def react_series( feature values and previously generated action feature values of the time-step being generated, as well as the feature values of all previously generated time-steps. + filter_fanout_values : bool, default False When true, predictions of features with fanned out values will be made while holding out other cases that have the same duplicated values. final_time_steps: list of object, optional The time steps at which to end synthesis. Time-series only.
Must provide either one for all series, or @@ -2157,6 +2167,7 @@ def react_series( exclude_novel_nominals_from_uniqueness_check=exclude_novel_nominals_from_uniqueness_check, feature_bounds_map=feature_bounds_map, feature_post_process_code_map=feature_post_process_code_map, + filter_fanout_values=filter_fanout_values, final_time_steps=final_time_steps, generate_new_cases=generate_new_cases, goal_features_map=goal_features_map, @@ -3582,6 +3593,7 @@ def react_aggregate( convergence_threshold: t.Optional[float] = None, features_to_derive: t.Optional[Collection[str]] = None, feature_influences_action_feature: t.Optional[str] = None, + filter_fanout_values: bool = False, forecast_window_length: t.Optional[float] = None, goal_dependent_features: t.Optional[Collection[str]] = None, goal_features_map: t.Optional[Mapping] = None, @@ -3604,7 +3616,7 @@ def react_aggregate( value_robust_contributions_features: t.Optional[Collection[str]] = None, value_robust_contributions_num_buckets: int = 30, value_robust_contributions_min_samples: int = 15, - value_robust_contributions_min_cases: int = 15, + value_robust_contributions_min_cases: int | dict[str, int] = 15, weight_feature: t.Optional[str] = None, ) -> AggregateReaction: """ @@ -3811,6 +3823,10 @@ def react_aggregate( not providing this feature will return a matrix where each feature is used as an action feature. However, providing this feature if 'feature_robust_accuracy_contributions' is selected is still accepted, and will return just the feature influences for the selected feature. + filter_fanout_values : bool, default False When true, predictions of features with fanned out values will be made while holding out other cases that have the same duplicated values. forecast_window_length : float, optional A value specifying a length of time over which to measure the accuracy of forecasts. When specified, returned prediction statistics and full residuals will be measuring the accuracy @@ -3941,11 +3957,12 @@ def react_aggregate( The minimum number of samples required for a combination of feature values for its aggregated measure to be returned when computing the "value_robust_accuracy_contributions", "value_robust_prediction_contributions" or "value_robust_surprisal_asymmetry" details. - value_robust_contributions_min_cases: int, default 15 + value_robust_contributions_min_cases: int or map of str to int, default 15 The minimum number of unique cases for a given nominal class or continuous bucket to be used as a possible feature value when collecting all combinations of feature values in - the data to report metrics over. If unspecified, there is no filtering based on number - of unique cases. + the data to report metrics over. May be specified as a single value or a mapping of feature names to + values defining individual thresholds for each feature. If defined as a mapping, then any features without + defined thresholds will use a default value of 15. weight_feature : str, optional The name of feature whose values to use as case weights. When left unspecified uses the internally managed case weight.
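A usage sketch of the Trainee-level react call documented above; the feature names, context values, and the already-trained trainee object are hypothetical, and the fan-out configuration itself is shown with the infer_feature_attributes example further below:

    # Assumes `trainee` was trained on data where "order_total" was configured as a
    # fan-out feature keyed on "order_id" (hypothetical names).
    reaction = trainee.react(
        contexts=[["2024-03-01", "US-East"]],
        context_features=["order_date", "region"],
        action_features=["order_total"],
        # Hold out the other cases carrying the same duplicated fan-out value
        # while predicting it.
        filter_fanout_values=True,
    )
    print(reaction["action"])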
@@ -3967,6 +3984,7 @@ def react_aggregate( convergence_threshold=convergence_threshold, features_to_derive=features_to_derive, feature_influences_action_feature=feature_influences_action_feature, + filter_fanout_values=filter_fanout_values, forecast_window_length=forecast_window_length, goal_dependent_features=goal_dependent_features, goal_features_map=goal_features_map, diff --git a/howso/utilities/feature_attributes/base.py b/howso/utilities/feature_attributes/base.py index 22f170d9..b7ed13ac 100644 --- a/howso/utilities/feature_attributes/base.py +++ b/howso/utilities/feature_attributes/base.py @@ -763,6 +763,7 @@ def _process(self, # noqa: C901 datetime_feature_formats: t.Optional[dict] = None, default_time_zone: t.Optional[str] = None, dependent_features: t.Optional[dict[str, list[str]]] = None, + fanout_feature_map: t.Optional[dict[tuple[str] | str, list[str]]] = None, id_feature_name: t.Optional[str | Iterable[str]] = None, include_extended_nominal_probabilities: t.Optional[bool] = False, include_sample: bool = False, @@ -1108,6 +1109,15 @@ def _process(self, # noqa: C901 # Validate datetimes after any user-defined features have been re-implemented self._validate_date_times() + # Configure the fanout feature attributes according to the input if given. + if fanout_feature_map: + for key_features, fanout_features in fanout_feature_map.items(): + if isinstance(key_features, str): + key_features = [key_features] + for f in fanout_features: + if f in self.attributes: + self.attributes[f]['fanout_on'] = list(key_features) + # Re-order the keys like the original dataframe ordered_attributes = {} for fname in self.data.columns: @@ -1115,7 +1125,7 @@ def _process(self, # noqa: C901 if hasattr(fname, 'name'): fname = fname.name if fname not in self.attributes.keys(): - warnings.warn(f'Feature {fname} exists in provided data but was not computed in feature attributes') + warnings.warn(f'Feature {fname} exists in provided data but was not computed in feature attributes.') continue ordered_attributes[fname] = self.attributes[fname] diff --git a/howso/utilities/feature_attributes/infer_feature_attributes.py b/howso/utilities/feature_attributes/infer_feature_attributes.py index 33806caf..73ba26a1 100644 --- a/howso/utilities/feature_attributes/infer_feature_attributes.py +++ b/howso/utilities/feature_attributes/infer_feature_attributes.py @@ -119,6 +119,11 @@ def infer_feature_attributes(data: pd.DataFrame | SQLRelationalDatastoreProtocol to 2 will synthesize the 3rd order derivative value, and then use that synthed value to derive the 2nd and 1st order. + fanout_feature_map : dict of str or tuple of str to list of str, optional + (Optional) Dict mapping "key" feature names or tuples of "key" feature names to list of "fanout" feature names. + Fanout features are features with values fanned out across multiple cases. Key features are features + whose values can be used to select groups of cases that have the same duplicated fanout values. + id_feature_name : str or list of str, default None (Optional) The name(s) of the ID feature(s). 
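A minimal sketch of the new fanout_feature_map argument described above, assuming the usual top-level import of infer_feature_attributes and hypothetical order data in which order-level values are repeated (fanned out) across every line item of the same order:

    import pandas as pd

    from howso.utilities import infer_feature_attributes

    df = pd.DataFrame({
        "order_id": [1, 1, 1, 2, 2],
        "line_item": ["a", "b", "c", "a", "d"],
        "customer_id": [10, 10, 10, 11, 11],
        "order_total": [30.0, 30.0, 30.0, 12.5, 12.5],
    })

    # Map the "key" feature to the features whose values fan out across cases.
    features = infer_feature_attributes(
        df,
        fanout_feature_map={"order_id": ["order_total", "customer_id"]},
    )

    # Each listed fan-out feature is annotated with the key feature(s) it fans out on.
    assert features["order_total"]["fanout_on"] == ["order_id"]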
diff --git a/howso/utilities/feature_attributes/time_series.py b/howso/utilities/feature_attributes/time_series.py index 3358598d..072cf2dc 100644 --- a/howso/utilities/feature_attributes/time_series.py +++ b/howso/utilities/feature_attributes/time_series.py @@ -330,6 +330,7 @@ def _process( # noqa: C901 delta_boundaries: t.Optional[dict] = None, dependent_features: t.Optional[dict] = None, derived_orders: t.Optional[dict] = None, + fanout_feature_map: t.Optional[dict[str | tuple[str], list[str]]] = None, id_feature_name: t.Optional[str | Iterable[str]] = None, include_extended_nominal_probabilities: t.Optional[bool] = False, include_sample: bool = False, @@ -448,6 +449,11 @@ def _process( # noqa: C901 to 2 will synthesize the 3rd order derivative value, and then use that synthed value to derive the 2nd and 1st order. + fanout_feature_map : dict of str or tuple of str to list of str, optional + (Optional) Dict mapping "key" feature names or tuples of "key" feature names to list of "fanout" feature names. + Fanout features are features with values fanned out across multiple cases. Key features are features + whose values can be used to select groups of cases that have the same duplicated fanout values. + id_feature_name : str or list of str default None (Optional) The name(s) of the ID feature(s). @@ -633,6 +639,7 @@ def _process( # noqa: C901 datetime_feature_formats=datetime_feature_formats, default_time_zone=default_time_zone, dependent_features=dependent_features, + fanout_feature_map=fanout_feature_map, id_feature_name=id_feature_name, include_extended_nominal_probabilities=include_extended_nominal_probabilities, include_sample=include_sample, diff --git a/version.json b/version.json index b41e4d8f..bb025638 100644 --- a/version.json +++ b/version.json @@ -1,5 +1,5 @@ { "dependencies": { - "howso-engine": "110.3.0" + "howso-engine": "110.5.0" } }