7
7
8
8
from .constants import (
9
9
SIGNALS ,
10
+ PROVIDER_NORMS ,
10
11
METRIC_SIGNALS ,
11
12
METRIC_DATES ,
12
13
SAMPLE_SITE_NAMES ,
@@ -28,7 +29,7 @@ def sig_digit_round(value, n_digits):
28
29
sign_mask = value < 0
29
30
value [sign_mask ] *= - 1
30
31
exponent = np .ceil (np .log10 (value ))
31
- result = 10 ** exponent * np .round (value * 10 ** (- exponent ), n_digits )
32
+ result = 10 ** exponent * np .round (value * 10 ** (- exponent ), n_digits )
32
33
result [sign_mask ] *= - 1
33
34
result [zero_mask ] = in_value [zero_mask ]
34
35
return result
@@ -60,21 +61,66 @@ def warn_string(df, type_dict):
60
61
"""
61
62
62
63
63
def reformat(df, df_metric):
    """Carry population and METRIC_SIGNALS columns from df_metric into df.

    Selects key_plot_id, date_start, population_served, and the
    METRIC_SIGNALS columns from df_metric, renames date_start to
    timestamp to match df, and joins on (key_plot_id, timestamp).
    Returns the joined frame with a flat integer index.
    """
    # keep only the metric columns we need to carry over
    wanted = ["key_plot_id", "date_start", "population_served", *METRIC_SIGNALS]
    metric_subset = (
        df_metric.loc[:, wanted]
        .rename(columns={"date_start": "timestamp"})
        .set_index(["key_plot_id", "timestamp"])
    )
    # align the two frames on site id and date, then merge
    joined = df.set_index(["key_plot_id", "timestamp"]).join(metric_subset)
    return joined.reset_index()
75
81
76
82
77
- def pull_nwss_data (socrata_token : str ):
83
def drop_unnormalized(df):
    """Return df restricted to rows with a known normalization scheme.

    Rows whose "normalization" column is NA cannot be classified into a
    signal kind, so they are filtered out. The input frame is returned
    filtered, not modified in place.
    """
    has_norm = df["normalization"].notna()
    return df[has_norm]
90
+
91
+
92
def add_identifier_columns(df):
    """Derive finer-grained identifier columns from key_plot_id, in place.

    Adds "state" (the two-letter state code embedded in key_plot_id),
    "provider" (everything preceding that state code), and "signal_name"
    (the provider joined with the normalization scheme).
    """
    key_ids = df.key_plot_id.str
    # two word characters fenced by underscores, e.g. "_al_"
    df["state"] = key_ids.extract(r"_(\w\w)_")
    # greedy prefix up to the two-letter state code
    df["provider"] = key_ids.extract(r"(.*)_[a-z]{2}_")
    df["signal_name"] = df.provider + "_" + df.normalization
105
+
106
+
107
def check_endpoints(df):
    """Raise ValueError if the data contains provider/norm pairs not in PROVIDER_NORMS."""
    # distinct (provider, normalization) pairs observed in the data,
    # ordered the same way PROVIDER_NORMS is
    observed = (
        df.loc[:, ["provider", "normalization"]]
        .drop_duplicates()
        .sort_values(["provider", "normalization"])
        .reset_index(drop=True)
    )
    expected = pd.DataFrame(PROVIDER_NORMS)
    if not observed.equals(expected):
        raise ValueError(
            f"There are new providers and/or norms. They are\n{observed}"
        )
121
+
122
+
123
+ def pull_nwss_data (token : str ):
78
124
"""Pull the latest NWSS Wastewater data, and conforms it into a dataset.
79
125
80
126
The output dataset has:
@@ -95,13 +141,15 @@ def pull_nwss_data(socrata_token: str):
95
141
pd.DataFrame
96
142
Dataframe as described above.
97
143
"""
144
+ # Constants
145
+ keep_columns = [* SIGNALS , * METRIC_SIGNALS ]
98
146
# concentration key types
99
147
type_dict , type_dict_metric = construct_typedicts ()
100
148
101
149
# Pull data from Socrata API
102
- client = Socrata ("data.cdc.gov" , socrata_token )
103
- results_concentration = client .get ("g653-rqe2" , limit = 10 ** 10 )
104
- results_metric = client .get ("2ew6-ywp6" , limit = 10 ** 10 )
150
+ client = Socrata ("data.cdc.gov" , token )
151
+ results_concentration = client .get ("g653-rqe2" , limit = 10 ** 10 )
152
+ results_metric = client .get ("2ew6-ywp6" , limit = 10 ** 10 )
105
153
df_metric = pd .DataFrame .from_records (results_metric )
106
154
df_concentration = pd .DataFrame .from_records (results_concentration )
107
155
df_concentration = df_concentration .rename (columns = {"date" : "timestamp" })
@@ -116,19 +164,29 @@ def pull_nwss_data(socrata_token: str):
116
164
except KeyError as exc :
117
165
raise ValueError (warn_string (df_metric , type_dict_metric )) from exc
118
166
167
+ # if the normalization scheme isn't recorded, why is it even included as a sample site?
168
+ df = drop_unnormalized (df_concentration )
119
169
# pull 2 letter state labels out of the key_plot_id labels
120
- df_concentration [ "state" ] = df_concentration . key_plot_id . str . extract ( r"_(\w\w)_" )
170
+ add_identifier_columns ( df )
121
171
172
+ # move population and metric signals over to df
173
+ df = reformat (df , df_metric )
122
174
# round out some of the numeric noise that comes from smoothing
123
- df_concentration [SIGNALS [0 ]] = sig_digit_round (
124
- df_concentration [SIGNALS [0 ]], SIG_DIGITS
125
- )
175
+ for signal in [* SIGNALS , * METRIC_SIGNALS ]:
176
+ df [signal ] = sig_digit_round (df [signal ], SIG_DIGITS )
126
177
127
- df_concentration = add_population (df_concentration , df_metric )
128
178
# if there are population NA's, assume the previous value is accurate (most
129
179
# likely introduced by dates only present in one and not the other; even
130
180
# otherwise, best to assume some value rather than break the data)
131
- df_concentration .population_served = df_concentration .population_served .ffill ()
132
-
133
- keep_columns = ["timestamp" , "state" , "population_served" ]
134
- return df_concentration [SIGNALS + keep_columns ]
181
+ df .population_served = df .population_served .ffill ()
182
+ check_endpoints (df )
183
+ keep_columns .extend (
184
+ [
185
+ "timestamp" ,
186
+ "state" ,
187
+ "population_served" ,
188
+ "normalization" ,
189
+ "provider" ,
190
+ ]
191
+ )
192
+ return df [keep_columns ]
0 commit comments