7
7
8
8
from .constants import (
9
9
SIGNALS ,
10
+ PROVIDER_NORMS ,
10
11
METRIC_SIGNALS ,
11
12
METRIC_DATES ,
12
13
SAMPLE_SITE_NAMES ,
@@ -28,7 +29,7 @@ def sig_digit_round(value, n_digits):
28
29
sign_mask = value < 0
29
30
value [sign_mask ] *= - 1
30
31
exponent = np .ceil (np .log10 (value ))
31
- result = 10 ** exponent * np .round (value * 10 ** (- exponent ), n_digits )
32
+ result = 10 ** exponent * np .round (value * 10 ** (- exponent ), n_digits )
32
33
result [sign_mask ] *= - 1
33
34
result [zero_mask ] = in_value [zero_mask ]
34
35
return result
@@ -60,21 +61,66 @@ def warn_string(df, type_dict):
60
61
"""
61
62
62
63
63
def reformat(df, df_metric):
    """Carry population and METRIC_SIGNALS columns from df_metric into df.

    Selects key_plot_id, date_start, population_served, and the
    METRIC_SIGNALS columns from df_metric, renames date_start to
    timestamp to match df, and joins on (key_plot_id, timestamp).
    Returns the joined frame with a flat integer index.
    """
    # keep only the metric columns we need to carry over
    wanted = ["key_plot_id", "date_start", "population_served", *METRIC_SIGNALS]
    metric_subset = (
        df_metric.loc[:, wanted]
        .rename(columns={"date_start": "timestamp"})
        .set_index(["key_plot_id", "timestamp"])
    )
    # align the two frames on site id and date, then merge
    joined = df.set_index(["key_plot_id", "timestamp"]).join(metric_subset)
    return joined.reset_index()
75
81
76
82
77
- def pull_nwss_data (socrata_token : str ):
83
def drop_unnormalized(df):
    """Return df restricted to rows with a known normalization scheme.

    Rows whose "normalization" column is NA cannot be classified into a
    signal kind, so they are filtered out. The input frame is returned
    filtered, not modified in place.
    """
    has_norm = df["normalization"].notna()
    return df[has_norm]
90
+
91
+
92
def add_identifier_columns(df):
    """Derive finer-grained identifier columns from key_plot_id, in place.

    Adds "state" (the two-letter state code embedded in key_plot_id),
    "provider" (everything preceding that state code), and "signal_name"
    (the provider joined with the normalization scheme).
    """
    key_ids = df.key_plot_id.str
    # two word characters fenced by underscores, e.g. "_al_"
    df["state"] = key_ids.extract(r"_(\w\w)_")
    # greedy prefix up to the two-letter state code
    df["provider"] = key_ids.extract(r"(.*)_[a-z]{2}_")
    df["signal_name"] = df.provider + "_" + df.normalization
105
+
106
+
107
def check_endpoints(df):
    """Raise ValueError if the data contains provider/norm pairs not in PROVIDER_NORMS."""
    # distinct (provider, normalization) pairs observed in the data,
    # ordered the same way PROVIDER_NORMS is
    observed = (
        df.loc[:, ["provider", "normalization"]]
        .drop_duplicates()
        .sort_values(["provider", "normalization"])
        .reset_index(drop=True)
    )
    expected = pd.DataFrame(PROVIDER_NORMS)
    if not observed.equals(expected):
        raise ValueError(
            f"There are new providers and/or norms. They are\n{observed}"
        )
121
+
122
+
123
+ def pull_nwss_data (token : str ):
78
124
"""Pull the latest NWSS Wastewater data, and conforms it into a dataset.
79
125
80
126
The output dataset has:
@@ -95,13 +141,15 @@ def pull_nwss_data(socrata_token: str):
95
141
pd.DataFrame
96
142
Dataframe as described above.
97
143
"""
144
+ # Constants
145
+ keep_columns = [* SIGNALS , * METRIC_SIGNALS ]
98
146
# concentration key types
99
147
type_dict , type_dict_metric = construct_typedicts ()
100
148
101
149
# Pull data from Socrata API
102
- client = Socrata ("data.cdc.gov" , socrata_token )
103
- results_concentration = client .get ("g653-rqe2" , limit = 10 ** 10 )
104
- results_metric = client .get ("2ew6-ywp6" , limit = 10 ** 10 )
150
+ client = Socrata ("data.cdc.gov" , token )
151
+ results_concentration = client .get ("g653-rqe2" , limit = 10 ** 10 )
152
+ results_metric = client .get ("2ew6-ywp6" , limit = 10 ** 10 )
105
153
df_metric = pd .DataFrame .from_records (results_metric )
106
154
df_concentration = pd .DataFrame .from_records (results_concentration )
107
155
df_concentration = df_concentration .rename (columns = {"date" : "timestamp" })
@@ -116,19 +164,29 @@ def pull_nwss_data(socrata_token: str):
116
164
except KeyError as exc :
117
165
raise ValueError (warn_string (df_metric , type_dict_metric )) from exc
118
166
167
+ # if the normalization scheme isn't recorded, why is it even included as a sample site?
168
+ df = drop_unnormalized (df_concentration )
119
169
# pull 2 letter state labels out of the key_plot_id labels
120
- df_concentration [ "state" ] = df_concentration . key_plot_id . str . extract ( r"_(\w\w)_" )
170
+ add_identifier_columns ( df )
121
171
172
+ # move population and metric signals over to df
173
+ df = reformat (df , df_metric )
122
174
# round out some of the numeric noise that comes from smoothing
123
- df_concentration [SIGNALS [0 ]] = sig_digit_round (
124
- df_concentration [SIGNALS [0 ]], SIG_DIGITS
125
- )
175
+ for signal in [* SIGNALS , * METRIC_SIGNALS ]:
176
+ df [signal ] = sig_digit_round (df [signal ], SIG_DIGITS )
126
177
127
- df_concentration = add_population (df_concentration , df_metric )
128
178
# if there are population NA's, assume the previous value is accurate (most
129
179
# likely introduced by dates only present in one and not the other; even
130
180
# otherwise, best to assume some value rather than break the data)
131
- df_concentration .population_served = df_concentration .population_served .ffill ()
132
-
133
- keep_columns = ["timestamp" , "state" , "population_served" ]
134
- return df_concentration [SIGNALS + keep_columns ]
181
+ df .population_served = df .population_served .ffill ()
182
+ check_endpoints (df )
183
+ keep_columns .extend (
184
+ [
185
+ "timestamp" ,
186
+ "state" ,
187
+ "population_served" ,
188
+ "normalization" ,
189
+ "provider" ,
190
+ ]
191
+ )
192
+ return df [keep_columns ]
0 commit comments