forked from DPBayes/apc-anonymizer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsampling.py
47 lines (37 loc) · 1.71 KB
/
sampling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import secrets
import numpy as np
import pandas as pd
def clamp(n, smallest, largest):
    """Restrict ``n`` to the closed range ``[smallest, largest]``.

    Values above ``largest`` are replaced by ``largest`` and values that
    are not above ``smallest`` are replaced by ``smallest``.
    """
    # First cap from above, then lift from below.
    capped = largest if largest < n else n
    return capped if capped > smallest else smallest
def create_sampler(csv_path_or_buffer):
    """Create a sample function for choosing the occupancy status value.

    Run create_sampler once for each vehicle model that you have a profile for.

    The CSV is read with the "passenger_count" column as its index. Every
    other column is treated as one occupancy status category, and each row
    holds the probabilities of those categories for one passenger count.
    Rows are assumed to be consecutive integer passenger counts in
    ascending order; the first count does not have to be zero.

    Args:
        csv_path_or_buffer: A path or file-like object, passed straight to
            pandas.read_csv.

    Returns:
        A function ``sample(passenger_count)`` that returns one of the CSV's
        category column names, drawn at random according to the row for
        that passenger count.
    """
    probabilities_df = pd.read_csv(
        csv_path_or_buffer, index_col="passenger_count"
    )
    min_count = probabilities_df.index[0]
    max_count = probabilities_df.index[-1]

    # Let's normalize the probabilities. They should already be normalized
    # but due to the floating-point serialization into CSV and
    # deserialization out of CSV there might be a small difference.
    probabilities = probabilities_df.values
    cdf = np.cumsum(probabilities, axis=1)
    # Divide by the row total instead of multiplying by its reciprocal:
    # x / x is exactly 1.0, so the last entry of every row (and any trailing
    # entries equal to it) is exactly 1.0 and a draw from [0.0, 1.0) can
    # never land beyond the last category.
    normalized_cdf = cdf / cdf[:, -1:]
    categories = probabilities_df.columns

    # Generate cryptographically strong random numbers.
    generator = secrets.SystemRandom()

    def sample(passenger_count):
        """Sample from the probabilities to produce an occupancy status.

        When you receive new passenger count data and need to update the
        published occupancy status, e.g. after every stop with changes in
        passenger count, call this function with the current passenger_count
        and publish the result.

        Passenger counts outside the range covered by the profile are
        clamped to the nearest count that the profile does cover.
        """
        bounded_count = min(max(passenger_count, min_count), max_count)
        # Translate the passenger count into a positional row index so that
        # profiles whose first passenger count is not zero are read from
        # the correct row.
        row = bounded_count - min_count
        cdf_given_passenger_count = normalized_cdf[row, :]
        p = generator.random()
        # side="right" selects category i exactly when
        # cdf[i - 1] <= p < cdf[i], so a category with zero probability is
        # never chosen, not even when p == 0.0.
        position = np.searchsorted(cdf_given_passenger_count, p, side="right")
        return categories[position]

    return sample