Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 332296b

Browse files
authoredOct 21, 2021
Merge pull request #264 from diffix/edon/performance
Improve count distinct performance
2 parents 04ba96e + 9e7865c commit 332296b

File tree

1 file changed

+43
-23
lines changed

1 file changed

+43
-23
lines changed
 

‎src/OpenDiffix.Core/Anonymizer.fs

Lines changed: 43 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ type private AidCount = { FlattenedSum: float; Flattening: float; NoiseSD: float
9999
let inline private aidFlattening
100100
(executionContext: ExecutionContext)
101101
(unaccountedFor: int64)
102-
(aidContributions: (AidHash * ^Contribution) list)
102+
(aidContributions: (AidHash * ^Contribution) array)
103103
: AidCount option =
104104
let anonParams = executionContext.AnonymizationParams
105105

@@ -109,30 +109,30 @@ let inline private aidFlattening
109109
let outlierInterval, topInterval =
110110
compactFlatteningIntervals anonParams.OutlierCount anonParams.TopCount aidContributions.Length
111111

112-
let sortedAidContributions = aidContributions |> List.sortByDescending snd
112+
let sortedAidContributions = aidContributions |> Array.sortByDescending snd
113113

114114
let flatSeed =
115115
sortedAidContributions
116-
|> List.take (outlierInterval.Upper + topInterval.Upper)
116+
|> Seq.take (outlierInterval.Upper + topInterval.Upper)
117117
|> Seq.map fst
118118
|> seedFromAidSet
119119
|> cryptoHashSaltedSeed anonParams.Salt
120120

121121
let outlierCount = flatSeed |> mixSeed "outlier" |> randomUniform outlierInterval
122122
let topCount = flatSeed |> mixSeed "top" |> randomUniform topInterval
123123

124-
let outliersSummed = sortedAidContributions |> List.take outlierCount |> List.sumBy snd
124+
let outliersSummed = sortedAidContributions |> Seq.take outlierCount |> Seq.sumBy snd
125125

126126
let topGroupValuesSummed =
127127
sortedAidContributions
128-
|> List.skip outlierCount
129-
|> List.take topCount
130-
|> List.sumBy snd
128+
|> Seq.skip outlierCount
129+
|> Seq.take topCount
130+
|> Seq.sumBy snd
131131

132132
let topGroupAverage = (float topGroupValuesSummed) / (float topCount)
133133
let outlierReplacement = topGroupAverage * (float outlierCount)
134134

135-
let summedContributions = aidContributions |> List.sumBy snd
135+
let summedContributions = aidContributions |> Array.sumBy snd
136136
let flattening = float outliersSummed - outlierReplacement
137137
let flattenedUnaccountedFor = float unaccountedFor - flattening |> max 0.
138138
let flattenedSum = float summedContributions - flattening
@@ -171,28 +171,48 @@ let private transposeToPerAid (aidsPerValue: KeyValuePair<Value, HashSet<AidHash
171171

172172
result
173173

174-
let rec private distributeValues valuesByAID =
175-
match valuesByAID with
176-
| [] -> [] // Done :D
177-
| (_aid, []) :: restValuesByAID -> distributeValues restValuesByAID
178-
| (aid, value :: restValues) :: restValuesByAID ->
179-
let restValuesByAID = // Drop current value from the remaining items.
180-
List.map (fun (aid, values) -> aid, values |> List.filter ((<>) value)) restValuesByAID
174+
let private distributeValues (valuesByAID: seq<AidHash * array<Value>>) : seq<AidHash * Value> =
175+
let usedValues = HashSet<Value>()
181176

182-
(aid, value) :: distributeValues (restValuesByAID @ [ aid, restValues ])
177+
let rec pickUnusedValue (values: Stack<Value>) =
178+
match values.TryPop() with
179+
| true, value -> if usedValues.Contains(value) then pickUnusedValue values else ValueSome value
180+
| false, _ -> ValueNone
181+
182+
let result = List<AidHash * Value>()
183+
184+
let mutable remainingItems =
185+
valuesByAID
186+
|> Seq.filter (fun (_aid, values) -> values.Length > 0)
187+
|> Seq.map (fun (aid, values) -> aid, Stack<Value>(values))
188+
|> Seq.toArray
189+
190+
while remainingItems.Length > 0 do
191+
remainingItems <-
192+
remainingItems
193+
|> Array.filter (fun (aid, values) ->
194+
match pickUnusedValue values with
195+
| ValueSome value ->
196+
result.Add((aid, value))
197+
usedValues.Add(value) |> ignore
198+
values.Count > 0
199+
| ValueNone -> false
200+
)
201+
202+
result :> seq<AidHash * Value>
183203

184204
let private countDistinctFlatteningByAid
185205
(executionContext: ExecutionContext)
186206
(perAidContributions: Dictionary<AidHash, HashSet<Value>>)
187207
=
188208
perAidContributions
189209
// keep low count values in sorted order to ensure the algorithm is deterministic
190-
|> Seq.map (fun pair -> pair.Key, pair.Value |> Seq.toList)
210+
|> Seq.map (fun pair -> pair.Key, pair.Value |> Seq.toArray)
191211
|> Seq.sortBy (fun (aid, values) -> values.Length, aid)
192-
|> Seq.toList
193212
|> distributeValues
194-
|> List.countBy fst
195-
|> List.map (fun (aid, count) -> aid, int64 count)
213+
|> Seq.countBy fst
214+
|> Seq.map (fun (aid, count) -> aid, int64 count)
215+
|> Seq.toArray
196216
|> aidFlattening executionContext 0L
197217

198218
let private anonymizedSum (byAidSum: AidCount seq) =
@@ -230,8 +250,8 @@ let countDistinct
230250
// without any additional noise.
231251
let lowCountValues, highCountValues =
232252
aidsPerValue
233-
|> Seq.toList
234-
|> List.partition (fun pair -> isLowCount executionContext pair.Value)
253+
|> Seq.toArray
254+
|> Array.partition (fun pair -> isLowCount executionContext pair.Value)
235255

236256
let byAid =
237257
[ 0 .. aidsCount - 1 ]
@@ -262,7 +282,7 @@ let count (executionContext: ExecutionContext) (perAidContributions: AidCountSta
262282
|> Array.map (fun aidState ->
263283
aidState.AidContributions
264284
|> Seq.map (fun pair -> pair.Key, pair.Value)
265-
|> Seq.toList
285+
|> Seq.toArray
266286
|> aidFlattening executionContext aidState.UnaccountedFor
267287
)
268288

0 commit comments

Comments
 (0)
Please sign in to comment.