@@ -183,6 +183,37 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2
183183 }
184184}
185185
186+ # ' Variation on [`dplyr::anti_join`] for speed + tolerance setting
187+ # '
188+ # ' @param x tibble; `x[ukey_names]` must not have any duplicate rows
189+ # ' @param y tibble; `y[ukey_names]` must not have any duplicate rows
190+ # ' @param ukey_names chr; names of columns that form a unique key, for `x` and
191+ # ' for `y`
192+ # ' @param val_names chr; names of columns which should be treated as
193+ # ' value/measurement columns, and compared with a tolerance
194+ # ' @param abs_tol scalar non-negative numeric; absolute tolerance with which to
195+ # ' compare value columns; see [`vec_approx_equal`]
196+ # ' @return rows from `x` that either (a) don't have a (0-tolerance) matching
197+ # ' ukey in `y`, or (b) have a matching ukey in `y`, but don't have
198+ # ' approximately equal value column values
199+ tbl_fast_anti_join <- function (x , y , ukey_names , val_names , abs_tol = 0 ) {
200+ x <- x [c(ukey_names , val_names )]
201+ y <- y [c(ukey_names , val_names )]
202+ xy <- vec_rbind(x , y )
203+ if (abs_tol == 0 ) {
204+ x_exclude <- vec_duplicate_detect(xy )
205+ x_exclude <- vec_slice(x_exclude , seq_len(nrow(x )))
206+ } else {
207+ xy_dup_ids <- vec_duplicate_id(xy [ukey_names ])
208+ xy_dup_inds2 <- which(xy_dup_ids != seq_along(xy_dup_ids ))
209+ xy_dup_inds1 <- xy_dup_ids [xy_dup_inds2 ]
210+ x_exclude <- rep(FALSE , nrow(x ))
211+ xy_vals <- xy [val_names ]
212+ x_exclude [xy_dup_inds1 ] <- vec_approx_equal(xy_vals , inds1 = xy_dup_inds2 , xy_vals , inds2 = xy_dup_inds1 , na_equal = TRUE , abs_tol = abs_tol )
213+ }
214+ vec_slice(x , ! x_exclude )
215+ }
216+
186217# ' Calculate compact patch to move from one snapshot/update to another
187218# '
188219# ' @param earlier_snapshot tibble or `NULL`; `NULL` represents that there was no
@@ -214,117 +245,41 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl,
214245 # use faster validation variants:
215246 if (! is_tibble(later_tbl )) {
216247 cli_abort(" `later_tbl` must be a tibble" ,
217- class = " epiprocess__tbl_diff2__later_tbl_invalid"
218- )
248+ class = " epiprocess__tbl_diff2__later_tbl_invalid"
249+ )
219250 }
220251 if (is.null(earlier_snapshot )) {
221252 return (later_tbl )
222253 }
223254 if (! is_tibble(earlier_snapshot )) {
224255 cli_abort(" `earlier_snapshot` must be a tibble or `NULL`" ,
225- class = " epiprocess__tbl_diff2__earlier_tbl_class_invalid"
226- )
256+ class = " epiprocess__tbl_diff2__earlier_tbl_class_invalid"
257+ )
227258 }
228259 if (! is.character(ukey_names ) || ! all(ukey_names %in% names(earlier_snapshot ))) {
229260 cli_abort(" `ukey_names` must be a subset of column names" ,
230- class = " epiprocess__tbl_diff2__ukey_names_class_invalid"
231- )
261+ class = " epiprocess__tbl_diff2__ukey_names_class_invalid"
262+ )
232263 }
233264 later_format <- arg_match0(later_format , c(" snapshot" , " update" ))
234265 if (! (is.vector(compactify_abs_tol , mode = " numeric" ) &&
235- length(compactify_abs_tol ) == 1L && # nolint:indentation_linter
236- compactify_abs_tol > = 0 )) {
266+ length(compactify_abs_tol ) == 1L && # nolint:indentation_linter
267+ compactify_abs_tol > = 0 )) {
237268 # Give a specific message:
238269 assert_numeric(compactify_abs_tol , lower = 0 , any.missing = FALSE , len = 1L )
239270 # Fallback e.g. for invalid classes not caught by assert_numeric:
240271 cli_abort(" `compactify_abs_tol` must be a length-1 double/integer >= 0" ,
241- class = " epiprocess__tbl_diff2__compactify_abs_tol_invalid"
242- )
272+ class = " epiprocess__tbl_diff2__compactify_abs_tol_invalid" )
243273 }
244274
245- # Extract metadata:
246- earlier_n <- nrow(earlier_snapshot )
247- later_n <- nrow(later_tbl )
248- tbl_names <- names(earlier_snapshot )
249- val_names <- tbl_names [! tbl_names %in% ukey_names ]
250-
251- # More input validation:
252- if (! identical(tbl_names , names(later_tbl ))) {
253- cli_abort(c(
254- " `earlier_snapshot` and `later_tbl` should have identical column
255- names and ordering." ,
256- " *" = " `earlier_snapshot` colnames: {format_chr_deparse(tbl_names)}" ,
257- " *" = " `later_tbl` colnames: {format_chr_deparse(names(later_tbl))}"
258- ), class = " epiprocess__tbl_diff2__tbl_names_differ" )
275+ all_names <- names(later_tbl )
276+ val_names <- all_names [! (all_names %in% ukey_names )]
277+ updates <- tbl_fast_anti_join(later_tbl , earlier_snapshot , ukey_names , val_names , compactify_abs_tol )
278+ if (later_format == " snapshot" ) {
279+ deletions <- tbl_fast_anti_join(earlier_snapshot [ukey_names ], later_tbl [ukey_names ], ukey_names , character (), 0 )
280+ updates <- vec_rbind(updates , deletions ) # fills vals with NAs
259281 }
260-
261- combined_tbl <- vec_rbind(earlier_snapshot , later_tbl )
262- combined_n <- nrow(combined_tbl )
263-
264- # We'll also need epikeytimes and value columns separately:
265- combined_ukeys <- combined_tbl [ukey_names ]
266- combined_vals <- combined_tbl [val_names ]
267-
268- # We have five types of rows in combined_tbl:
269- # 1. From earlier_snapshot, no matching ukey in later_tbl (deletion; turn vals to
270- # NAs to match epi_archive format)
271- # 2. From earlier_snapshot, with matching ukey in later_tbl (context; exclude from
272- # result)
273- # 3. From later_tbl, with matching ukey in earlier_snapshot, with value "close" (change
274- # that we'll compactify away)
275- # 4. From later_tbl, with matching ukey in earlier_snapshot, value not "close" (change
276- # that we'll record)
277- # 5. From later_tbl, with no matching ukey in later_tbl (addition)
278-
279- # For "snapshot" later_format, we need to filter to 1., 4., and 5., and alter
280- # values for 1. For "update" later_format, we need to filter to 4. and 5.
281-
282- # (For compactify_abs_tol = 0, we could potentially streamline things by dropping
283- # ukey+val duplicates (cases 2. and 3.).)
284-
285- # Row indices of first occurrence of each ukey; will be the same as
286- # seq_len(combined_n) for each ukey's first appearance (cases 1., 2., or 5.);
287- # for re-reported ukeys in `later_tbl` (cases 3. or 4.), it will point back to
288- # the row index of the same ukey in `earlier_snapshot`:
289- combined_ukey_firsts <- vec_duplicate_id(combined_ukeys )
290-
291- # Which rows from combined are cases 3. or 4.?
292- combined_ukey_is_repeat <- combined_ukey_firsts != seq_len(combined_n )
293- # For each row in 3. or 4., row numbers of the ukey appearance in earlier:
294- ukey_repeat_first_i <- combined_ukey_firsts [combined_ukey_is_repeat ]
295-
296- # Which rows from combined are in case 3.?
297- combined_compactify_away <- rep(FALSE , combined_n )
298- combined_compactify_away [combined_ukey_is_repeat ] <-
299- vec_approx_equal0(combined_vals ,
300- combined_vals ,
301- na_equal = TRUE ,
302- abs_tol = compactify_abs_tol ,
303- inds1 = combined_ukey_is_repeat ,
304- inds2 = ukey_repeat_first_i
305- )
306-
307- # Which rows from combined are in cases 3., 4., or 5.?
308- combined_from_later <- vec_rep_each(c(FALSE , TRUE ), c(earlier_n , later_n ))
309-
310- if (later_format == " update" ) {
311- # Cases 4. and 5.:
312- combined_tbl <- combined_tbl [combined_from_later & ! combined_compactify_away , ]
313- } else { # later_format is "snapshot"
314- # Which rows from combined are in case 1.?
315- combined_is_deletion <- vec_rep_each(c(TRUE , FALSE ), c(earlier_n , later_n ))
316- combined_is_deletion [ukey_repeat_first_i ] <- FALSE
317- # Which rows from combined are in cases 1., 4., or 5.?
318- combined_include <- combined_is_deletion | combined_from_later & ! combined_compactify_away
319- combined_tbl <- combined_tbl [combined_include , ]
320- # Represent deletion in 1. with NA-ing of all value columns. (In some
321- # previous approaches to epi_diff2, this seemed to be faster than using
322- # vec_rbind(case_1_ukeys, cases_45_tbl) or bind_rows to fill with NAs, and more
323- # general than data.table's rbind(case_1_ukeys, cases_45_tbl, fill = TRUE).)
324- combined_tbl [combined_is_deletion [combined_include ], val_names ] <- NA
325- }
326-
327- combined_tbl
282+ updates
328283}
329284
330285# ' Apply an update (e.g., from `tbl_diff2`) to a snapshot
0 commit comments