Skip to content

Commit 1114958

Browse files
larry77 and thisisnic authored
GH-49186: [R] Support dplyr::filter_out() in Arrow dplyr backend (#49256)
### Rationale for this change

New function in dplyr not yet implemented in Arrow.

### What changes are included in this PR?

This PR adds support for dplyr::filter_out() in the Arrow R dplyr backend. The implementation reuses the existing filter() machinery and extends set_filters() with a `negate` flag. When negate = TRUE, the predicate is transformed to match dplyr semantics (drop rows where predicate is TRUE, keep rows where predicate is FALSE or NA). Multiple filter_out() predicates are combined before exclusion so that filter_out(a, b) matches dplyr semantics (i.e. drop rows where a & b is TRUE).

This works for arrow_table(), RecordBatchReader, and open_dataset(), and preserves lazy evaluation for larger-than-memory datasets.

Tests are added to verify basic behavior, NA handling, and multiple predicates.

Note: local test run hits a with_language() locale issue ('.cache' not found), which appears environment-specific and unrelated to these changes.

### Are these changes tested?

Yes

### Are there any user-facing changes?

Just the new function

* GitHub Issue: #49257
* GitHub Issue: #49186

Lead-authored-by: Lorenzo Isella <lorenzo.isella@gmail.com>
Co-authored-by: Nic Crane <thisisnic@gmail.com>
Co-authored-by: Lorenzo ISELLA <lorenzo.isella@gmail.com>
Signed-off-by: Nic Crane <thisisnic@gmail.com>
1 parent 3e6988a commit 1114958

5 files changed

Lines changed: 154 additions & 24 deletions

File tree

r/R/arrow-package.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
supported_dplyr_methods <- list(
3939
select = NULL,
4040
filter = NULL,
41+
filter_out = NULL,
4142
collect = NULL,
4243
summarise = c(
4344
"window functions not currently supported;",

r/R/dplyr-filter.R

Lines changed: 100 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,27 +17,61 @@
1717

1818
# The following S3 methods are registered on load if dplyr is present
1919

20-
filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) {
21-
try_arrow_dplyr({
22-
# TODO something with the .preserve argument
23-
out <- as_adq(.data)
20+
apply_filter_impl <- function(
21+
.data,
22+
...,
23+
.by = NULL,
24+
.preserve = FALSE,
25+
negate = FALSE
26+
) {
27+
# TODO something with the .preserve argument
28+
out <- as_adq(.data)
2429

25-
by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data")
30+
by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data")
2631

27-
if (by$from_by) {
28-
out$group_by_vars <- by$names
29-
}
32+
if (by$from_by) {
33+
out$group_by_vars <- by$names
34+
}
35+
36+
expanded_filters <- expand_across(out, quos(...))
37+
if (length(expanded_filters) == 0) {
38+
# Nothing to do
39+
return(as_adq(.data))
40+
}
41+
42+
# tidy-eval the filter expressions inside an Arrow data_mask
43+
mask <- arrow_mask(out)
44+
45+
if (isTRUE(negate)) {
46+
# filter_out(): combine all predicates with &, then negate
47+
combined <- NULL
48+
49+
for (expr in expanded_filters) {
50+
filt <- arrow_eval(expr, mask)
3051

31-
expanded_filters <- expand_across(out, quos(...))
32-
if (length(expanded_filters) == 0) {
33-
# Nothing to do
34-
return(as_adq(.data))
52+
if (length(mask$.aggregations)) {
53+
# dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it.
54+
# But we could, the same way it works in mutate() via join, if someone asks.
55+
# Until then, just error.
56+
arrow_not_supported(
57+
.actual_msg = "Expression not supported in filter_out() in Arrow",
58+
call = expr
59+
)
60+
}
61+
62+
if (is_list_of(filt, "Expression")) {
63+
filt <- Reduce("&", filt)
64+
}
65+
66+
combined <- if (is.null(combined)) filt else (combined & filt)
3567
}
3668

37-
# tidy-eval the filter expressions inside an Arrow data_mask
38-
mask <- arrow_mask(out)
69+
out <- set_filters(out, combined, negate = TRUE)
70+
} else {
71+
# filter(): apply each predicate sequentially
3972
for (expr in expanded_filters) {
4073
filt <- arrow_eval(expr, mask)
74+
4175
if (length(mask$.aggregations)) {
4276
# dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it.
4377
# But we could, the same way it works in mutate() via join, if someone asks.
@@ -47,27 +81,72 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE)
4781
call = expr
4882
)
4983
}
50-
out <- set_filters(out, filt)
51-
}
5284

53-
if (by$from_by) {
54-
out$group_by_vars <- character()
85+
out <- set_filters(out, filt, negate = FALSE)
5586
}
87+
}
88+
89+
if (by$from_by) {
90+
out$group_by_vars <- character()
91+
}
5692

57-
out
93+
out
94+
}
95+
96+
filter.arrow_dplyr_query <- function(
97+
.data,
98+
...,
99+
.by = NULL,
100+
.preserve = FALSE
101+
) {
102+
try_arrow_dplyr({
103+
apply_filter_impl(
104+
.data,
105+
...,
106+
.by = {{ .by }},
107+
.preserve = .preserve,
108+
negate = FALSE
109+
)
58110
})
59111
}
60112
filter.Dataset <- filter.ArrowTabular <- filter.RecordBatchReader <- filter.arrow_dplyr_query
61113

62-
set_filters <- function(.data, expressions) {
114+
filter_out.arrow_dplyr_query <- function(
115+
.data,
116+
...,
117+
.by = NULL,
118+
.preserve = FALSE
119+
) {
120+
try_arrow_dplyr({
121+
apply_filter_impl(
122+
.data,
123+
...,
124+
.by = {{ .by }},
125+
.preserve = .preserve,
126+
negate = TRUE
127+
)
128+
})
129+
}
130+
filter_out.Dataset <- filter_out.ArrowTabular <- filter_out.RecordBatchReader <- filter_out.arrow_dplyr_query
131+
132+
set_filters <- function(.data, expressions, negate = FALSE) {
63133
if (length(expressions)) {
64134
if (is_list_of(expressions, "Expression")) {
65135
# expressions is a list of Expressions. AND them together and set them on .data
66136
new_filter <- Reduce("&", expressions)
67137
} else if (inherits(expressions, "Expression")) {
68138
new_filter <- expressions
69139
} else {
70-
stop("filter expressions must be either an expression or a list of expressions", call. = FALSE)
140+
stop(
141+
"filter expressions must be either an expression or a list of expressions",
142+
call. = FALSE
143+
)
144+
}
145+
146+
if (isTRUE(negate)) {
147+
# dplyr::filter_out() semantics: drop rows where predicate is TRUE;
148+
# keep rows where predicate is FALSE or NA.
149+
new_filter <- (!new_filter) | is.na(new_filter)
71150
}
72151

73152
if (isTRUE(.data$filtered_rows)) {

r/R/dplyr-funcs-doc.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
#' Functions available in Arrow dplyr queries
2121
#'
22-
#' The `arrow` package contains methods for 37 `dplyr` table functions, many of
22+
#' The `arrow` package contains methods for 38 `dplyr` table functions, many of
2323
#' which are "verbs" that do transformations to one or more tables.
2424
#' The package also has mappings of 224 R functions to the corresponding
2525
#' functions in the Arrow compute library. These allow you to write code inside
@@ -45,6 +45,7 @@
4545
#' * [`distinct()`][dplyr::distinct()]: `.keep_all = TRUE` returns a non-missing value if present, only returning missing values if all are missing.
4646
#' * [`explain()`][dplyr::explain()]
4747
#' * [`filter()`][dplyr::filter()]
48+
#' * [`filter_out()`][dplyr::filter_out()]
4849
#' * [`full_join()`][dplyr::full_join()]: the `copy` argument is ignored
4950
#' * [`glimpse()`][dplyr::glimpse()]
5051
#' * [`group_by()`][dplyr::group_by()]

r/man/acero.Rd

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/tests/testthat/test-dplyr-filter.R

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,3 +498,51 @@ test_that("filter() with aggregation expressions errors", {
498498
"not supported in filter"
499499
)
500500
})
501+
502+
test_that("filter_out() basic", {
503+
compare_dplyr_binding(
504+
.input |>
505+
filter_out(chr == "b") |>
506+
select(chr, int, lgl) |>
507+
collect(),
508+
tbl
509+
)
510+
})
511+
512+
test_that("filter_out() keeps NA values in predicate result", {
513+
compare_dplyr_binding(
514+
.input |>
515+
filter_out(lgl) |>
516+
select(chr, int, lgl) |>
517+
collect(),
518+
tbl
519+
)
520+
})
521+
522+
test_that("filter_out() with multiple conditions", {
523+
compare_dplyr_binding(
524+
.input |>
525+
filter_out(dbl > 2, chr %in% c("d", "f")) |>
526+
collect(),
527+
tbl
528+
)
529+
})
530+
531+
test_that("More complex select/filter_out", {
532+
compare_dplyr_binding(
533+
.input |>
534+
filter_out(dbl > 2, chr == "d" | chr == "f") |>
535+
select(chr, int, lgl) |>
536+
filter(int < 5) |>
537+
select(int, chr) |>
538+
collect(),
539+
tbl
540+
)
541+
542+
compare_dplyr_binding(
543+
.input |>
544+
filter_out(!is.na(int)) |>
545+
collect(),
546+
tbl
547+
)
548+
})

0 commit comments

Comments
 (0)