diff --git a/docs/src/format/table/index/scalar/label_list.md b/docs/src/format/table/index/scalar/label_list.md index 8d50f2638b0..1c5cb5cdaa1 100644 --- a/docs/src/format/table/index/scalar/label_list.md +++ b/docs/src/format/table/index/scalar/label_list.md @@ -26,7 +26,8 @@ The label list index uses a bitmap index internally and stores its data in: The label list index provides exact results for the following query types: -| Query Type | Description | Operation | Result Type | -|----------------------|----------------------------------------|---------------------------------------------|-------------| -| **array_has_all** | Array contains all specified values | Intersects bitmaps for all specified labels | Exact | -| **array_has_any** | Array contains any of specified values | Unions bitmaps for all specified labels | Exact | \ No newline at end of file +| Query Type | Description | Operation | Result Type | +|-------------------------------------|----------------------------------------|---------------------------------------------|-------------| +| **array_has / array_contains** | Array contains the specified value | Bitmap lookup for a single label | Exact | +| **array_has_all** | Array contains all specified values | Intersects bitmaps for all specified labels | Exact | +| **array_has_any** | Array contains any of specified values | Unions bitmaps for all specified labels | Exact | diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 01066a88d34..c23e1bcfc43 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2411,8 +2411,9 @@ def create_scalar_index( * ``LABEL_LIST``. A special index that is used to index list columns whose values have small cardinality. For example, a column that contains lists of tags (e.g. ``["tag1", "tag2", "tag3"]``) can be indexed - with a ``LABEL_LIST`` index. This index can only speedup queries with - ``array_has_any`` or ``array_has_all`` filters. 
+ with a ``LABEL_LIST`` index. This index can speed up list membership + filters such as ``array_has_any``, ``array_has_all``, and + ``array_has`` / ``array_contains``. * ``NGRAM``. A special index that is used to index string columns. This index creates a bitmap for each ngram in the string. By default we use trigrams. This index can currently speed up queries using the ``contains`` function diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 443172db13e..d81c89b0dff 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -1977,6 +1977,39 @@ def test_label_list_index(tmp_path: Path): assert indices[0]["type"] == "LabelList" +def test_label_list_index_array_contains(tmp_path: Path): + # Include lists with NULL items to ensure NULL needle behavior matches + # non-index execution. + tbl = pa.table( + {"labels": [["foo", "bar"], ["bar"], ["baz"], ["qux", None], [None], [], None]} + ) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + expected_null_rows = dataset.to_table( + filter="array_contains(labels, NULL)" + ).num_rows + + dataset.create_scalar_index("labels", index_type="LABEL_LIST") + + result = dataset.to_table(filter="array_contains(labels, 'foo')") + assert result.num_rows == 1 + + result = dataset.to_table(filter="array_contains(labels, 'bar')") + assert result.num_rows == 2 + + result = dataset.to_table(filter="array_contains(labels, 'oof')") + assert result.num_rows == 0 + + explain = dataset.scanner(filter="array_contains(labels, 'foo')").explain_plan() + assert "ScalarIndexQuery" in explain + + # NULL needle: preserve semantics (must match pre-index execution) and avoid + # using the LABEL_LIST index. 
+ actual_null_rows = dataset.to_table(filter="array_contains(labels, NULL)").num_rows + assert actual_null_rows == expected_null_rows + explain = dataset.scanner(filter="array_contains(labels, NULL)").explain_plan() + assert "ScalarIndexQuery" not in explain + + def test_create_index_empty_dataset(tmp_path: Path): # Creating an index on an empty dataset is (currently) not terribly useful but # we shouldn't return strange errors. diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index a1ea6fe84ff..6aacfc565d0 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -490,6 +490,26 @@ impl ScalarQueryParser for LabelListQueryParser { if args.len() != 2 { return None; } + // DataFusion normalizes array_contains to array_has + if func.name() == "array_has" { + let inner_type = match data_type { + DataType::List(field) | DataType::LargeList(field) => field.data_type(), + _ => return None, + }; + let scalar = maybe_scalar(&args[1], inner_type)?; + // array_has(..., NULL) returns no matches in DataFusion, but the index would + // match rows containing NULL. Fall back to match DataFusion behavior. 
+ if scalar.is_null() { + return None; + } + let query = LabelListQuery::HasAnyLabel(vec![scalar]); + return Some(IndexedExpression::index_query( + column.to_string(), + self.index_name.clone(), + Arc::new(query), + )); + } + let label_list = maybe_scalar(&args[1], data_type)?; if let ScalarValue::List(list_arr) = label_list { let list_values = list_arr.values(); @@ -1651,6 +1671,7 @@ fn visit_node( } match expr { Expr::Between(between) => Ok(visit_between(between, index_info)), + Expr::Alias(alias) => visit_node(alias.expr.as_ref(), index_info, depth), Expr::Column(_) => Ok(visit_column(expr, index_info)), Expr::InList(in_list) => Ok(visit_in_list(in_list, index_info)), Expr::IsFalse(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, false)), diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index b9850b3c01c..0cfd00d4866 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -57,8 +57,8 @@ trait LabelListSubIndex: ScalarIndex + DeepSizeOf { impl LabelListSubIndex for T {} /// A scalar index that can be used on `List` columns to -/// support queries with array_contains_all and array_contains_any -/// using an underlying bitmap index. +/// accelerate list membership filters such as `array_has_all`, `array_has_any`, +/// and `array_has` / `array_contains`, using an underlying bitmap index. #[derive(Clone, Debug, DeepSizeOf)] pub struct LabelListIndex { values_index: Arc,