Skip to content
9 changes: 5 additions & 4 deletions docs/src/format/table/index/scalar/label_list.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ The label list index uses a bitmap index internally and stores its data in:

The label list index provides exact results for the following query types:

| Query Type | Description | Operation | Result Type |
|----------------------|----------------------------------------|---------------------------------------------|-------------|
| **array_has_all** | Array contains all specified values | Intersects bitmaps for all specified labels | Exact |
| **array_has_any** | Array contains any of specified values | Unions bitmaps for all specified labels | Exact |
| Query Type | Description | Operation | Result Type |
|-------------------------------------|----------------------------------------|---------------------------------------------|-------------|
| **array_has / array_contains** | Array contains the specified value | Bitmap lookup for a single label | Exact |
| **array_has_all** | Array contains all specified values | Intersects bitmaps for all specified labels | Exact |
| **array_has_any** | Array contains any of specified values | Unions bitmaps for all specified labels | Exact |
5 changes: 3 additions & 2 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2411,8 +2411,9 @@ def create_scalar_index(
* ``LABEL_LIST``. A special index that is used to index list
columns whose values have small cardinality. For example, a column that
contains lists of tags (e.g. ``["tag1", "tag2", "tag3"]``) can be indexed
with a ``LABEL_LIST`` index. This index can only speedup queries with
``array_has_any`` or ``array_has_all`` filters.
with a ``LABEL_LIST`` index. This index can speed up list membership
filters such as ``array_has_any``, ``array_has_all``, and
``array_has`` / ``array_contains``.
* ``NGRAM``. A special index that is used to index string columns. This index
creates a bitmap for each ngram in the string. By default we use trigrams.
This index can currently speed up queries using the ``contains`` function
Expand Down
33 changes: 33 additions & 0 deletions python/python/tests/test_scalar_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1977,6 +1977,39 @@ def test_label_list_index(tmp_path: Path):
assert indices[0]["type"] == "LabelList"


def test_label_list_index_array_contains(tmp_path: Path):
    """Check ``array_contains`` results on a column with a LABEL_LIST index."""
    # The rows include lists holding NULL items, an empty list, and a NULL list
    # so the NULL-needle result can be compared before and after indexing.
    label_rows = [["foo", "bar"], ["bar"], ["baz"], ["qux", None], [None], [], None]
    dataset = lance.write_dataset(pa.table({"labels": label_rows}), tmp_path / "dataset")

    # Baseline row count for a NULL needle, taken before the index is created.
    null_filter = "array_contains(labels, NULL)"
    expected_null_rows = dataset.to_table(filter=null_filter).num_rows

    dataset.create_scalar_index("labels", index_type="LABEL_LIST")

    # Each filter is paired with the number of rows it must return.
    expected_counts = (
        ("array_contains(labels, 'foo')", 1),
        ("array_contains(labels, 'bar')", 2),
        ("array_contains(labels, 'oof')", 0),
    )
    for needle_filter, expected_rows in expected_counts:
        result = dataset.to_table(filter=needle_filter)
        assert result.num_rows == expected_rows

    # A non-NULL needle should be planned as a scalar index query.
    explain = dataset.scanner(filter="array_contains(labels, 'foo')").explain_plan()
    assert "ScalarIndexQuery" in explain

    # NULL needle: the row count must match the pre-index baseline, and the
    # plan must not use the LABEL_LIST index.
    actual_null_rows = dataset.to_table(filter=null_filter).num_rows
    assert actual_null_rows == expected_null_rows
    explain = dataset.scanner(filter=null_filter).explain_plan()
    assert "ScalarIndexQuery" not in explain


def test_create_index_empty_dataset(tmp_path: Path):
# Creating an index on an empty dataset is (currently) not terribly useful but
# we shouldn't return strange errors.
Expand Down
21 changes: 21 additions & 0 deletions rust/lance-index/src/scalar/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,26 @@ impl ScalarQueryParser for LabelListQueryParser {
if args.len() != 2 {
return None;
}
// DataFusion normalizes `array_contains` to `array_has`, so this branch also handles `array_contains`.
if func.name() == "array_has" {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this work for array_contains? Is Datafusion mapping that to array_has already?

If so, can we add a comment here mentioning that this branch is also going to be hit for array_contains?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add comment

let inner_type = match data_type {
DataType::List(field) | DataType::LargeList(field) => field.data_type(),
_ => return None,
};
let scalar = maybe_scalar(&args[1], inner_type)?;
// array_has(..., NULL) returns no matches in DataFusion, but the index would
// match rows containing NULL. Fall back to the non-indexed path to match DataFusion behavior.
if scalar.is_null() {
return None;
}
let query = LabelListQuery::HasAnyLabel(vec![scalar]);
return Some(IndexedExpression::index_query(
column.to_string(),
self.index_name.clone(),
Arc::new(query),
));
}

let label_list = maybe_scalar(&args[1], data_type)?;
if let ScalarValue::List(list_arr) = label_list {
let list_values = list_arr.values();
Expand Down Expand Up @@ -1651,6 +1671,7 @@ fn visit_node(
}
match expr {
Expr::Between(between) => Ok(visit_between(between, index_info)),
Expr::Alias(alias) => visit_node(alias.expr.as_ref(), index_info, depth),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch

Expr::Column(_) => Ok(visit_column(expr, index_info)),
Expr::InList(in_list) => Ok(visit_in_list(in_list, index_info)),
Expr::IsFalse(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, false)),
Expand Down
4 changes: 2 additions & 2 deletions rust/lance-index/src/scalar/label_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ trait LabelListSubIndex: ScalarIndex + DeepSizeOf {
impl<T: ScalarIndex + DeepSizeOf> LabelListSubIndex for T {}

/// A scalar index that can be used on `List<T>` columns to
/// support queries with array_contains_all and array_contains_any
/// using an underlying bitmap index.
/// accelerate list membership filters such as `array_has_all`, `array_has_any`,
/// and `array_has` / `array_contains`, using an underlying bitmap index.
#[derive(Clone, Debug, DeepSizeOf)]
pub struct LabelListIndex {
values_index: Arc<dyn LabelListSubIndex>,
Expand Down
Loading