Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve

## [Unreleased]

### Added

- `lance_vector_search` now supports the `nprobs` and `refine_factor` parameters.

## [0.4.0] - 2025-12-29

### Added
Expand Down
39 changes: 3 additions & 36 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,18 +142,7 @@ FROM lance_vector_search('path/to/dataset.lance', 'vec', [0.1, 0.2, 0.3, 0.4]::F
ORDER BY _distance ASC;
```

- Signature: `lance_vector_search(uri, vector_column, query_vector, ...)`
- Positional arguments:
- `uri` (VARCHAR): Dataset root path or object store URI (e.g. `s3://...`).
- `vector_column` (VARCHAR): Vector column name.
- `query_vector` (FLOAT[dim] or DOUBLE[dim], preferred): Query vector (must be non-empty; values are cast to float32). `FLOAT[]` / `DOUBLE[]` are also accepted.
- Named parameters:
- `k` (BIGINT, default `10`): Number of results to return.
- `prefilter` (BOOLEAN, default `false`): If `true`, filters are applied before top-k selection.
- `use_index` (BOOLEAN, default `true`): If `true`, allow ANN index usage when available.
- `explain_verbose` (BOOLEAN, default `false`): Emit a more verbose Lance plan in `EXPLAIN` output.
- Output:
- Dataset columns plus `_distance` (smaller is closer).
See the SQL reference for full parameter documentation: [docs/sql.md#vector-search-lance_vector_search](docs/sql.md#vector-search-lance_vector_search).

### Full-text search (FTS)

Expand All @@ -164,16 +153,7 @@ FROM lance_fts('path/to/dataset.lance', 'text', 'puppy', k = 10, prefilter = tru
ORDER BY _score DESC;
```

- Signature: `lance_fts(uri, text_column, query, ...)`
- Positional arguments:
- `uri` (VARCHAR): Dataset root path or object store URI (e.g. `s3://...`).
- `text_column` (VARCHAR): Text column name.
- `query` (VARCHAR): Query string.
- Named parameters:
- `k` (BIGINT, default `10`): Number of results to return.
- `prefilter` (BOOLEAN, default `false`): If `true`, filters are applied before top-k selection.
- Output:
- Dataset columns plus `_score` (larger is better).
See the SQL reference for full parameter documentation: [docs/sql.md#full-text-search-lance_fts](docs/sql.md#full-text-search-lance_fts).

### Hybrid search (vector + FTS)

Expand All @@ -188,20 +168,7 @@ FROM lance_hybrid_search('path/to/dataset.lance',
ORDER BY _hybrid_score DESC;
```

- Signature: `lance_hybrid_search(uri, vector_column, query_vector, text_column, query, ...)`
- Positional arguments:
- `uri` (VARCHAR): Dataset root path or object store URI (e.g. `s3://...`).
- `vector_column` (VARCHAR): Vector column name.
- `query_vector` (FLOAT[dim] or DOUBLE[dim], preferred): Query vector (must be non-empty; values are cast to float32). `FLOAT[]` / `DOUBLE[]` are also accepted.
- `text_column` (VARCHAR): Text column name.
- `query` (VARCHAR): Query string.
- Named parameters:
- `k` (BIGINT, default `10`): Number of results to return.
- `prefilter` (BOOLEAN, default `false`): If `true`, filters are applied before top-k selection.
- `alpha` (FLOAT, default `0.5`): Vector/text mixing weight.
- `oversample_factor` (INTEGER, default `4`): Oversample factor for candidate generation (larger can improve recall at higher cost).
- Output:
- Dataset columns plus `_hybrid_score` (larger is better), `_distance`, and `_score`.
See the SQL reference for full parameter documentation: [docs/sql.md#hybrid-search-lance_hybrid_search](docs/sql.md#hybrid-search-lance_hybrid_search).

## Contributing

Expand Down
110 changes: 110 additions & 0 deletions docs/sql.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,116 @@ FROM 'path/to/dataset.lance'
LIMIT 10;
```

## Search

### Vector search: `lance_vector_search`

```sql
-- Search a vector column, returning distances in `_distance` (smaller is closer)
SELECT id, label, _distance
FROM lance_vector_search(
'path/to/dataset.lance',
'vec',
[0.1, 0.2, 0.3, 0.4]::FLOAT[4],
k = 5,
use_index = true,
nprobs = 4,
refine_factor = 2,
prefilter = true
)
ORDER BY _distance ASC;
```

Signature: `lance_vector_search(uri, vector_column, query_vector, ...)`

Positional arguments:
- `uri` (VARCHAR): Dataset root path or object store URI (e.g. `s3://...`).
- `vector_column` (VARCHAR): Vector column name.
- `query_vector` (FLOAT[dim] or DOUBLE[dim], preferred): Query vector (must be non-empty; values are cast to float32). `FLOAT[]` / `DOUBLE[]` are also accepted.

Named parameters:
- `k` (BIGINT, default `10`): Number of results to return. Must be > 0.
- `use_index` (BOOLEAN, default `true`): If `true`, allow ANN index usage when available.
- `nprobs` (BIGINT, optional): Number of IVF partitions to probe when using a vector index. Must be > 0. Only affects IVF-based vector indices.
- `refine_factor` (BIGINT, optional): Over-fetch factor for re-ranking using original vectors. Must be > 0 and fit in an unsigned 32-bit integer. A value of `1` still enables re-ranking.
- `prefilter` (BOOLEAN, default `false`): If `true`, filters are applied before top-k selection.
- `explain_verbose` (BOOLEAN, default `false`): Emit a more verbose Lance plan in `EXPLAIN` output.

Output:
- Dataset columns plus `_distance` (smaller is closer).

Filter semantics:
- If `prefilter=false`, filter pushdown is best-effort. If pushdown fails, the query is retried without pushed filters and DuckDB applies filters for correctness.
- If `prefilter=true`, prefilterable filters must be pushed down; otherwise, the query fails with an error.

### Full-text search: `lance_fts`

```sql
-- Search a text column, returning BM25-like scores in `_score` (larger is better)
SELECT id, text, _score
FROM lance_fts('path/to/dataset.lance', 'text', 'puppy', k = 10, prefilter = true)
ORDER BY _score DESC;
```

Signature: `lance_fts(uri, text_column, query, ...)`

Positional arguments:
- `uri` (VARCHAR): Dataset root path or object store URI (e.g. `s3://...`).
- `text_column` (VARCHAR): Text column name.
- `query` (VARCHAR): Query string.

Named parameters:
- `k` (BIGINT, default `10`): Number of results to return. Must be > 0.
- `prefilter` (BOOLEAN, default `false`): If `true`, filters are applied before top-k selection.

Output:
- Dataset columns plus `_score` (larger is better).

Filter semantics:
- If `prefilter=false`, filter pushdown is best-effort. If pushdown fails, the query is retried without pushed filters and DuckDB applies filters for correctness.
- If `prefilter=true`, prefilterable filters must be pushed down; otherwise, the query fails with an error.

### Hybrid search: `lance_hybrid_search`

```sql
-- Combine vector and text scores, returning `_hybrid_score` (larger is better)
SELECT id, _hybrid_score, _distance, _score
FROM lance_hybrid_search(
'path/to/dataset.lance',
'vec',
[0.1, 0.2, 0.3, 0.4]::FLOAT[4],
'text',
'puppy',
k = 10,
prefilter = false,
alpha = 0.5,
oversample_factor = 4
)
ORDER BY _hybrid_score DESC;
```

Signature: `lance_hybrid_search(uri, vector_column, query_vector, text_column, query, ...)`

Positional arguments:
- `uri` (VARCHAR): Dataset root path or object store URI (e.g. `s3://...`).
- `vector_column` (VARCHAR): Vector column name.
- `query_vector` (FLOAT[dim] or DOUBLE[dim], preferred): Query vector (must be non-empty; values are cast to float32). `FLOAT[]` / `DOUBLE[]` are also accepted.
- `text_column` (VARCHAR): Text column name.
- `query` (VARCHAR): Query string.

Named parameters:
- `k` (BIGINT, default `10`): Number of results to return. Must be > 0.
- `prefilter` (BOOLEAN, default `false`): If `true`, filters are applied before top-k selection.
- `alpha` (FLOAT, default `0.5`): Vector/text mixing weight. Larger values weigh vector similarity more heavily.
- `oversample_factor` (INTEGER, default `4`): Oversample factor for candidate generation. If provided, must be > 0.

Output:
- Dataset columns plus `_hybrid_score` (larger is better), `_distance`, and `_score`.

Filter semantics:
- If `prefilter=false`, filter pushdown is best-effort. If pushdown fails, the query is retried without pushed filters and DuckDB applies filters for correctness.
- If `prefilter=true`, prefilterable filters must be pushed down; otherwise, the query fails with an error.

## Namespaces

Namespaces let you treat a directory (or a remote namespace service) as a database catalog and access datasets as tables.
Expand Down
57 changes: 57 additions & 0 deletions rust/ffi/knn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ pub unsafe extern "C" fn lance_get_knn_schema(
query_values: *const f32,
query_len: usize,
k: u64,
nprobes: u64,
refine_factor: u64,
prefilter: u8,
use_index: u8,
) -> *mut c_void {
Expand All @@ -30,6 +32,8 @@ pub unsafe extern "C" fn lance_get_knn_schema(
query_values,
query_len,
k,
nprobes,
refine_factor,
prefilter,
use_index,
) {
Expand All @@ -50,6 +54,8 @@ fn get_knn_schema_inner(
query_values: *const f32,
query_len: usize,
k: u64,
nprobes: u64,
refine_factor: u64,
prefilter: u8,
use_index: u8,
) -> FfiResult<SchemaHandle> {
Expand All @@ -71,6 +77,19 @@ fn get_knn_schema_inner(
let query = Float32Array::from_iter_values(query_values.iter().copied());
scan.nearest(vector_column, &query, k_usize)
.map_err(|err| FfiError::new(ErrorCode::KnnSchema, format!("knn schema nearest: {err}")))?;
if nprobes != 0 {
let nprobes_usize = nonzero_u64_to_usize(nprobes, "nprobes")?;
scan.nprobes(nprobes_usize);
}
if refine_factor != 0 {
let refine_factor_u32: u32 = refine_factor.try_into().map_err(|_| {
FfiError::new(
ErrorCode::InvalidArgument,
"refine_factor must fit in u32",
)
})?;
scan.refine(refine_factor_u32);
}
scan.use_index(use_index != 0);
scan.disable_scoring_autoprojection();
scan.project(projection.as_ref())
Expand All @@ -90,6 +109,8 @@ pub unsafe extern "C" fn lance_create_knn_stream_ir(
query_values: *const f32,
query_len: usize,
k: u64,
nprobes: u64,
refine_factor: u64,
filter_ir: *const u8,
filter_ir_len: usize,
prefilter: u8,
Expand All @@ -101,6 +122,8 @@ pub unsafe extern "C" fn lance_create_knn_stream_ir(
query_values,
query_len,
k,
nprobes,
refine_factor,
filter_ir,
filter_ir_len,
prefilter,
Expand All @@ -124,6 +147,8 @@ fn create_knn_stream_ir_inner(
query_values: *const f32,
query_len: usize,
k: u64,
nprobes: u64,
refine_factor: u64,
filter_ir: *const u8,
filter_ir_len: usize,
prefilter: u8,
Expand Down Expand Up @@ -163,6 +188,19 @@ fn create_knn_stream_ir_inner(
format!("knn scan nearest: {err}"),
)
})?;
if nprobes != 0 {
let nprobes_usize = nonzero_u64_to_usize(nprobes, "nprobes")?;
scan.nprobes(nprobes_usize);
}
if refine_factor != 0 {
let refine_factor_u32: u32 = refine_factor.try_into().map_err(|_| {
FfiError::new(
ErrorCode::InvalidArgument,
"refine_factor must fit in u32",
)
})?;
scan.refine(refine_factor_u32);
}
scan.use_index(use_index != 0);
scan.disable_scoring_autoprojection();
scan.project(projection.as_ref()).map_err(|err| {
Expand All @@ -189,6 +227,8 @@ pub unsafe extern "C" fn lance_explain_knn_scan_ir(
query_values: *const f32,
query_len: usize,
k: u64,
nprobes: u64,
refine_factor: u64,
filter_ir: *const u8,
filter_ir_len: usize,
prefilter: u8,
Expand All @@ -201,6 +241,8 @@ pub unsafe extern "C" fn lance_explain_knn_scan_ir(
query_values,
query_len,
k,
nprobes,
refine_factor,
filter_ir,
filter_ir_len,
prefilter,
Expand All @@ -225,6 +267,8 @@ fn explain_knn_scan_ir_inner(
query_values: *const f32,
query_len: usize,
k: u64,
nprobes: u64,
refine_factor: u64,
filter_ir: *const u8,
filter_ir_len: usize,
prefilter: u8,
Expand Down Expand Up @@ -260,6 +304,19 @@ fn explain_knn_scan_ir_inner(
let query = Float32Array::from_iter_values(query_values.iter().copied());
scan.nearest(vector_column, &query, k_usize)
.map_err(|err| FfiError::new(ErrorCode::ExplainPlan, format!("knn scan nearest: {err}")))?;
if nprobes != 0 {
let nprobes_usize = nonzero_u64_to_usize(nprobes, "nprobes")?;
scan.nprobes(nprobes_usize);
}
if refine_factor != 0 {
let refine_factor_u32: u32 = refine_factor.try_into().map_err(|_| {
FfiError::new(
ErrorCode::InvalidArgument,
"refine_factor must fit in u32",
)
})?;
scan.refine(refine_factor_u32);
}
scan.use_index(use_index != 0);
scan.disable_scoring_autoprojection();
scan.project(projection.as_ref())
Expand Down
11 changes: 7 additions & 4 deletions src/include/lance_ffi.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,16 +174,19 @@ const char *lance_explain_dataset_scan_ir(void *dataset, const char **columns,

void *lance_get_knn_schema(void *dataset, const char *vector_column,
const float *query_values, size_t query_len,
uint64_t k, uint8_t prefilter, uint8_t use_index);
uint64_t k, uint64_t nprobes, uint64_t refine_factor,
uint8_t prefilter, uint8_t use_index);
void *lance_create_knn_stream_ir(void *dataset, const char *vector_column,
const float *query_values, size_t query_len,
uint64_t k, const uint8_t *filter_ir,
size_t filter_ir_len, uint8_t prefilter,
uint8_t use_index);
uint64_t k, uint64_t nprobes,
uint64_t refine_factor,
const uint8_t *filter_ir, size_t filter_ir_len,
uint8_t prefilter, uint8_t use_index);

// Explain entry point for a KNN (vector) scan over `dataset`; presumably
// returns the plan text as a C string — confirm against the Rust side.
//
// `query_values` / `query_len` describe the float32 query vector and `k` is
// the number of neighbours requested.
// `nprobes` and `refine_factor` use 0 as the "not set" sentinel: the Rust
// implementation applies each one to the scan only when it is non-zero, and
// rejects a `refine_factor` that does not fit in 32 bits.
// `use_index` is interpreted as a boolean (non-zero means true).
//
// NOTE(review): ownership/lifetime of the returned string, the encoding of
// `filter_ir`, and the exact effect of `prefilter` and `verbose` are not
// visible from this declaration — verify before relying on them.
const char *lance_explain_knn_scan_ir(void *dataset, const char *vector_column,
const float *query_values,
size_t query_len, uint64_t k,
uint64_t nprobes, uint64_t refine_factor,
const uint8_t *filter_ir,
size_t filter_ir_len, uint8_t prefilter,
uint8_t use_index, uint8_t verbose);
Expand Down
Loading