Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,6 @@ redundant_pub_crate = "deny"
string_add_assign = "deny"
string_add = "deny"
string_lit_as_bytes = "deny"
string_to_string = "deny"
use_self = "deny"
dbg_macro = "deny"
trait_duplication_in_bounds = "deny"
Expand Down
2 changes: 1 addition & 1 deletion python/python/tests/test_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def can_write(data, dataset, schema=None):
lance.write_dataset(pa.table(data, schema=schema), dataset.uri, mode="append")

def cannot_write(data, dataset, schema=None):
with pytest.raises(Exception, match="contained null values"):
with pytest.raises(Exception, match=r"contain(ed|s) null values"):
can_write(data, dataset, schema)

nullable_dataset = lance.write_dataset(
Expand Down
1 change: 0 additions & 1 deletion rust/compression/bitpacking/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

use arrayref::{array_mut_ref, array_ref};
use core::mem::size_of;
use paste::paste;

pub const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7];

Expand Down
17 changes: 8 additions & 9 deletions rust/examples/src/hnsw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,14 @@ async fn main() {
let max_level = 7;

// 1. Generate a synthetic test data of specified dimensions
let dataset = if uri.is_none() {
println!("No uri is provided, generating test dataset...");
let output = "test_vectors.lance";
create_test_vector_dataset(output, 1000, 64).await;
Dataset::open(output).await.expect("Failed to open dataset")
} else {
Dataset::open(uri.as_ref().unwrap())
.await
.expect("Failed to open dataset")
let dataset = match uri.as_deref() {
None => {
println!("No uri is provided, generating test dataset...");
let output = "test_vectors.lance";
create_test_vector_dataset(output, 1000, 64).await;
Dataset::open(output).await.expect("Failed to open dataset")
}
Some(uri) => Dataset::open(uri).await.expect("Failed to open dataset"),
};

println!("Dataset schema: {:#?}", dataset.schema());
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-core/src/datatypes/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ pub struct Field {
pub encoding: Option<Encoding>,
pub nullable: bool,

pub children: Vec<Field>,
pub children: Vec<Self>,

/// Dictionary value array if this field is dictionary.
pub dictionary: Option<Dictionary>,
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-core/src/datatypes/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1533,7 +1533,7 @@ pub fn parse_field_path(path: &str) -> Result<Vec<String>> {
/// For example: ["parent", "child.with.dot"] formats to "parent.`child.with.dot`"
/// For example: ["meta-data", "user-id"] formats to "`meta-data`.`user-id`"
/// Backticks in field names are escaped by doubling them.
/// For example: ["field`with`backticks"] formats to "`field``with``backticks`"
/// For example: \["field`with`backticks"\] formats to "`field``with``backticks`"
pub fn format_field_path(fields: &[&str]) -> String {
fields
.iter()
Expand Down
9 changes: 2 additions & 7 deletions rust/lance-core/src/utils/deletion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ const BITMAP_THRESDHOLD: usize = 5_000;
// TODO: Benchmark to find a better value.

/// Represents a set of deleted row offsets in a single fragment.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Default)]
pub enum DeletionVector {
#[default]
NoDeletions,
Set(HashSet<u32>),
Bitmap(RoaringBitmap),
Expand Down Expand Up @@ -182,12 +183,6 @@ impl OffsetMapper {
}
}

impl Default for DeletionVector {
fn default() -> Self {
Self::NoDeletions
}
}

impl From<&DeletionVector> for RoaringBitmap {
fn from(value: &DeletionVector) -> Self {
match value {
Expand Down
22 changes: 14 additions & 8 deletions rust/lance-datafusion/src/substrait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,17 @@ fn remove_extension_types(
for (substrait_field, arrow_field) in fields.types.iter().zip(arrow_schema.fields.iter()) {
let num_fields = count_fields(substrait_field);

let kind = substrait_field.kind.as_ref().unwrap();
let is_user_defined = match kind {
Kind::UserDefined(_) => true,
// Keep compatibility with older Substrait plans.
#[allow(deprecated)]
Kind::UserDefinedTypeReference(_) => true,
_ => false,
};

if !substrait_schema.names[field_index].starts_with("__unlikely_name_placeholder")
&& !matches!(
substrait_field.kind.as_ref().unwrap(),
Kind::UserDefined(_) | Kind::UserDefinedTypeReference(_)
)
&& !is_user_defined
{
kept_substrait_fields.push(substrait_field.clone());
kept_arrow_fields.push(arrow_field.clone());
Expand Down Expand Up @@ -118,10 +124,10 @@ fn remove_extension_types(
fn remap_expr_references(expr: &mut Expression, mapping: &HashMap<usize, usize>) -> Result<()> {
match expr.rex_type.as_mut().unwrap() {
// Simple, no field references possible
RexType::Literal(_)
| RexType::Nested(_)
| RexType::Enum(_)
| RexType::DynamicParameter(_) => Ok(()),
RexType::Literal(_) | RexType::Nested(_) | RexType::DynamicParameter(_) => Ok(()),
// Enum literals are deprecated in Substrait and should only appear in older plans.
#[allow(deprecated)]
RexType::Enum(_) => Ok(()),
// Complex operators not supported in filters
RexType::WindowFunction(_) | RexType::Subquery(_) => Err(Error::invalid_input(
"Window functions or subqueries not allowed in filter expression",
Expand Down
56 changes: 30 additions & 26 deletions rust/lance-index/src/scalar/btree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1755,19 +1755,21 @@ pub async fn train_btree_index(
Field::new(BTREE_IDS_COLUMN, DataType::UInt64, false),
]));

let mut sub_index_file;
if partition_id.is_none() {
sub_index_file = index_store
.new_index_file(BTREE_PAGES_NAME, flat_schema.clone())
.await?;
} else {
sub_index_file = index_store
.new_index_file(
part_page_data_file_path(partition_id.unwrap()).as_str(),
flat_schema.clone(),
)
.await?;
}
let mut sub_index_file = match partition_id {
None => {
index_store
.new_index_file(BTREE_PAGES_NAME, flat_schema.clone())
.await?
}
Some(partition_id) => {
index_store
.new_index_file(
part_page_data_file_path(partition_id).as_str(),
flat_schema.clone(),
)
.await?
}
};

let mut encoded_batches = Vec::new();
let mut batch_idx = 0;
Expand Down Expand Up @@ -1802,19 +1804,21 @@ pub async fn train_btree_index(
RANGE_PARTITIONED_META_KEY.to_string(),
range_id.is_some().to_string(),
);
let mut btree_index_file;
if partition_id.is_none() {
btree_index_file = index_store
.new_index_file(BTREE_LOOKUP_NAME, Arc::new(file_schema))
.await?;
} else {
btree_index_file = index_store
.new_index_file(
part_lookup_file_path(partition_id.unwrap()).as_str(),
Arc::new(file_schema),
)
.await?;
}
let mut btree_index_file = match partition_id {
None => {
index_store
.new_index_file(BTREE_LOOKUP_NAME, Arc::new(file_schema))
.await?
}
Some(partition_id) => {
index_store
.new_index_file(
part_lookup_file_path(partition_id).as_str(),
Arc::new(file_schema),
)
.await?
}
};
btree_index_file.write_record_batch(record_batch).await?;
btree_index_file.finish().await?;
Ok(())
Expand Down
6 changes: 3 additions & 3 deletions rust/lance-index/src/scalar/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -965,9 +965,9 @@ impl PartialEq for ScalarIndexSearch {
/// modify the results of scalar lookups
#[derive(Debug, Clone)]
pub enum ScalarIndexExpr {
Not(Box<ScalarIndexExpr>),
And(Box<ScalarIndexExpr>, Box<ScalarIndexExpr>),
Or(Box<ScalarIndexExpr>, Box<ScalarIndexExpr>),
Not(Box<Self>),
And(Box<Self>, Box<Self>),
Or(Box<Self>, Box<Self>),
Query(ScalarIndexSearch),
}

Expand Down
9 changes: 2 additions & 7 deletions rust/lance-index/src/scalar/inverted/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,18 +71,13 @@ impl Default for FtsSearchParams {
}
}

#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Default)]
pub enum Operator {
And,
#[default]
Or,
}

impl Default for Operator {
fn default() -> Self {
Self::Or
}
}

impl TryFrom<&str> for Operator {
type Error = Error;
fn try_from(value: &str) -> Result<Self> {
Expand Down
10 changes: 2 additions & 8 deletions rust/lance-io/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@ pub mod utils;
pub use scheduler::{bytes_read_counter, iops_counter};

/// Defines a selection of rows to read from a file/batch
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone, PartialEq, Default)]
pub enum ReadBatchParams {
/// Select a contiguous range of rows
Range(Range<usize>),
/// Select multiple contiguous ranges of rows
Ranges(Arc<[Range<u64>]>),
/// Select all rows (this is the default)
#[default]
RangeFull,
/// Select all rows up to a given index
RangeTo(RangeTo<usize>),
Expand Down Expand Up @@ -77,13 +78,6 @@ impl std::fmt::Display for ReadBatchParams {
}
}

impl Default for ReadBatchParams {
fn default() -> Self {
// Default of ReadBatchParams is reading the full batch.
Self::RangeFull
}
}

impl From<&[u32]> for ReadBatchParams {
fn from(value: &[u32]) -> Self {
Self::Indices(UInt32Array::from_iter_values(value.iter().copied()))
Expand Down
2 changes: 1 addition & 1 deletion rust/lance/src/dataset/write/merge_insert/inserted_rows.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pub enum KeyValue {
Int64(i64),
UInt64(u64),
Binary(Vec<u8>),
Composite(Vec<KeyValue>),
Composite(Vec<Self>),
}

impl KeyValue {
Expand Down
25 changes: 13 additions & 12 deletions rust/lance/src/index/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -272,18 +272,19 @@ pub(super) async fn build_scalar_index(
let training_request =
plugin.new_training_request(params.params.as_deref().unwrap_or("{}"), &field)?;

let training_data = if preprocessed_data.is_none() {
load_training_data(
dataset,
column,
training_request.criteria(),
None,
train,
fragment_ids.clone(),
)
.await?
} else {
preprocessed_data.unwrap()
let training_data = match preprocessed_data {
Some(preprocessed_data) => preprocessed_data,
None => {
load_training_data(
dataset,
column,
training_request.criteria(),
None,
train,
fragment_ids.clone(),
)
.await?
}
};

plugin
Expand Down
5 changes: 2 additions & 3 deletions rust/lance/src/index/vector/ivf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1231,8 +1231,7 @@ pub async fn build_ivf_model(
) -> Result<IvfModel> {
let num_partitions = params.num_partitions.unwrap();
let centroids = params.centroids.clone();
if centroids.is_some() && !params.retrain {
let centroids = centroids.unwrap();
if let (Some(centroids), false) = (centroids.as_deref(), params.retrain) {
info!("Pre-computed IVF centroids is provided, skip IVF training");
if centroids.values().len() != num_partitions * dim {
return Err(Error::Index {
Expand All @@ -1244,7 +1243,7 @@ pub async fn build_ivf_model(
location: location!(),
});
}
return Ok(IvfModel::new(centroids.as_ref().clone(), None));
return Ok(IvfModel::new(centroids.clone(), None));
}
let sample_size_hint = num_partitions * params.sample_rate;

Expand Down
2 changes: 1 addition & 1 deletion rust/lance/src/io/exec/projection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ pub enum Selection<'a> {
/// Selects this fields and all subfields
FullField(&'a str),
/// For a struct, selections of subfields
StructProjection(&'a str, Vec<Selection<'a>>),
StructProjection(&'a str, Vec<Self>),
}

impl Selection<'_> {
Expand Down
Loading