Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor NullBufferBuilder to use an internal state machine #7082

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion arrow-array/src/builder/generic_bytes_view_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
/// Return the allocated size of this builder in bytes, useful for memory accounting.
pub fn allocated_size(&self) -> usize {
let views = self.views_builder.capacity() * std::mem::size_of::<u128>();
let null = self.null_buffer_builder.allocated_size();
let null = self.null_buffer_builder.allocated_size() / 8;
let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::<usize>();
let in_progress = self.in_progress.capacity();
let tracker = match &self.string_tracker {
Expand Down
2 changes: 1 addition & 1 deletion arrow-buffer/src/builder/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ impl BooleanBufferBuilder {
self.len == 0
}

/// Returns the capacity of the buffer
/// Returns the capacity of the buffer (in bits)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This (and accompanying changes to how this is handled by callers) is technically unrelated to original intent; just something I picked up, can probably be extracted to a separate PR

#[inline]
pub fn capacity(&self) -> usize {
self.buffer.capacity() * 8
Expand Down
162 changes: 103 additions & 59 deletions arrow-buffer/src/builder/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@

use crate::{BooleanBufferBuilder, MutableBuffer, NullBuffer};

#[derive(Debug)]
enum NullBufferBuilderState {
/// All values are non-null (`true`)
Unmaterialized { len: usize, capacity: usize },
/// Materialized an actual [`BooleanBufferBuilder`], usually when encountering first null (`false`)
Materialized {
bitmap_builder: BooleanBufferBuilder,
},
}

/// Builder for creating [`NullBuffer`]
///
/// # Performance
Expand Down Expand Up @@ -48,16 +58,8 @@ use crate::{BooleanBufferBuilder, MutableBuffer, NullBuffer};
/// ```
#[derive(Debug)]
pub struct NullBufferBuilder {
/// The bitmap builder to store the null buffer:
/// * `Some` if any nulls have been appended ("materialized")
/// * `None` if no nulls have been appended.
bitmap_builder: Option<BooleanBufferBuilder>,
/// Length of the buffer before materializing.
///
/// if `bitmap_buffer` buffer is `Some`, this value is not used.
len: usize,
/// Initial capacity of the `bitmap_builder`, when it is materialized.
capacity: usize,
state: NullBufferBuilderState,
initial_capacity: usize,
}

impl NullBufferBuilder {
Expand All @@ -68,18 +70,16 @@ impl NullBufferBuilder {
/// size in bits (not bytes) that will be allocated at minimum.
pub fn new(capacity: usize) -> Self {
Self {
bitmap_builder: None,
len: 0,
capacity,
state: NullBufferBuilderState::Unmaterialized { len: 0, capacity },
initial_capacity: capacity,
}
}

/// Creates a new builder with given length.
pub fn new_with_len(len: usize) -> Self {
Self {
bitmap_builder: None,
len,
capacity: len,
state: NullBufferBuilderState::Unmaterialized { len, capacity: len },
initial_capacity: len,
}
}

Expand All @@ -88,33 +88,32 @@ impl NullBufferBuilder {
let capacity = buffer.len() * 8;
assert!(len <= capacity);

let bitmap_builder = Some(BooleanBufferBuilder::new_from_buffer(buffer, len));
let bitmap_builder = BooleanBufferBuilder::new_from_buffer(buffer, len);
Self {
bitmap_builder,
len,
capacity,
state: NullBufferBuilderState::Materialized { bitmap_builder },
initial_capacity: capacity,
}
}

/// Appends `n` `true`s into the builder
/// to indicate that these `n` items are not nulls.
#[inline]
pub fn append_n_non_nulls(&mut self, n: usize) {
if let Some(buf) = self.bitmap_builder.as_mut() {
buf.append_n(n, true)
} else {
self.len += n;
match &mut self.state {
NullBufferBuilderState::Unmaterialized { len, capacity: _ } => *len += n,
NullBufferBuilderState::Materialized { bitmap_builder } => {
bitmap_builder.append_n(n, true)
}
}
}

/// Appends a `true` into the builder
/// to indicate that this item is not null.
#[inline]
pub fn append_non_null(&mut self) {
if let Some(buf) = self.bitmap_builder.as_mut() {
buf.append(true)
} else {
self.len += 1;
match &mut self.state {
NullBufferBuilderState::Unmaterialized { len, capacity: _ } => *len += 1,
NullBufferBuilderState::Materialized { bitmap_builder } => bitmap_builder.append(true),
}
}

Expand All @@ -123,15 +122,23 @@ impl NullBufferBuilder {
#[inline]
pub fn append_n_nulls(&mut self, n: usize) {
self.materialize_if_needed();
self.bitmap_builder.as_mut().unwrap().append_n(n, false);
if let NullBufferBuilderState::Materialized { bitmap_builder } = &mut self.state {
bitmap_builder.append_n(n, false);
} else {
unreachable!("Materialization should have occurred by now")
}
}

/// Appends a `false` into the builder
/// to indicate that this item is null.
#[inline]
pub fn append_null(&mut self) {
self.materialize_if_needed();
self.bitmap_builder.as_mut().unwrap().append(false);
if let NullBufferBuilderState::Materialized { bitmap_builder } = &mut self.state {
bitmap_builder.append(false);
} else {
unreachable!("Materialization should have occurred by now")
}
}

/// Appends a boolean value into the builder.
Expand All @@ -147,10 +154,11 @@ impl NullBufferBuilder {
/// Gets a bit in the buffer at `index`
#[inline]
pub fn is_valid(&self, index: usize) -> bool {
if let Some(ref buf) = self.bitmap_builder {
buf.get_bit(index)
} else {
true
match &self.state {
NullBufferBuilderState::Unmaterialized { .. } => true,
NullBufferBuilderState::Materialized { bitmap_builder } => {
bitmap_builder.get_bit(index)
}
}
}

Expand All @@ -159,10 +167,12 @@ impl NullBufferBuilder {
/// If `len` is greater than the buffer's current length, this has no effect
#[inline]
pub fn truncate(&mut self, len: usize) {
if let Some(buf) = self.bitmap_builder.as_mut() {
buf.truncate(len);
} else if len <= self.len {
self.len = len
match &mut self.state {
NullBufferBuilderState::Unmaterialized { len: state_len, .. } if len <= *state_len => {
*state_len = len
}
NullBufferBuilderState::Materialized { bitmap_builder } => bitmap_builder.truncate(len),
_ => (),
}
}

Expand All @@ -172,64 +182,98 @@ impl NullBufferBuilder {
if slice.iter().any(|v| !v) {
self.materialize_if_needed()
}
if let Some(buf) = self.bitmap_builder.as_mut() {
buf.append_slice(slice)
} else {
self.len += slice.len();
match &mut self.state {
NullBufferBuilderState::Unmaterialized { len, capacity: _ } => *len += slice.len(),
NullBufferBuilderState::Materialized { bitmap_builder } => {
bitmap_builder.append_slice(slice)
}
}
}

/// Builds the null buffer and resets the builder.
/// Returns `None` if the builder only contains `true`s.
pub fn finish(&mut self) -> Option<NullBuffer> {
self.len = 0;
Some(NullBuffer::new(self.bitmap_builder.take()?.finish()))
match &mut self.state {
NullBufferBuilderState::Unmaterialized { len, capacity: _ } => {
*len = 0;
None
}
NullBufferBuilderState::Materialized { bitmap_builder } => {
let boolean_buffer = bitmap_builder.finish();
self.state = NullBufferBuilderState::Unmaterialized {
len: 0,
// TODO: Only usage of self.initial_capacity here. Should we be using it,
// or instead should use bitmap_builder.capacity()? Considering
// self.initial_capacity <= bitmap_builder.capacity()
capacity: self.initial_capacity,
Comment on lines +205 to +208
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is to preserve the current behaviour of finish() where it resets to the original capacity provided. This seems wrong to me as ideally would want to reset to the current capacity, which may have increased since it was originally specified.

};
Some(NullBuffer::new(boolean_buffer))
}
}
}

/// Builds the [NullBuffer] without resetting the builder.
pub fn finish_cloned(&self) -> Option<NullBuffer> {
let buffer = self.bitmap_builder.as_ref()?.finish_cloned();
Some(NullBuffer::new(buffer))
match &self.state {
NullBufferBuilderState::Unmaterialized { .. } => None,
NullBufferBuilderState::Materialized { bitmap_builder } => {
let buffer = bitmap_builder.finish_cloned();
Some(NullBuffer::new(buffer))
}
}
}

/// Returns the inner bitmap builder as slice
pub fn as_slice(&self) -> Option<&[u8]> {
Some(self.bitmap_builder.as_ref()?.as_slice())
match &self.state {
NullBufferBuilderState::Unmaterialized { .. } => None,
NullBufferBuilderState::Materialized { bitmap_builder } => {
Some(bitmap_builder.as_slice())
}
}
}

fn materialize_if_needed(&mut self) {
if self.bitmap_builder.is_none() {
if let NullBufferBuilderState::Unmaterialized { .. } = self.state {
self.materialize()
}
}

#[cold]
fn materialize(&mut self) {
if self.bitmap_builder.is_none() {
let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity));
b.append_n(self.len, true);
self.bitmap_builder = Some(b);
if let NullBufferBuilderState::Unmaterialized { len, capacity } = self.state {
let mut bitmap_builder = BooleanBufferBuilder::new(len.max(capacity));
bitmap_builder.append_n(len, true);
self.state = NullBufferBuilderState::Materialized { bitmap_builder }
}
}

/// Return a mutable reference to the inner bitmap slice.
pub fn as_slice_mut(&mut self) -> Option<&mut [u8]> {
self.bitmap_builder.as_mut().map(|b| b.as_slice_mut())
match &mut self.state {
NullBufferBuilderState::Unmaterialized { .. } => None,
NullBufferBuilderState::Materialized { bitmap_builder } => {
Some(bitmap_builder.as_slice_mut())
}
}
}

/// Return the allocated size of this builder, in bytes, useful for memory accounting.
/// Return the allocated size of this builder, in bits, useful for memory accounting.
pub fn allocated_size(&self) -> usize {
self.bitmap_builder
.as_ref()
.map(|b| b.capacity())
.unwrap_or(0)
match &self.state {
NullBufferBuilderState::Unmaterialized { .. } => 0,
NullBufferBuilderState::Materialized { bitmap_builder } => bitmap_builder.capacity(),
}
}
}

impl NullBufferBuilder {
/// Return the number of bits in the buffer.
pub fn len(&self) -> usize {
self.bitmap_builder.as_ref().map_or(self.len, |b| b.len())
match &self.state {
NullBufferBuilderState::Unmaterialized { len, capacity: _ } => *len,
NullBufferBuilderState::Materialized { bitmap_builder } => bitmap_builder.len(),
}
}

/// Check if the builder is empty.
Expand Down
Loading