Skip to content

Commit 83ee393

Browse files
committed
Migrate from Databend to jsonb-rs
1 parent 1605dd0 commit 83ee393

File tree

22 files changed

+4528
-0
lines changed

22 files changed

+4528
-0
lines changed

.github/workflows/rust.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
name: Rust
2+
3+
on:
4+
push:
5+
branches: [ "main" ]
6+
pull_request:
7+
branches: [ "main" ]
8+
9+
env:
10+
CARGO_TERM_COLOR: always
11+
12+
jobs:
13+
build:
14+
15+
runs-on: ubuntu-latest
16+
17+
steps:
18+
- uses: actions/checkout@v3
19+
- name: Build
20+
run: cargo build --verbose
21+
- name: Run tests
22+
run: cargo test --verbose

.gitignore

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,14 @@ Cargo.lock
88

99
# These are backup files generated by rustfmt
1010
**/*.rs.bk
11+
12+
# IDE and editor
13+
.vscode
14+
.idea
15+
16+
# env files for backends
17+
.env
18+
19+
# profiling
20+
flamegraph.svg
21+
perf.*

Cargo.toml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright 2023 Datafuse Labs
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
[package]
16+
authors = ["Databend Authors <[email protected]>"]
17+
categories = ["encoding"]
18+
description = "JSONB implementation in Rust."
19+
edition = "2021"
20+
homepage = "https://github.com/datafuselabs/jsonb-rs"
21+
keywords = ["json", "jsonb"]
22+
license = "Apache-2.0"
23+
name = "jsonb-rs"
24+
repository = "https://github.com/datafuselabs/jsonb-rs"
25+
version = "0.1.0"
26+
rust-version = "1.60"
27+
28+
[dependencies]
29+
byteorder = "1.4.3"
30+
fast-float = "0.2.0"
31+
ordered-float = { version = "3.4.0", default-features = false }
32+
serde = { version = "1.0.145", features = ["derive", "rc"] }
33+
serde_json = { version = "1.0.85", default-features = false, features = ["preserve_order"] }

rust-toolchain.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[toolchain]
2+
channel = "nightly-2022-12-15"
3+
components = ["rustfmt", "clippy", "rust-src", "miri"]

src/constants.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// Copyright 2023 Datafuse Labs.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// JSONB header constants
// One-byte prefixes: each kind occupies a single bit among the top three.
pub(crate) const ARRAY_PREFIX: u8 = 0b1000_0000;
pub(crate) const OBJECT_PREFIX: u8 = 0b0100_0000;
pub(crate) const SCALAR_PREFIX: u8 = 0b0010_0000;

// Container kind tags, compared against `header & CONTAINER_HEADER_TYPE_MASK`.
pub(crate) const ARRAY_CONTAINER_TAG: u32 = 0x8000_0000;
pub(crate) const OBJECT_CONTAINER_TAG: u32 = 0x4000_0000;
pub(crate) const SCALAR_CONTAINER_TAG: u32 = 0x2000_0000;

// The top three bits of a container header hold the kind,
// the remaining 29 bits hold the length.
pub(crate) const CONTAINER_HEADER_TYPE_MASK: u32 = 0xE000_0000;
pub(crate) const CONTAINER_HEADER_LEN_MASK: u32 = 0x1FFF_FFFF;

// JSONB JEntry constants
// Value type tags; every tag fits inside `JENTRY_TYPE_MASK`.
pub(crate) const NULL_TAG: u32 = 0x0000_0000;
pub(crate) const STRING_TAG: u32 = 0x1000_0000;
pub(crate) const NUMBER_TAG: u32 = 0x2000_0000;
pub(crate) const FALSE_TAG: u32 = 0x3000_0000;
pub(crate) const TRUE_TAG: u32 = 0x4000_0000;
pub(crate) const CONTAINER_TAG: u32 = 0x5000_0000;

// JSONB number constants
pub(crate) const NUMBER_ZERO: u8 = 0x00;
pub(crate) const NUMBER_NAN: u8 = 0x10;
pub(crate) const NUMBER_INF: u8 = 0x20;
pub(crate) const NUMBER_NEG_INF: u8 = 0x30;
pub(crate) const NUMBER_INT: u8 = 0x40;
pub(crate) const NUMBER_UINT: u8 = 0x50;
pub(crate) const NUMBER_FLOAT: u8 = 0x60;

// @todo support offset mode
// The offset flag is not used until offset mode is supported, hence the `allow`.
#[allow(dead_code)]
pub(crate) const JENTRY_IS_OFF_FLAG: u32 = 0x8000_0000;
pub(crate) const JENTRY_TYPE_MASK: u32 = 0x7000_0000;
pub(crate) const JENTRY_OFF_LEN_MASK: u32 = 0x0FFF_FFFF;

// JSON text constants
pub(crate) const UNICODE_LEN: usize = 4;

// JSON text escape characters constants
pub(crate) const BS: char = '\\'; // U+005C, `\\` Backslash
pub(crate) const QU: char = '"'; // U+0022, `\"` Double quotation mark
pub(crate) const SD: char = '/'; // U+002F, `\/` Slash or divide
pub(crate) const BB: char = '\u{0008}'; // `\b` Backspace
pub(crate) const FF: char = '\u{000C}'; // `\f` Formfeed Page Break
pub(crate) const NN: char = '\n'; // U+000A, `\n` Newline
pub(crate) const RR: char = '\r'; // U+000D, `\r` Carriage Return
pub(crate) const TT: char = '\t'; // U+0009, `\t` Horizontal Tab

src/de.rs

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
// Copyright 2023 Datafuse Labs.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use std::borrow::Cow;
16+
use std::collections::VecDeque;
17+
18+
use byteorder::BigEndian;
19+
use byteorder::ReadBytesExt;
20+
21+
use super::constants::*;
22+
use super::error::*;
23+
use super::jentry::JEntry;
24+
use super::number::Number;
25+
use super::parser::parse_value;
26+
use super::value::Object;
27+
use super::value::Value;
28+
29+
/// The binary `JSONB` contains three parts, `Header`, `JEntry` and `RawData`.
30+
/// This structure can be nested. Each group of structures starts with a `Header`.
31+
/// The upper-level `Value` will store the `Header` length or offset of
32+
/// the lower-level `Value`.
33+
34+
/// `Header` stores the type of the `Value`: `Array`, `Object` or `Scalar`.
35+
/// `Scalar` has only one `Value`, and a corresponding `JEntry`.
36+
/// `Array` and `Object` are nested types; they have multiple lower-level `Values`.
37+
/// So the `Header` also stores the number of lower-level `Values`.
38+
39+
/// `JEntry` stores the types of `Scalar Value`, including `Null`, `True`, `False`,
40+
/// `Number`, `String` and `Container`. They have three different decode methods.
41+
/// 1. `Null`, `True` and `False` can be obtained by `JEntry`, no extra work required.
42+
/// 2. `Number` and `String` have related `RawData`; `JEntry` stores the length
43+
/// or offset of this data, the `Value` can be read out and then decoded.
44+
/// 3. `Container` is actually a nested `Array` or `Object` with the same structure,
45+
/// `JEntry` stores the length or offset of the lower-level `Header`,
46+
/// from where the same decode process can begin.
47+
48+
/// `RawData` is the encoded `Value`.
49+
/// `Number` is a variable-length `Decimal` that stores both int and float values.
50+
/// `String` is the original string, can be borrowed directly without extra decode.
51+
/// `Array` and `Object` are lower-level encoded `JSONB` values.
52+
/// The upper-level doesn't care about the specific content.
53+
/// Decode can be executed recursively.
54+
55+
/// Decode `JSONB` Value from binary bytes.
56+
pub fn from_slice(buf: &[u8]) -> Result<Value<'_>, Error> {
57+
let mut decoder = Decoder::new(buf);
58+
match decoder.decode() {
59+
Ok(value) => Ok(value),
60+
// for compatible with the first version of `JSON` text, parse it again
61+
Err(_) => parse_value(buf),
62+
}
63+
}
64+
65+
#[repr(transparent)]
66+
pub struct Decoder<'a> {
67+
buf: &'a [u8],
68+
}
69+
70+
impl<'a> Decoder<'a> {
71+
pub fn new(buf: &'a [u8]) -> Decoder<'a> {
72+
Self { buf }
73+
}
74+
75+
pub fn decode(&mut self) -> Result<Value<'a>, Error> {
76+
// Valid `JSONB` Value has at least one `Header`
77+
if self.buf.len() < 4 {
78+
return Err(Error::InvalidJsonb);
79+
}
80+
let value = self.decode_jsonb()?;
81+
Ok(value)
82+
}
83+
84+
// Read value type from the `Header`
85+
// `Scalar` has one `JEntry`
86+
// `Array` and `Object` store the numbers of elements
87+
fn decode_jsonb(&mut self) -> Result<Value<'a>, Error> {
88+
let container_header = self.buf.read_u32::<BigEndian>()?;
89+
90+
match container_header & CONTAINER_HEADER_TYPE_MASK {
91+
SCALAR_CONTAINER_TAG => {
92+
let encoded = self.buf.read_u32::<BigEndian>()?;
93+
let jentry = JEntry::decode_jentry(encoded);
94+
self.decode_scalar(jentry)
95+
}
96+
ARRAY_CONTAINER_TAG => self.decode_array(container_header),
97+
OBJECT_CONTAINER_TAG => self.decode_object(container_header),
98+
_ => Err(Error::InvalidJsonbHeader),
99+
}
100+
}
101+
102+
// Decode `Value` based on the `JEntry`
103+
// `Null` and `Boolean` don't need to read extra data
104+
// `Number` and `String` `JEntry` stores the length or offset of the data,
105+
// read them and decode to the `Value`
106+
// `Array` and `Object` need to read nested data from the lower-level `Header`
107+
fn decode_scalar(&mut self, jentry: JEntry) -> Result<Value<'a>, Error> {
108+
match jentry.type_code {
109+
NULL_TAG => Ok(Value::Null),
110+
TRUE_TAG => Ok(Value::Bool(true)),
111+
FALSE_TAG => Ok(Value::Bool(false)),
112+
STRING_TAG => {
113+
let offset = jentry.length as usize;
114+
let s = std::str::from_utf8(&self.buf[..offset]).unwrap();
115+
self.buf = &self.buf[offset..];
116+
Ok(Value::String(Cow::Borrowed(s)))
117+
}
118+
NUMBER_TAG => {
119+
let offset = jentry.length as usize;
120+
let n = Number::decode(&self.buf[..offset]);
121+
self.buf = &self.buf[offset..];
122+
Ok(Value::Number(n))
123+
}
124+
CONTAINER_TAG => self.decode_jsonb(),
125+
_ => Err(Error::InvalidJsonbJEntry),
126+
}
127+
}
128+
129+
// Decode the numbers of values from the `Header`,
130+
// then read all `JEntries`, finally decode the `Value` by `JEntry`
131+
fn decode_array(&mut self, container_header: u32) -> Result<Value<'a>, Error> {
132+
let length = (container_header & CONTAINER_HEADER_LEN_MASK) as usize;
133+
let jentries = self.decode_jentries(length)?;
134+
let mut values: Vec<Value> = Vec::with_capacity(length);
135+
// decode all values
136+
for jentry in jentries.into_iter() {
137+
let value = self.decode_scalar(jentry)?;
138+
values.push(value);
139+
}
140+
141+
let value = Value::Array(values);
142+
Ok(value)
143+
}
144+
145+
// The basic process is the same as that of `Array`
146+
// but first decode the keys and then decode the values
147+
fn decode_object(&mut self, container_header: u32) -> Result<Value<'a>, Error> {
148+
let length = (container_header & CONTAINER_HEADER_LEN_MASK) as usize;
149+
let mut jentries = self.decode_jentries(length * 2)?;
150+
151+
let mut keys: VecDeque<Value> = VecDeque::with_capacity(length);
152+
// decode all keys first
153+
for _ in 0..length {
154+
let jentry = jentries.pop_front().unwrap();
155+
let key = self.decode_scalar(jentry)?;
156+
keys.push_back(key);
157+
}
158+
159+
let mut obj = Object::new();
160+
// decode all values
161+
for _ in 0..length {
162+
let key = keys.pop_front().unwrap();
163+
let k = key.as_str().unwrap();
164+
let jentry = jentries.pop_front().unwrap();
165+
let value = self.decode_scalar(jentry)?;
166+
obj.insert(k.to_string(), value);
167+
}
168+
169+
let value = Value::Object(obj);
170+
Ok(value)
171+
}
172+
173+
// Decode `JEntries` for `Array` and `Object`
174+
fn decode_jentries(&mut self, length: usize) -> Result<VecDeque<JEntry>, Error> {
175+
let mut jentries: VecDeque<JEntry> = VecDeque::with_capacity(length);
176+
for _ in 0..length {
177+
let encoded = self.buf.read_u32::<BigEndian>()?;
178+
let jentry = JEntry::decode_jentry(encoded);
179+
jentries.push_back(jentry);
180+
}
181+
Ok(jentries)
182+
}
183+
}

0 commit comments

Comments
 (0)