Skip to content

Commit 0163be6

Browse files
committed
optimise tracer performance
1 parent 1739d4a commit 0163be6

File tree

9 files changed

+151
-25
lines changed

9 files changed

+151
-25
lines changed

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,17 @@ rand_chacha = { version = "0.3", features = ["serde1"] }
5757
rand_core = "0.6"
5858
rayon = "1.10"
5959
rkyv = { version = "0.8", features = ["pointer_width_32"] }
60+
rustc-hash = "2.0.0"
6061
secp = "0.4.1"
6162
serde = { version = "1.0", features = ["derive", "rc"] }
6263
serde_json = "1.0"
64+
smallvec = { version = "1.13.2", features = [
65+
"const_generics",
66+
"const_new",
67+
"serde",
68+
"union",
69+
"write",
70+
] }
6371
strum = "0.26"
6472
strum_macros = "0.26"
6573
substrate-bn = { version = "0.6.0" }

ceno_emul/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,12 @@ itertools.workspace = true
1919
multilinear_extensions.workspace = true
2020
num-derive.workspace = true
2121
num-traits.workspace = true
22+
rayon.workspace = true
2223
rrs_lib = { package = "rrs-succinct", version = "0.1.0" }
24+
rustc-hash.workspace = true
2325
secp.workspace = true
2426
serde.workspace = true
27+
smallvec.workspace = true
2528
strum.workspace = true
2629
strum_macros.workspace = true
2730
substrate-bn.workspace = true

ceno_emul/src/chunked_vec.rs

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
use rayon::iter::{IntoParallelIterator, ParallelIterator};
2+
use std::ops::{Index, IndexMut};
3+
4+
/// Chunk size used when a `ChunkedVec` is built through `Default`.
const DEFAULT_CHUNK_SIZE: usize = 1 << 20;

/// A vector that stores its elements in fixed-size chunks.
///
/// New storage is added one whole chunk at a time; chunks that already exist
/// are left untouched when more are appended.
#[derive(Debug, Clone)]
pub struct ChunkedVec<T> {
    /// Allocated chunks; each one is created holding `chunk_size` elements.
    chunks: Vec<Vec<T>>,
    /// Number of element slots per chunk. Must be non-zero, because every
    /// index is split into chunk/offset with a division by this value.
    chunk_size: usize,
    /// Logical length: one past the highest index written so far.
    len: usize,
}

impl<T> Default for ChunkedVec<T> {
    /// Creates an empty vector with a non-zero chunk size.
    ///
    /// A derived `Default` would leave `chunk_size` at 0, so the first
    /// mutable access on a defaulted value would divide by zero.
    fn default() -> Self {
        Self {
            chunks: Vec::new(),
            chunk_size: DEFAULT_CHUNK_SIZE,
            len: 0,
        }
    }
}
11+
12+
impl<T: Default + Send> ChunkedVec<T> {
13+
/// create a new ChunkedVec with a given chunk size.
14+
pub fn new(chunk_size: usize) -> Self {
15+
assert!(chunk_size > 0, "chunk_size must be > 0");
16+
Self {
17+
chunks: Vec::new(),
18+
chunk_size,
19+
len: 0,
20+
}
21+
}
22+
23+
/// get the current number of elements.
24+
pub fn len(&self) -> usize {
25+
self.len
26+
}
27+
28+
/// returns true if the vector is empty.
29+
pub fn is_empty(&self) -> bool {
30+
self.len == 0
31+
}
32+
33+
/// access element by index (immutable).
34+
pub fn get(&self, index: usize) -> Option<&T> {
35+
if index >= self.len {
36+
return None;
37+
}
38+
let chunk_idx = index / self.chunk_size;
39+
let within_idx = index % self.chunk_size;
40+
self.chunks.get(chunk_idx)?.get(within_idx)
41+
}
42+
43+
/// access element by index (mutable).
44+
/// get mutable reference to element at index, auto-creating chunks as needed
45+
pub fn get_or_create(&mut self, index: usize) -> &mut T {
46+
let chunk_idx = index / self.chunk_size;
47+
let within_idx = index % self.chunk_size;
48+
49+
// Ensure enough chunks exist
50+
if chunk_idx >= self.chunks.len() {
51+
let to_create = chunk_idx + 1 - self.chunks.len();
52+
53+
// Use rayon to create all missing chunks in parallel
54+
let mut new_chunks: Vec<Vec<T>> = (0..to_create)
55+
.map(|_| {
56+
(0..self.chunk_size)
57+
.into_par_iter()
58+
.map(|_| Default::default())
59+
.collect::<Vec<_>>()
60+
})
61+
.collect();
62+
63+
self.chunks.append(&mut new_chunks);
64+
}
65+
66+
let chunk = &mut self.chunks[chunk_idx];
67+
68+
// Update the overall length
69+
if index >= self.len {
70+
self.len = index + 1;
71+
}
72+
73+
&mut chunk[within_idx]
74+
}
75+
}
76+
77+
impl<T: Default + Send> Index<usize> for ChunkedVec<T> {
78+
type Output = T;
79+
80+
fn index(&self, index: usize) -> &Self::Output {
81+
self.get(index).expect("index out of bounds")
82+
}
83+
}
84+
85+
impl<T: Default + Send> IndexMut<usize> for ChunkedVec<T> {
86+
fn index_mut(&mut self, index: usize) -> &mut Self::Output {
87+
self.get_or_create(index)
88+
}
89+
}

ceno_emul/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ mod platform;
77
pub use platform::{CENO_PLATFORM, Platform};
88

99
mod tracer;
10-
pub use tracer::{Change, MemOp, ReadOp, StepRecord, Tracer, WriteOp};
10+
pub use tracer::{Change, MemOp, NextAccessPair, ReadOp, StepRecord, Tracer, WriteOp};
1111

1212
mod vm_state;
1313
pub use vm_state::VMState;
@@ -44,4 +44,6 @@ pub mod utils;
4444

4545
pub mod test_utils;
4646

47+
mod chunked_vec;
48+
pub use chunked_vec::ChunkedVec as NextCycleAccess;
4749
pub mod host_utils;

ceno_emul/src/tracer.rs

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
use std::{
2-
collections::{BTreeMap, HashMap},
3-
fmt, mem,
4-
};
1+
use rustc_hash::FxHashMap;
2+
use smallvec::SmallVec;
3+
use std::{collections::BTreeMap, fmt, mem};
54

65
use ceno_rt::WORD_SIZE;
76

87
use crate::{
98
CENO_PLATFORM, InsnKind, Instruction, PC_STEP_SIZE, Platform,
109
addr::{ByteAddr, Cycle, RegIdx, Word, WordAddr},
10+
chunked_vec::ChunkedVec,
1111
encode_rv32,
1212
syscalls::{SyscallEffects, SyscallWitness},
1313
};
@@ -39,6 +39,8 @@ pub struct StepRecord {
3939
syscall: Option<SyscallWitness>,
4040
}
4141

42+
pub type NextAccessPair = SmallVec<[(WordAddr, Cycle); 1]>;
43+
4244
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
4345
pub struct MemOp<T> {
4446
/// Virtual Memory Address.
@@ -305,8 +307,8 @@ pub struct Tracer {
305307
// record each section max access address
306308
// (start_addr -> (start_addr, end_addr, min_access_addr, max_access_addr))
307309
mmio_min_max_access: Option<BTreeMap<WordAddr, (WordAddr, WordAddr, WordAddr, WordAddr)>>,
308-
latest_accesses: HashMap<WordAddr, Cycle>,
309-
next_accesses: HashMap<(WordAddr, Cycle), Cycle>,
310+
latest_accesses: FxHashMap<WordAddr, Cycle>,
311+
next_accesses: ChunkedVec<NextAccessPair>,
310312
}
311313

312314
impl Default for Tracer {
@@ -363,8 +365,8 @@ impl Tracer {
363365
cycle: Self::SUBCYCLES_PER_INSN,
364366
..StepRecord::default()
365367
},
366-
latest_accesses: HashMap::new(),
367-
next_accesses: HashMap::new(),
368+
latest_accesses: FxHashMap::default(),
369+
next_accesses: ChunkedVec::new(1 << 20),
368370
}
369371
}
370372

@@ -475,17 +477,19 @@ impl Tracer {
475477
pub fn track_access(&mut self, addr: WordAddr, subcycle: Cycle) -> Cycle {
476478
let cur_cycle = self.record.cycle + subcycle;
477479
let prev_cycle = self.latest_accesses.insert(addr, cur_cycle).unwrap_or(0);
478-
self.next_accesses.insert((addr, prev_cycle), cur_cycle);
480+
self.next_accesses
481+
.get_or_create(prev_cycle as usize)
482+
.push((addr, cur_cycle));
479483
prev_cycle
480484
}
481485

482486
/// Return all the addresses that were accessed and the cycle when they were last accessed.
483-
pub fn final_accesses(&self) -> &HashMap<WordAddr, Cycle> {
487+
pub fn final_accesses(&self) -> &FxHashMap<WordAddr, Cycle> {
484488
&self.latest_accesses
485489
}
486490

487491
/// Return, indexed by the cycle of an access, the `(address, cycle)` pairs giving the cycle at which each address is accessed next.
488-
pub fn next_accesses(self) -> HashMap<(WordAddr, Cycle), Cycle> {
492+
pub fn next_accesses(self) -> ChunkedVec<NextAccessPair> {
489493
self.next_accesses
490494
}
491495

ceno_emul/tests/test_vm_trace.rs

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
#![allow(clippy::unusual_byte_groupings)]
22
use anyhow::Result;
3-
use std::{
4-
collections::{BTreeMap, HashMap},
5-
sync::Arc,
6-
};
3+
use rustc_hash::FxHashMap;
4+
use std::{collections::BTreeMap, sync::Arc};
75

86
use ceno_emul::{
97
CENO_PLATFORM, Cycle, EmuContext, InsnKind, Instruction, Platform, Program, StepRecord, Tracer,
@@ -111,8 +109,8 @@ fn expected_ops_fibonacci_20() -> Vec<InsnKind> {
111109
}
112110

113111
/// Reconstruct the last access of each register.
114-
fn expected_final_accesses_fibonacci_20() -> HashMap<WordAddr, Cycle> {
115-
let mut accesses = HashMap::new();
112+
fn expected_final_accesses_fibonacci_20() -> FxHashMap<WordAddr, Cycle> {
113+
let mut accesses = FxHashMap::default();
116114
let x = |i| WordAddr::from(Platform::register_vma(i));
117115
const C: Cycle = Tracer::SUBCYCLES_PER_INSN;
118116

ceno_zkvm/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ witness.workspace = true
3434
itertools.workspace = true
3535
ndarray.workspace = true
3636
prettytable-rs.workspace = true
37+
rustc-hash.workspace = true
3738
strum.workspace = true
3839
strum_macros.workspace = true
3940
tracing.workspace = true

ceno_zkvm/src/e2e.rs

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ use crate::{
1616
tables::{MemFinalRecord, MemInitRecord, ProgramTableCircuit, ProgramTableConfig},
1717
};
1818
use ceno_emul::{
19-
Addr, ByteAddr, CENO_PLATFORM, Cycle, EmuContext, InsnKind, IterAddresses, Platform, Program,
20-
StepRecord, Tracer, VMState, WORD_SIZE, Word, WordAddr, host_utils::read_all_messages,
19+
Addr, ByteAddr, CENO_PLATFORM, Cycle, EmuContext, InsnKind, IterAddresses, NextAccessPair,
20+
NextCycleAccess, Platform, Program, StepRecord, Tracer, VMState, WORD_SIZE, Word, WordAddr,
21+
host_utils::read_all_messages,
2122
};
2223
use clap::ValueEnum;
2324
use either::Either;
@@ -147,7 +148,7 @@ pub struct ShardContext<'a> {
147148
shards: Shards,
148149
max_cycle: Cycle,
149150
// TODO optimize this map as it's super huge
150-
addr_future_accesses: Cow<'a, HashMap<(WordAddr, Cycle), Cycle>>,
151+
addr_future_accesses: Cow<'a, NextCycleAccess<NextAccessPair>>,
151152
read_thread_based_record_storage:
152153
Either<Vec<BTreeMap<WordAddr, RAMRecord>>, &'a mut BTreeMap<WordAddr, RAMRecord>>,
153154
write_thread_based_record_storage:
@@ -161,7 +162,7 @@ impl<'a> Default for ShardContext<'a> {
161162
Self {
162163
shards: Shards::default(),
163164
max_cycle: Cycle::default(),
164-
addr_future_accesses: Cow::Owned(HashMap::new()),
165+
addr_future_accesses: Cow::Owned(Default::default()),
165166
read_thread_based_record_storage: Either::Left(
166167
(0..max_threads)
167168
.into_par_iter()
@@ -183,7 +184,7 @@ impl<'a> ShardContext<'a> {
183184
pub fn new(
184185
shards: Shards,
185186
executed_instructions: usize,
186-
addr_future_accesses: HashMap<(WordAddr, Cycle), Cycle>,
187+
addr_future_accesses: NextCycleAccess<NextAccessPair>,
187188
) -> Self {
188189
// current strategy: at least each shard deal with one instruction
189190
let max_num_shards = shards.max_num_shards.min(executed_instructions);
@@ -329,8 +330,21 @@ impl<'a> ShardContext<'a> {
329330
}
330331

331332
// check write to external mem bus
332-
if let Some(future_touch_cycle) = self.addr_future_accesses.get(&(addr, cycle))
333-
&& *future_touch_cycle >= self.cur_shard_cycle_range.end as Cycle
333+
if let Some(future_touch_cycle) =
334+
self.addr_future_accesses
335+
.get(cycle as usize)
336+
.and_then(|res| {
337+
if res.len() == 1 {
338+
Some(res[0].1)
339+
} else if res.len() > 1 {
340+
res.iter()
341+
.find(|(m_addr, _)| *m_addr == addr)
342+
.map(|(_, cycle)| *cycle)
343+
} else {
344+
None
345+
}
346+
})
347+
&& future_touch_cycle >= self.cur_shard_cycle_range.end as Cycle
334348
&& self.is_current_shard_cycle(cycle)
335349
{
336350
let ram_record = self

0 commit comments

Comments
 (0)