diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9804dee..3282edb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -16,7 +16,8 @@ jobs:
     strategy:
       matrix:
         toolchain: ["stable"]
-        features: ["", "--features serde"]
+        features: ["--features orx-parallel", "--features serde"]  # std targets: one CI job per entry
+        no_std_features: ["--features serde"]  # no_std targets (wasm32v1-none, no-std-check)
 
     steps:
     - uses: actions/checkout@v4
@@ -36,24 +37,24 @@ jobs:
       run: cargo install cargo-no-std-check
       
     - name: Build
-      run: cargo build --verbose ${{ matrix.features }}
+      run: cargo build --no-default-features --verbose ${{ matrix.features }}
     - name: Build-32bit
-      run: cargo build --verbose --target i686-unknown-linux-musl ${{ matrix.features }}
+      run: cargo build --no-default-features --verbose --target i686-unknown-linux-musl ${{ matrix.features }}
     - name: Build-wasm
-      run: cargo build --verbose --target wasm32v1-none ${{ matrix.features }}
+      run: cargo build --no-default-features --verbose --target wasm32v1-none ${{ matrix.no_std_features }}
 
     - name: Test
-      run: cargo test --verbose ${{ matrix.features }}
+      run: cargo test --no-default-features --verbose ${{ matrix.features }}
     - name: Test-32bit
-      run: cargo test --verbose --target i686-unknown-linux-musl ${{ matrix.features }}
+      run: cargo test --no-default-features --verbose --target i686-unknown-linux-musl ${{ matrix.features }}
     - name: Check-wasm
-      run: cargo check --verbose --target wasm32v1-none ${{ matrix.features }}
+      run: cargo check --no-default-features --verbose --target wasm32v1-none ${{ matrix.no_std_features }}
 
     - name: Clippy
-      run: cargo clippy ${{ matrix.features }} -- -D warnings --verbose
+      run: cargo clippy --no-default-features ${{ matrix.features }} -- -D warnings --verbose
 
     - name: Miri
-      run: cargo +nightly miri test --verbose ${{ matrix.features }}
+      run: cargo +nightly miri test --lib --bins --tests --no-default-features --verbose ${{ matrix.features }}
 
     - name: NoStd
-      run: cargo +nightly no-std-check ${{ matrix.features }}
+      run: cargo +nightly no-std-check --no-default-features ${{ matrix.no_std_features }}
diff --git a/Cargo.toml b/Cargo.toml
index 0ab6576..c376fd7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,9 +1,9 @@
 [package]
 name = "orx-tree"
-version = "1.5.0"
+version = "1.6.0"
 edition = "2024"
 authors = ["orxfun <orx.ugur.arikan@gmail.com>"]
-description = "A beautiful tree 🌳 with convenient and efficient growth, mutation and traversal features."
+description = "A beautiful tree 🌳 with convenient and efficient growth, mutation and traversal features with support for parallel computation."
 license = "MIT OR Apache-2.0"
 repository = "https://github.com/orxfun/orx-tree/"
 keywords = ["tree", "data-structures", "traversal", "traverse", "binarytree"]
@@ -13,19 +13,33 @@ categories = ["data-structures", "algorithms", "rust-patterns", "no-std"]
 orx-iterable = { version = "1.3.0", default-features = false }
 orx-pseudo-default = { version = "2.1.0", default-features = false }
 orx-pinned-vec = "3.16.0"
-orx-split-vec = "3.16.0"
-orx-selfref-col = "2.8.0"
 orx-self-or = "1.2.0"
 serde = { version = "1.0.219", optional = true, default-features = false }
-
+orx-split-vec = { version = "3.17.0", default-features = false }
+orx-selfref-col = { version = "2.9.0", default-features = false }
+orx-concurrent-iter = { version = "2.1.0", default-features = false }
+orx-parallel = { version = "2.1.0", default-features = false, optional = true }
 
 [dev-dependencies]
-test-case = { version = "3.3.1", default-features = false }
+clap = { version = "4.5.38", features = ["derive"] }
+criterion = "0.5.1"
+rayon = { version = "1.10.0" }
 serde_json = { version = "1.0.140", default-features = false, features = [
     "std",
 ] }
+test-case = { version = "3.3.1", default-features = false }
 
 [features]
-default = []
+default = ["orx-parallel"]
+# no-op, kept so that dependents that enable "std" keep building
 std = []
 serde = ["dep:serde"]
+
+# criterion provides its own `main`, so each bench target disables the libtest harness
+[[bench]]
+name = "parallelization_ref"
+harness = false
+
+[[bench]]
+name = "parallelization_owned"
+harness = false
diff --git a/README.md b/README.md
index b6662cd..f193915 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![orx-tree crate](https://img.shields.io/crates/d/orx-tree.svg)](https://crates.io/crates/orx-tree)
 [![orx-tree documentation](https://docs.rs/orx-tree/badge.svg)](https://docs.rs/orx-tree)
 
-A beautiful tree 🌳 with convenient and efficient growth, mutation and traversal features.
+A beautiful tree 🌳 with convenient and efficient growth, mutation and traversal features with support for parallel computation.
 
 ## Features
 
@@ -130,9 +130,18 @@ Alternatively, we can turn a mutable node into an [`into_walk`](https://docs.rs/
 * We can iterate over the removed nodes in the order of the generic traversal parameter and use the data however we need.
 * Or we can attach the removed subtree at a desired position of another tree by passing it to methods such as [`push_child_tree(subtree)`](https://docs.rs/orx-tree/latest/orx_tree/struct.NodeMut.html#method.push_child_tree).
 
-## Opt-in Features
+## Cargo Features
+
+* **orx-parallel**: Tree allows efficient parallel processing through [concurrent iterators](https://crates.io/crates/orx-concurrent-iter) and [parallel iterators](https://crates.io/crates/orx-parallel).
+  * This feature is added as default and requires **std**; hence, please use `cargo add orx-tree --no-default-features` for **no-std** use cases.
+  * Currently, parallel iteration over all nodes of the tree in arbitrary order is supported by methods [`par`](https://docs.rs/orx-tree/latest/orx_tree/struct.Tree.html#method.par) and [`into_par`](https://docs.rs/orx-tree/latest/orx_tree/struct.Tree.html#method.into_par).
+  * Parallelization of all walks or traversals in a particular order is under development.
+  * Parallelization examples can be found in [`demo_parallelization`](https://github.com/orxfun/orx-tree/blob/main/examples/demo_parallelization.rs) example.
+  * Importantly, note that the tree defines its own concurrent iterators, and hence, allows for efficient computation, which is often not possible with generic implementations such as rayon's `par_bridge`. In order to check the impact in performance, you may use the lightweight benchmark example [`bench_parallelization`](https://github.com/orxfun/orx-tree/blob/main/examples/bench_parallelization.rs):
+    * `Sequential computation over Tree : 18.96s`
+    * `Parallelized over Tree using orx-parallel : 6.02s`
+    * `Parallelized over Tree using rayon's par-bridge : 81.10s`
 
-* **std**: This is a no-std crate by default, and hence, "std" feature needs to be included when necessary.
 * **serde**: Tree implements `Serialize` and `Deserialize` traits; the "serde" feature needs to be added when required. It uses a linearized representation of the tree as a [`DepthFirstSequence`](https://docs.rs/orx-tree/latest/orx_tree/struct.DepthFirstSequence.html). You may find de-serialization examples in the corresponding [test file](https://github.com/orxfun/orx-tree/blob/main/tests/serde.rs).
 
 # Examples
diff --git a/benches/parallelization_owned.rs b/benches/parallelization_owned.rs
new file mode 100644
index 0000000..74aac32
--- /dev/null
+++ b/benches/parallelization_owned.rs
@@ -0,0 +1,135 @@
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+#[cfg(feature = "orx-parallel")]
+use orx_parallel::ParIter;
+use orx_tree::*;
+use rayon::iter::{ParallelBridge, ParallelIterator};
+
+/// Builds a tree with at least `n` nodes by repeatedly attaching 20 children to every leaf.
+fn build_tree(n: usize) -> DynTree<String> {
+    let mut tree = DynTree::new(0.to_string());
+    let mut traverser = Traversal.dfs().over_nodes();
+    while tree.len() < n {
+        let root = tree.root();
+        let leaves: Vec<_> = root.leaves_with(&mut traverser).map(|leaf| leaf.idx()).collect();
+        for leaf_idx in &leaves {
+            let offset = tree.len();
+            let mut leaf = tree.node_mut(leaf_idx);
+            for value in offset..offset + 20 {
+                leaf.push_child(value.to_string());
+            }
+        }
+    }
+    tree
+}
+
+/// Returns the `n`-th Fibonacci number with wrap-around on `i64` overflow; `0` for `n <= 0`.
+fn fibonacci(n: i64) -> i64 {
+    let (mut a, mut b) = (0i64, 1i64);
+    for _ in 0..n {
+        // wrapping_add: plain `+` overflows from n = 92 on, which panics in
+        // debug builds and wraps silently in release builds
+        (a, b) = (b, a.wrapping_add(b));
+    }
+    a
+}
+
+/// Sequential baseline: consumes the tree via `into_iter` and sums `fibonacci(value % 500)`.
+fn tree_into_iter(tree: DynTree<String>) -> i64 {
+    tree.into_iter()
+        .map(|text| fibonacci(text.parse::<usize>().unwrap() as i64 % 500))
+        .sum()
+}
+
+/// Consumes the tree through a `Dfs` `into_walk` from the root and sums `fibonacci(value % 500)`.
+fn tree_into_dfs(mut tree: DynTree<String>) -> i64 {
+    tree.root_mut()
+        .into_walk::<Dfs>()
+        .map(|text| fibonacci(text.parse::<usize>().unwrap() as i64 % 500))
+        .sum()
+}
+
+/// Consumes the tree through a `Bfs` `into_walk` from the root and sums `fibonacci(value % 500)`.
+fn tree_into_bfs(mut tree: DynTree<String>) -> i64 {
+    tree.root_mut()
+        .into_walk::<Bfs>()
+        .map(|text| fibonacci(text.parse::<usize>().unwrap() as i64 % 500))
+        .sum()
+}
+
+/// Parallel counterpart of `tree_into_iter`: the same sum computed through `into_par`.
+fn tree_into_par_x(tree: DynTree<String>) -> i64 {
+    tree.into_par()
+        .map(|text| fibonacci(text.parse::<usize>().unwrap() as i64 % 500))
+        .sum()
+}
+
+/// Same sum as `tree_into_iter`, with the sequential iterator bridged into rayon via `par_bridge`.
+fn tree_into_iter_rayon(tree: DynTree<String>) -> i64 {
+    tree.into_iter()
+        .par_bridge()
+        .map(|text| fibonacci(text.parse::<usize>().unwrap() as i64 % 500))
+        .sum()
+}
+
+/// Criterion group "parallelization_owned": consuming iteration over a tree of at least 1_024 * 64 nodes.
+fn bench(c: &mut Criterion) {
+    let treatments = vec![1_024 * 64];
+    let mut group = c.benchmark_group("parallelization_owned");
+
+    for n in &treatments {
+        let data = build_tree(*n);
+        // reference result from the sequential iterator; every variant below is asserted against it
+        let expected = tree_into_iter(data.clone());
+        // note: `data.clone()` runs inside `b.iter`, so cloning the tree is part of every measurement
+        group.bench_with_input(BenchmarkId::new("Tree::into_iter()", n), n, |b, _| {
+            let result = tree_into_iter(data.clone());
+            assert_eq!(result, expected);
+            b.iter(|| tree_into_iter(data.clone()))
+        });
+
+        group.bench_with_input(
+            BenchmarkId::new("Tree::root().into_walk::<Dfs>()", n),
+            n,
+            |b, _| {
+                let result = tree_into_dfs(data.clone());
+                assert_eq!(result, expected);
+                b.iter(|| tree_into_dfs(data.clone()))
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Tree::root().into_walk::<Bfs>()", n),
+            n,
+            |b, _| {
+                let result = tree_into_bfs(data.clone());
+                assert_eq!(result, expected);
+                b.iter(|| tree_into_bfs(data.clone()))
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Tree::into_par_x() - orx-parallel", n),
+            n,
+            |b, _| {
+                let result = tree_into_par_x(data.clone());
+                assert_eq!(result, expected);
+                b.iter(|| tree_into_par_x(data.clone()))
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Tree::into_iter().par_bridge() - rayon", n),
+            n,
+            |b, _| {
+                let result = tree_into_iter_rayon(data.clone());
+                assert_eq!(result, expected);
+                b.iter(|| tree_into_iter_rayon(data.clone()))
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench);
+criterion_main!(benches);
diff --git a/benches/parallelization_ref.rs b/benches/parallelization_ref.rs
new file mode 100644
index 0000000..5a8cf0c
--- /dev/null
+++ b/benches/parallelization_ref.rs
@@ -0,0 +1,135 @@
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+#[cfg(feature = "orx-parallel")]
+use orx_parallel::ParIter;
+use orx_tree::*;
+use rayon::iter::{ParallelBridge, ParallelIterator};
+
+/// Builds a tree with at least `n` nodes by repeatedly attaching 20 children to every leaf.
+fn build_tree(n: usize) -> DynTree<String> {
+    let mut tree = DynTree::new(0.to_string());
+    let mut traverser = Traversal.dfs().over_nodes();
+    while tree.len() < n {
+        let root = tree.root();
+        let leaves: Vec<_> = root.leaves_with(&mut traverser).map(|leaf| leaf.idx()).collect();
+        for leaf_idx in &leaves {
+            let offset = tree.len();
+            let mut leaf = tree.node_mut(leaf_idx);
+            for value in offset..offset + 20 {
+                leaf.push_child(value.to_string());
+            }
+        }
+    }
+    tree
+}
+
+/// Returns the `n`-th Fibonacci number with wrap-around on `i64` overflow; `0` for `n <= 0`.
+fn fibonacci(n: i64) -> i64 {
+    let (mut a, mut b) = (0i64, 1i64);
+    for _ in 0..n {
+        // wrapping_add: plain `+` overflows from n = 92 on, which panics in
+        // debug builds and wraps silently in release builds
+        (a, b) = (b, a.wrapping_add(b));
+    }
+    a
+}
+
+/// Sequential baseline: sums `fibonacci(value % 500)` over the items of `tree.iter()`.
+fn tree_iter(tree: &DynTree<String>) -> i64 {
+    tree.iter()
+        .map(|text| fibonacci(text.parse::<usize>().unwrap() as i64 % 500))
+        .sum()
+}
+
+/// Sums `fibonacci(value % 500)` over a `Dfs` walk starting at the root.
+fn tree_dfs(tree: &DynTree<String>) -> i64 {
+    tree.root()
+        .walk::<Dfs>()
+        .map(|text| fibonacci(text.parse::<usize>().unwrap() as i64 % 500))
+        .sum()
+}
+
+/// Sums `fibonacci(value % 500)` over a `Bfs` walk starting at the root.
+fn tree_bfs(tree: &DynTree<String>) -> i64 {
+    tree.root()
+        .walk::<Bfs>()
+        .map(|text| fibonacci(text.parse::<usize>().unwrap() as i64 % 500))
+        .sum()
+}
+
+/// Parallel counterpart of `tree_iter`: the same sum computed through `par`.
+fn tree_par_x(tree: &DynTree<String>) -> i64 {
+    tree.par()
+        .map(|text| fibonacci(text.parse::<usize>().unwrap() as i64 % 500))
+        .sum()
+}
+
+/// Same sum as `tree_iter`, with the sequential iterator bridged into rayon via `par_bridge`.
+fn tree_iter_rayon(tree: &DynTree<String>) -> i64 {
+    tree.iter()
+        .par_bridge()
+        .map(|text| fibonacci(text.parse::<usize>().unwrap() as i64 % 500))
+        .sum()
+}
+
+/// Criterion group "parallelization_ref": by-reference iteration over a tree of at least 1_024 * 64 nodes.
+fn bench(c: &mut Criterion) {
+    let treatments = vec![1_024 * 64];
+    let mut group = c.benchmark_group("parallelization_ref");
+
+    for n in &treatments {
+        let data = build_tree(*n);
+        // reference result from the sequential iterator; every variant below is asserted against it
+        let expected = tree_iter(&data);
+        // each closure checks its variant once before handing it to `b.iter` for measurement
+        group.bench_with_input(BenchmarkId::new("Tree::iter()", n), n, |b, _| {
+            let result = tree_iter(&data);
+            assert_eq!(result, expected);
+            b.iter(|| tree_iter(&data))
+        });
+
+        group.bench_with_input(
+            BenchmarkId::new("Tree::root().walk::<Dfs>()", n),
+            n,
+            |b, _| {
+                let result = tree_dfs(&data);
+                assert_eq!(result, expected);
+                b.iter(|| tree_dfs(&data))
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Tree::root().walk::<Bfs>()", n),
+            n,
+            |b, _| {
+                let result = tree_bfs(&data);
+                assert_eq!(result, expected);
+                b.iter(|| tree_bfs(&data))
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Tree::par_x() - orx-parallel", n),
+            n,
+            |b, _| {
+                let result = tree_par_x(&data);
+                assert_eq!(result, expected);
+                b.iter(|| tree_par_x(&data))
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Tree::iter().par_bridge() - rayon", n),
+            n,
+            |b, _| {
+                let result = tree_iter_rayon(&data);
+                assert_eq!(result, expected);
+                b.iter(|| tree_iter_rayon(&data))
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench);
+criterion_main!(benches);
diff --git a/examples/bench_parallelization.rs b/examples/bench_parallelization.rs
new file mode 100644
index 0000000..a7f9fe7
--- /dev/null
+++ b/examples/bench_parallelization.rs
@@ -0,0 +1,116 @@
+// cargo run --release --features orx-parallel --example bench_parallelization
+// cargo run --release --features orx-parallel --example bench_parallelization -- --help
+// cargo run --release --features orx-parallel --example bench_parallelization -- --len 50000 --num-repetitions 20
+
+mod utils;
+
+use clap::Parser;
+use orx_tree::*;
+use rayon::iter::{ParallelBridge, ParallelIterator};
+use utils::timed_collect_all;
+
+#[derive(Parser, Debug)]
+struct Args {
+    /// Number of items in the input iterator.
+    #[arg(long, default_value_t = 1_000_000)]
+    len: usize, // NOTE(review): parsed but never read by `main`, which always calls `build_tree(10)`
+    /// Number of repetitions to measure time; total time will be reported.
+    #[arg(long, default_value_t = 100)]
+    num_repetitions: usize,
+}
+
+/// Returns the `n`-th Fibonacci number with wrap-around on `usize` overflow.
+fn fibonacci(n: usize) -> usize {
+    let (mut a, mut b) = (0usize, 1usize);
+    for _ in 0..n {
+        // wrapping_add: callers pass n up to 999, far past the point where plain
+        // `+` overflows (panic in debug builds, silent wrap in release builds)
+        (a, b) = (b, a.wrapping_add(b));
+    }
+    a
+}
+
+/// Builds a 4-ary tree by giving every leaf 4 children, `total_depth` times; each node stores its insertion rank.
+fn build_tree(total_depth: usize) -> DaryTree<4, usize> {
+    let mut tree = DaryTree::new(0);
+    let mut traverser = Traversal.dfs().over_nodes();
+    for _level in 0..total_depth {
+        let root = tree.root();
+        let leaf_indices: Vec<_> = root.leaves_with(&mut traverser).map(|leaf| leaf.idx()).collect();
+        for leaf_idx in &leaf_indices {
+            let offset = tree.len();
+            let mut leaf = tree.node_mut(leaf_idx);
+            for value in offset..offset + 4 {
+                leaf.push_child(value);
+            }
+        }
+    }
+
+    tree
+}
+
+/// Times the same filter/map pipeline over `build_tree(10)`: sequentially, with orx-parallel
+/// (when the feature is enabled) and with rayon's `par_bridge`; every output is checked first.
+fn main() {
+    let args = Args::parse();
+
+    let mut expected_output = {
+        let tree = build_tree(10);
+        tree.iter()
+            .filter(|x| *x % 3 != 0)
+            .map(|x| x + fibonacci(x % 1000))
+            .filter_map(|x| (x % 2 == 0).then(|| x.to_string()))
+            .collect::<Vec<_>>()
+    };
+    expected_output.sort();
+
+    let computations: Vec<(&str, Box<dyn Fn() -> Vec<String>>)> = vec![
+        // the sequential baseline needs no feature, so it is always measured
+        (
+            "Sequential computation over Tree",
+            Box::new(move || {
+                let tree = build_tree(10);
+                tree.iter()
+                    .filter(|x| *x % 3 != 0)
+                    .map(|x| x + fibonacci(x % 1000))
+                    .filter_map(|x| (x % 2 == 0).then(|| x.to_string()))
+                    .collect::<Vec<_>>()
+            }),
+        ),
+        #[cfg(feature = "orx-parallel")]
+        (
+            "Parallelized over Tree using orx-parallel",
+            Box::new(move || {
+                let tree = build_tree(10);
+
+                tree.par() // replace iter (into_iter) with par (into_par) to parallelize !
+                    .filter(|x| *x % 3 != 0)
+                    .map(|x| x + fibonacci(x % 1000))
+                    .filter(|x| x % 2 == 0)
+                    .map(|x| x.to_string())
+                    .collect::<Vec<_>>()
+            }),
+        ),
+        (
+            "Parallelized over Tree using rayon's par-bridge",
+            Box::new(move || {
+                let tree = build_tree(10);
+
+                tree.iter()
+                    .par_bridge()
+                    .filter(|x| *x % 3 != 0)
+                    .map(|x| x + fibonacci(x % 1000))
+                    .filter(|x| x % 2 == 0)
+                    .map(|x| x.to_string())
+                    .collect::<Vec<_>>()
+            }),
+        ),
+    ];
+
+    timed_collect_all(
+        "benchmark_parallelization",
+        args.num_repetitions,
+        &expected_output,
+        &computations,
+    );
+}
diff --git a/examples/demo_parallelization.rs b/examples/demo_parallelization.rs
new file mode 100644
index 0000000..e537424
--- /dev/null
+++ b/examples/demo_parallelization.rs
@@ -0,0 +1,66 @@
+// cargo run --release --features orx-parallel --example demo_parallelization
+
+use orx_tree::*;
+
+/// Builds a tree by giving every leaf `num_children` children, `total_depth` times; values are insertion ranks as text.
+fn build_tree(total_depth: usize, num_children: usize) -> DynTree<String> {
+    let mut tree = DynTree::new(0.to_string());
+    let mut traverser = Traversal.dfs().over_nodes();
+    for _level in 0..total_depth {
+        let root = tree.root();
+        let leaf_indices: Vec<_> = root.leaves_with(&mut traverser).map(|leaf| leaf.idx()).collect();
+        for leaf_idx in &leaf_indices {
+            let offset = tree.len();
+            let mut leaf = tree.node_mut(leaf_idx);
+            for value in offset..offset + num_children {
+                leaf.push_child(value.to_string());
+            }
+        }
+    }
+
+    tree
+}
+
+/// Counts the characters of all values not starting with '1': sequentially, then (with orx-parallel) in parallel.
+fn main() {
+    let tree = build_tree(10, 4);
+    let expected: usize = 5_234_569;
+
+    // sequential reference computation
+    let sequential: usize = tree
+        .iter()
+        .filter(|text| !text.starts_with('1'))
+        .map(|text| text.len())
+        .sum();
+    assert_eq!(sequential, expected);
+
+    #[cfg(feature = "orx-parallel")]
+    {
+        // same chain in parallel with default settings: `par()` takes the place of `iter()`
+
+        let parallel: usize = tree
+            .par()
+            .filter(|text| !text.starts_with('1'))
+            .map(|text| text.len())
+            .sum();
+        assert_eq!(parallel, expected);
+
+        // same chain with an explicit thread count and chunk size
+        let configured: usize = tree
+            .par()
+            .filter(|text| !text.starts_with('1'))
+            .num_threads(2)
+            .chunk_size(64)
+            .map(|text| text.len())
+            .sum();
+        assert_eq!(configured, expected);
+
+        // owning variant: `into_par` takes the place of `into_iter` and consumes the tree
+        let consumed: usize = tree
+            .into_par()
+            .filter(|text| !text.starts_with('1'))
+            .map(|text| text.len())
+            .sum();
+        assert_eq!(consumed, expected);
+    }
+}
diff --git a/examples/utils/benchmark_utils.rs b/examples/utils/benchmark_utils.rs
new file mode 100644
index 0000000..7d69790
--- /dev/null
+++ b/examples/utils/benchmark_utils.rs
@@ -0,0 +1,96 @@
+#![allow(dead_code)]
+
+use std::{
+    fmt::Debug,
+    hint::black_box,
+    time::{Duration, SystemTime},
+};
+
+// reduce
+
+/// Runs `fun` `num_repetitions` times (after 10 warm-up runs) and returns the total elapsed time;
+/// when `expected_output` is `Some`, every measured result is asserted to be equal to it.
+fn timed_reduce<F, O>(num_repetitions: usize, expected_output: &Option<O>, fun: F) -> Duration
+where
+    F: Fn() -> O,
+    O: PartialEq + Debug,
+{
+    if let Some(expected_output) = expected_output.as_ref() {
+        let result = fun();
+        assert_eq!(&result, expected_output);
+    }
+    // warm up
+    for _ in 0..10 {
+        let _ = black_box(fun());
+    }
+
+    // measurement: Instant is monotonic; SystemTime::elapsed fails if the clock is set back
+    let start = std::time::Instant::now();
+    for _ in 0..num_repetitions {
+        let result = black_box(fun());
+        if let Some(expected_output) = expected_output.as_ref() {
+            assert_eq!(&result, expected_output);
+        }
+    }
+    start.elapsed()
+}
+
+/// Prints a banner for `benchmark_name`, then one `name : duration` line per computation timed by `timed_reduce`.
+pub fn timed_reduce_all<O>(
+    benchmark_name: &str,
+    num_repetitions: usize,
+    expected_output: Option<O>,
+    computations: &[(&str, Box<dyn Fn() -> O>)],
+) where
+    O: PartialEq + Debug + Clone,
+{
+    println!("\n{} {} {}", "#".repeat(10), benchmark_name, "#".repeat(10));
+    computations.iter().for_each(|(name, fun)| {
+        println!("{:>10} : {:?}", name, timed_reduce(num_repetitions, &expected_output, fun));
+    });
+    println!("{}\n", "#".repeat(10 + 10 + 2 + benchmark_name.len()));
+}
+
+// collect
+
+/// Checks that `fun` yields `expected_output` (compared after sorting), warms up with 10 runs,
+/// then returns the total time of `num_repetitions` further runs.
+fn timed_collect<F, Out, O>(num_repetitions: usize, expected_output: &[O], fun: F) -> Duration
+where
+    F: Fn() -> Out,
+    Out: IntoIterator<Item = O>,
+    O: PartialEq + Ord + Debug,
+{
+    let mut result = fun().into_iter().collect::<Vec<_>>();
+    result.sort();
+    assert_eq!(result, expected_output);
+
+    // warm up
+    for _ in 0..10 {
+        let _ = black_box(fun());
+    }
+
+    // measurement: Instant is monotonic; SystemTime::elapsed fails if the clock is set back
+    let start = std::time::Instant::now();
+    for _ in 0..num_repetitions {
+        let _ = black_box(fun());
+    }
+    start.elapsed()
+}
+
+/// Prints a banner for `benchmark_name`, then one `name : duration` line per computation timed by `timed_collect`.
+pub fn timed_collect_all<Out, O>(
+    benchmark_name: &str,
+    num_repetitions: usize,
+    expected_output: &[O],
+    computations: &[(&str, Box<dyn Fn() -> Out>)],
+) where
+    Out: IntoIterator<Item = O>,
+    O: PartialEq + Ord + Debug,
+{
+    println!("\n{} {} {}", "#".repeat(10), benchmark_name, "#".repeat(10));
+    computations.iter().for_each(|(name, fun)| {
+        println!("{:>10} : {:?}", name, timed_collect(num_repetitions, expected_output, fun));
+    });
+    println!("{}\n", "#".repeat(10 + 10 + 2 + benchmark_name.len()));
+}
diff --git a/examples/utils/mod.rs b/examples/utils/mod.rs
new file mode 100644
index 0000000..c2042e0
--- /dev/null
+++ b/examples/utils/mod.rs
@@ -0,0 +1,5 @@
+#![allow(unused_imports)]
+
+mod benchmark_utils;
+
+pub use benchmark_utils::*;
diff --git a/src/lib.rs b/src/lib.rs
index 7dd5613..632a549 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -12,7 +12,7 @@
 )]
 #![no_std]
 
-#[cfg(any(test, feature = "std"))]
+#[cfg(test)]
 extern crate std;
 
 extern crate alloc;
@@ -62,3 +62,6 @@ pub use orx_selfref_col::NodeIdxError;
 
 // RE-IMPORT
 pub use orx_iterable::{Collection, CollectionMut};
+
+#[cfg(feature = "orx-parallel")]
+pub use orx_parallel::*;
diff --git a/src/tree.rs b/src/tree.rs
index 6987f10..da613f7 100644
--- a/src/tree.rs
+++ b/src/tree.rs
@@ -948,6 +948,165 @@ where
         }
     }
 
+    // parallelization
+
+    /// Creates a parallel iterator over references to the elements of the tree in **arbitrary order**.
+    ///
+    /// Note that `par` is parallel counterpart of `iter`.
+    ///
+    /// In order to iterate over data in a particular order, please use traversers with [`walk`], [`walk_mut`]
+    /// or [`into_walk`] methods.
+    ///
+    /// Please see [`ParIter`] for details of the parallel computation.
+    /// In brief, computation is defined as chain of iterator transformations and parallelization
+    /// is handled by the underlying parallel executor.
+    ///
+    /// Requires **orx-parallel** feature.
+    ///
+    /// [`ParIter`]: orx_parallel::ParIter
+    /// [`walk`]: crate::NodeRef::walk
+    /// [`walk_mut`]: crate::NodeMut::walk_mut
+    /// [`into_walk`]: crate::NodeMut::into_walk
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use orx_tree::*;
+    ///
+    /// let num_children = 4;
+    /// let total_depth = 10;
+    ///
+    /// let mut tree = DynTree::new(0.to_string());
+    /// let mut dfs = Traversal.dfs().over_nodes();
+    ///
+    /// for _ in 0..total_depth {
+    ///     let root = tree.root();
+    ///     let leaves: Vec<_> = root.leaves_with(&mut dfs).map(|x| x.idx()).collect();
+    ///     for idx in leaves {
+    ///         let count = tree.len();
+    ///         let mut node = tree.node_mut(&idx);
+    ///         for j in 0..num_children {
+    ///             node.push_child((count + j).to_string());
+    ///         }
+    ///     }
+    /// }
+    ///
+    /// let seq_result: usize = tree
+    ///     .iter()
+    ///     .filter_map(|x| x.parse::<usize>().ok())
+    ///     .filter(|x| x % 2 == 0)
+    ///     .sum();
+    ///
+    /// // compute in parallel with default configuration
+    /// let par_result = tree
+    ///     .par() // replace iter() with par()
+    ///     .filter_map(|x| x.parse::<usize>().ok())
+    ///     .filter(|x| x % 2 == 0)
+    ///     .sum();
+    /// assert_eq!(seq_result, par_result);
+    ///
+    /// // configure parallel computation
+    /// let par_result = tree
+    ///     .par()
+    ///     .num_threads(4)
+    ///     .chunk_size(64)
+    ///     .filter_map(|x| x.parse::<usize>().ok())
+    ///     .filter(|x| x % 2 == 0)
+    ///     .sum();
+    /// assert_eq!(seq_result, par_result);
+    /// ```
+    #[cfg(feature = "orx-parallel")]
+    pub fn par(&self) -> impl orx_parallel::ParIter<Item = &V::Item>
+    where
+        V::Item: Send + Sync,
+        for<'a> &'a <P as PinnedStorage>::PinnedVec<V>:
+            orx_concurrent_iter::IntoConcurrentIter<Item = &'a crate::aliases::N<V>>,
+    {
+        use orx_parallel::*;
+
+        // iterate the node storage directly (hence the arbitrary order); keep nodes whose `data()` is `Some`
+        self.0.nodes().par().filter_map(|node| node.data())
+    }
+
+    /// Consumes the tree and creates a parallel iterator over owned elements of the tree in **arbitrary order**.
+    ///
+    /// Note that `into_par` is parallel counterpart of `into_iter`.
+    ///
+    /// In order to iterate over data in a particular order, please use traversers with [`walk`], [`walk_mut`]
+    /// or [`into_walk`] methods.
+    ///
+    /// Please see [`ParIter`] for details of the parallel computation.
+    /// In brief, computation is defined as chain of iterator transformations and parallelization
+    /// is handled by the underlying parallel executor.
+    ///
+    /// Requires **orx-parallel** feature.
+    ///
+    /// [`ParIter`]: orx_parallel::ParIter
+    /// [`walk`]: crate::NodeRef::walk
+    /// [`walk_mut`]: crate::NodeMut::walk_mut
+    /// [`into_walk`]: crate::NodeMut::into_walk
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use orx_tree::*;
+    ///
+    /// let num_children = 4;
+    /// let total_depth = 10;
+    ///
+    /// let mut tree = DynTree::new(0.to_string());
+    /// let mut dfs = Traversal.dfs().over_nodes();
+    ///
+    /// for _ in 0..total_depth {
+    ///     let root = tree.root();
+    ///     let leaves: Vec<_> = root.leaves_with(&mut dfs).map(|x| x.idx()).collect();
+    ///     for idx in leaves {
+    ///         let count = tree.len();
+    ///         let mut node = tree.node_mut(&idx);
+    ///         for j in 0..num_children {
+    ///             node.push_child((count + j).to_string());
+    ///         }
+    ///     }
+    /// }
+    ///
+    /// let seq_result: usize = tree
+    ///     .clone()
+    ///     .into_iter()
+    ///     .filter_map(|x| x.parse::<usize>().ok())
+    ///     .filter(|x| x % 2 == 0)
+    ///     .sum();
+    ///
+    /// // compute in parallel with default configuration
+    /// let par_result = tree
+    ///     .clone()
+    ///     .into_par() // replace into_iter() with into_par()
+    ///     .filter_map(|x| x.parse::<usize>().ok())
+    ///     .filter(|x| x % 2 == 0)
+    ///     .sum();
+    /// assert_eq!(seq_result, par_result);
+    ///
+    /// // configure parallel computation
+    /// let par_result = tree
+    ///     .into_par()
+    ///     .num_threads(4)
+    ///     .chunk_size(64)
+    ///     .filter_map(|x| x.parse::<usize>().ok())
+    ///     .filter(|x| x % 2 == 0)
+    ///     .sum();
+    /// assert_eq!(seq_result, par_result);
+    /// ```
+    #[cfg(feature = "orx-parallel")]
+    pub fn into_par(self) -> impl orx_parallel::ParIter<Item = V::Item>
+    where
+        V::Item: Send + Sync + Clone,
+        <P as PinnedStorage>::PinnedVec<V>:
+            orx_concurrent_iter::IntoConcurrentIter<Item = crate::aliases::N<V>>,
+    {
+        use orx_parallel::*;
+        let (nodes, ..) = self.0.into_inner().0.into_inner(); // only the node storage is consumed
+        nodes.into_par().filter_map(|node| node.into_data())
+    }
+
     // helpers
 
     pub(crate) fn new_with_root(root_value: V::Item) -> Self