Skip to content

Commit 706d1fb

Browse files
committed
btree: Implement faster binary search algorithm
This implements a binary search algorithm for B-Trees that reduces branching to the absolute minimum necessary for a binary search algorithm. It also enables the compiler to inline the comparator to ensure that the only slowdown when doing binary search is from waiting for memory accesses. Additionally, it instructs the compiler to unroll the loop, which gives an additional 40% improve with Clang and 8% improvement with GCC. Consumers must opt into using the faster algorithm. At present, only B-Trees used inside kernel code have been modified to use the faster algorithm. Micro-benchmarks suggest that this can improve binary search performance by up to 3.5 times when compiling with Clang 16 and up to 1.9 times when compiling with GCC 12.2. Signed-off-by: Richard Yao <[email protected]>
1 parent 79b2094 commit 706d1fb

File tree

10 files changed

+154
-26
lines changed

10 files changed

+154
-26
lines changed

cmd/zdb/zdb.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
326326
int err;
327327
struct sublivelist_verify *sv = args;
328328

329-
zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
329+
zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
330330
sizeof (sublivelist_verify_block_refcnt_t));
331331

332332
err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
@@ -390,7 +390,7 @@ sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
390390
{
391391
(void) args;
392392
sublivelist_verify_t sv;
393-
zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
393+
zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
394394
sizeof (sublivelist_verify_block_t));
395395
int err = sublivelist_verify_func(&sv, dle);
396396
zfs_btree_clear(&sv.sv_leftover);
@@ -682,7 +682,7 @@ livelist_metaslab_validate(spa_t *spa)
682682
(void) printf("Verifying deleted livelist entries\n");
683683

684684
sublivelist_verify_t sv;
685-
zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
685+
zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
686686
sizeof (sublivelist_verify_block_t));
687687
iterate_deleted_livelists(spa, livelist_verify, &sv);
688688

@@ -716,7 +716,7 @@ livelist_metaslab_validate(spa_t *spa)
716716
mv.mv_start = m->ms_start;
717717
mv.mv_end = m->ms_start + m->ms_size;
718718
zfs_btree_create(&mv.mv_livelist_allocs,
719-
livelist_block_compare,
719+
livelist_block_compare, NULL,
720720
sizeof (sublivelist_verify_block_t));
721721

722722
mv_populate_livelist_allocs(&mv, &sv);

include/sys/btree.h

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,13 @@ typedef struct zfs_btree_index {
105105
boolean_t bti_before;
106106
} zfs_btree_index_t;
107107

108-
typedef struct btree {
108+
typedef struct btree zfs_btree_t;
109+
typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t,
110+
const void *, zfs_btree_index_t *);
111+
112+
struct btree {
109113
int (*bt_compar) (const void *, const void *);
114+
bt_find_in_buf_f bt_find_in_buf;
110115
size_t bt_elem_size;
111116
size_t bt_leaf_size;
112117
uint32_t bt_leaf_cap;
@@ -115,7 +120,54 @@ typedef struct btree {
115120
uint64_t bt_num_nodes;
116121
zfs_btree_hdr_t *bt_root;
117122
zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading
118-
} zfs_btree_t;
123+
};
124+
125+
/*
126+
* Implementation of Shar's algorithm designed to accelerate binary search by
127+
* eliminating impossible to predict branches.
128+
*
129+
* For optimality, this should be used to generate the search function in the
130+
* same file as the comparator and the comparator should be marked
131+
* `__attribute__((always_inline) inline` so that the compiler will inline it.
132+
*
133+
* Arguments are:
134+
*
135+
* NAME - The function name for this instance of the search function. Use it
136+
* in a subsequent call to zfs_btree_create().
137+
* T - The element type stored inside the B-Tree.
138+
* COMP - A comparator to compare two nodes, it must return exactly: -1, 0,
139+
* or +1 -1 for <, 0 for ==, and +1 for >. For trivial comparisons,
140+
* TREE_CMP() from avl.h can be used in a boilerplate function.
141+
*/
142+
/* BEGIN CSTYLED */
143+
#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \
144+
_Pragma("GCC diagnostic push") \
145+
_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \
146+
static void * \
147+
NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \
148+
const void *value, zfs_btree_index_t *where) \
149+
{ \
150+
T *i = (T *)buf; \
151+
(void) tree; \
152+
_Pragma("GCC unroll 9") \
153+
while (nelems > 1) { \
154+
uint32_t half = nelems / 2; \
155+
nelems -= half; \
156+
i += (COMP(&i[half - 1], value) < 0) * half; \
157+
} \
158+
\
159+
int comp = COMP(i, value); \
160+
where->bti_offset = (i - (T *)buf) + (comp < 0); \
161+
where->bti_before = (comp != 0); \
162+
\
163+
if (comp == 0) { \
164+
return (i); \
165+
} \
166+
\
167+
return (NULL); \
168+
} \
169+
_Pragma("GCC diagnostic pop")
170+
/* END CSTYLED */
119171

120172
/*
121173
* Allocate and deallocate caches for btree nodes.
@@ -129,13 +181,19 @@ void zfs_btree_fini(void);
129181
* tree - the tree to be initialized
130182
* compar - function to compare two nodes, it must return exactly: -1, 0, or +1
131183
* -1 for <, 0 for ==, and +1 for >
184+
* find - optional function to accelerate searches inside B-Tree nodes
185+
* through Shar's algorithm and comparator inlining. Setting this to
186+
* NULL will use a generic function. The function should be created
187+
* using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar.
188+
* compar should be marked `__attribute__((always_inline)) inline` or
189+
* performance is unlikely to improve very much.
132190
* size - the value of sizeof(struct my_type)
133191
* lsize - custom leaf size
134192
*/
135193
void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
136-
size_t);
194+
bt_find_in_buf_f, size_t);
137195
void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *),
138-
size_t, size_t);
196+
bt_find_in_buf_f, size_t, size_t);
139197

140198
/*
141199
* Find a node with a matching value in the tree. Returns the matching node

module/Kbuild.in

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,20 @@ ifeq ($(CONFIG_KASAN),y)
3434
ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than=
3535
endif
3636

37+
# Generated binary search code is particularly bad with this optimization.
38+
# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
39+
# is not affected when unrolling is done.
40+
# Disable it until the following upstream issue is resolved:
41+
# https://github.com/llvm/llvm-project/issues/62790
42+
ifeq ($(CONFIG_X86),y)
43+
ifeq ($(CONFIG_CC_IS_CLANG),y)
44+
CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false
45+
CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false
46+
CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false
47+
CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false
48+
endif
49+
endif
50+
3751
ifneq ($(KBUILD_EXTMOD),)
3852
@CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include
3953
@CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@

module/Makefile.bsd

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,20 @@ beforeinstall:
400400

401401
.include <bsd.kmod.mk>
402402

403+
# Generated binary search code is particularly bad with this optimization.
404+
# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
405+
# is not affected when unrolling is done.
406+
# Disable it until the following upstream issue is resolved:
407+
# https://github.com/llvm/llvm-project/issues/62790
408+
.if ${CC} == "clang"
409+
.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "amd64"
410+
CFLAGS.dsl_scan.c= -mllvm -x86-cmov-converter=false
411+
CFLAGS.metaslab.c= -mllvm -x86-cmov-converter=false
412+
CFLAGS.range_tree.c= -mllvm -x86-cmov-converter=false
413+
CFLAGS.zap_micro.c= -mllvm -x86-cmov-converter=false
414+
.endif
415+
.endif
416+
403417
CFLAGS.sysctl_os.c= -include ../zfs_config.h
404418
CFLAGS.xxhash.c+= -include ${SYSDIR}/sys/_null.h
405419

module/zfs/btree.c

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -193,21 +193,29 @@ zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
193193

194194
void
195195
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
196-
size_t size)
196+
bt_find_in_buf_f bt_find_in_buf, size_t size)
197197
{
198-
zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE);
198+
zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
199+
BTREE_LEAF_SIZE);
199200
}
200201

202+
static void *
203+
zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
204+
const void *value, zfs_btree_index_t *where);
205+
201206
void
202207
zfs_btree_create_custom(zfs_btree_t *tree,
203208
int (*compar) (const void *, const void *),
209+
bt_find_in_buf_f bt_find_in_buf,
204210
size_t size, size_t lsize)
205211
{
206212
size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);
207213

208214
ASSERT3U(size, <=, esize / 2);
209215
memset(tree, 0, sizeof (*tree));
210216
tree->bt_compar = compar;
217+
tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
218+
zfs_btree_find_in_buf : bt_find_in_buf;
211219
tree->bt_elem_size = size;
212220
tree->bt_leaf_size = lsize;
213221
tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
@@ -303,7 +311,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
303311
* element in the last leaf, it's in the last leaf or
304312
* it's not in the tree.
305313
*/
306-
void *d = zfs_btree_find_in_buf(tree,
314+
void *d = tree->bt_find_in_buf(tree,
307315
last_leaf->btl_elems +
308316
last_leaf->btl_hdr.bth_first * size,
309317
last_leaf->btl_hdr.bth_count, value, &idx);
@@ -327,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
327335
for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
328336
node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
329337
ASSERT3P(node, !=, NULL);
330-
void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
338+
void *d = tree->bt_find_in_buf(tree, node->btc_elems,
331339
node->btc_hdr.bth_count, value, &idx);
332340
EQUIV(d != NULL, !idx.bti_before);
333341
if (d != NULL) {
@@ -347,7 +355,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
347355
*/
348356
zfs_btree_leaf_t *leaf = (depth == 0 ?
349357
(zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
350-
void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems +
358+
void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
351359
leaf->btl_hdr.bth_first * size,
352360
leaf->btl_hdr.bth_count, value, &idx);
353361

@@ -671,7 +679,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
671679
zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
672680
zfs_btree_index_t idx;
673681
ASSERT(zfs_btree_is_core(par_hdr));
674-
VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
682+
VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
675683
par_hdr->bth_count, buf, &idx), ==, NULL);
676684
ASSERT(idx.bti_before);
677685
uint32_t offset = idx.bti_offset;
@@ -897,7 +905,7 @@ zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
897905
}
898906
zfs_btree_index_t idx;
899907
zfs_btree_core_t *parent = hdr->bth_parent;
900-
VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
908+
VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
901909
parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
902910
ASSERT(idx.bti_before);
903911
ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);

module/zfs/dsl_scan.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4877,6 +4877,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
48774877
* with single operation. Plus it makes scrubs more sequential and reduces
48784878
* chances that minor extent change move it within the B-tree.
48794879
*/
4880+
__attribute__((always_inline)) inline
48804881
static int
48814882
ext_size_compare(const void *x, const void *y)
48824883
{
@@ -4885,13 +4886,17 @@ ext_size_compare(const void *x, const void *y)
48854886
return (TREE_CMP(*a, *b));
48864887
}
48874888

4889+
ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
4890+
ext_size_compare)
4891+
48884892
static void
48894893
ext_size_create(range_tree_t *rt, void *arg)
48904894
{
48914895
(void) rt;
48924896
zfs_btree_t *size_tree = arg;
48934897

4894-
zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t));
4898+
zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf,
4899+
sizeof (uint64_t));
48954900
}
48964901

48974902
static void

module/zfs/metaslab.c

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,6 +1342,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
13421342
* Comparison function for the private size-ordered tree using 32-bit
13431343
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
13441344
*/
1345+
__attribute__((always_inline)) inline
13451346
static int
13461347
metaslab_rangesize32_compare(const void *x1, const void *x2)
13471348
{
@@ -1352,16 +1353,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2)
13521353
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
13531354

13541355
int cmp = TREE_CMP(rs_size1, rs_size2);
1355-
if (likely(cmp))
1356-
return (cmp);
13571356

1358-
return (TREE_CMP(r1->rs_start, r2->rs_start));
1357+
return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
13591358
}
13601359

13611360
/*
13621361
* Comparison function for the private size-ordered tree using 64-bit
13631362
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
13641363
*/
1364+
__attribute__((always_inline)) inline
13651365
static int
13661366
metaslab_rangesize64_compare(const void *x1, const void *x2)
13671367
{
@@ -1372,11 +1372,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2)
13721372
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
13731373

13741374
int cmp = TREE_CMP(rs_size1, rs_size2);
1375-
if (likely(cmp))
1376-
return (cmp);
13771375

1378-
return (TREE_CMP(r1->rs_start, r2->rs_start));
1376+
return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
13791377
}
1378+
13801379
typedef struct metaslab_rt_arg {
13811380
zfs_btree_t *mra_bt;
13821381
uint32_t mra_floor_shift;
@@ -1412,6 +1411,13 @@ metaslab_size_tree_full_load(range_tree_t *rt)
14121411
range_tree_walk(rt, metaslab_size_sorted_add, &arg);
14131412
}
14141413

1414+
1415+
ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
1416+
range_seg32_t, metaslab_rangesize32_compare)
1417+
1418+
ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
1419+
range_seg64_t, metaslab_rangesize64_compare)
1420+
14151421
/*
14161422
* Create any block allocator specific components. The current allocators
14171423
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
@@ -1424,19 +1430,22 @@ metaslab_rt_create(range_tree_t *rt, void *arg)
14241430

14251431
size_t size;
14261432
int (*compare) (const void *, const void *);
1433+
bt_find_in_buf_f bt_find;
14271434
switch (rt->rt_type) {
14281435
case RANGE_SEG32:
14291436
size = sizeof (range_seg32_t);
14301437
compare = metaslab_rangesize32_compare;
1438+
bt_find = metaslab_rt_find_rangesize32_in_buf;
14311439
break;
14321440
case RANGE_SEG64:
14331441
size = sizeof (range_seg64_t);
14341442
compare = metaslab_rangesize64_compare;
1443+
bt_find = metaslab_rt_find_rangesize64_in_buf;
14351444
break;
14361445
default:
14371446
panic("Invalid range seg type %d", rt->rt_type);
14381447
}
1439-
zfs_btree_create(size_tree, compare, size);
1448+
zfs_btree_create(size_tree, compare, bt_find, size);
14401449
mrap->mra_floor_shift = metaslab_by_size_min_shift;
14411450
}
14421451

0 commit comments

Comments
 (0)