Commit 01cb1b9

btree: Implement faster binary search algorithm

This implements a binary search algorithm for B-Trees that reduces
branching to the absolute minimum necessary for a binary search
algorithm. It also enables the compiler to inline the comparator to
ensure that the only slowdown when doing binary search is from waiting
for memory accesses. Consumers must opt into using the faster algorithm.

Signed-off-by: Richard Yao <[email protected]>
1 parent 7381ddf commit 01cb1b9
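
For illustration, a minimal sketch of the search loop that the new ZFS_BTREE_FIND_IN_BUF_FUNC macro (added to include/sys/btree.h below) expands to, written out by hand for uint64_t keys. The name branchless_search is illustrative only and not part of the commit.

#include <stdint.h>

/*
 * Illustrative sketch, not part of the commit. Assumes buf is sorted
 * ascending and nelems >= 1. Each iteration halves the candidate range
 * by adding `half` when the last element of the lower half compares
 * below `value`, a data dependency instead of an unpredictable branch.
 */
static const uint64_t *
branchless_search(const uint64_t *buf, uint32_t nelems, uint64_t value)
{
	const uint64_t *i = buf;

	while (nelems > 1) {
		uint32_t half = nelems / 2;
		nelems -= half;
		i += (i[half - 1] < value) * half;
	}

	/* i now points at the match, or at the closest remaining element. */
	return (i);
}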

File tree: 7 files changed (+111 −15 lines)

include/sys/btree.h

Lines changed: 40 additions & 2 deletions
@@ -105,8 +105,12 @@ typedef struct zfs_btree_index {
 	boolean_t	bti_before;
 } zfs_btree_index_t;
 
-typedef struct btree {
+typedef struct btree zfs_btree_t;
+
+struct btree {
 	int (*bt_compar) (const void *, const void *);
+	void * (*bt_find_in_buf) (zfs_btree_t *, uint8_t *, uint32_t,
+	    const void *, zfs_btree_index_t *);
 	size_t bt_elem_size;
 	size_t bt_leaf_size;
 	uint32_t bt_leaf_cap;
@@ -115,7 +119,41 @@ typedef struct btree {
 	uint64_t bt_num_nodes;
 	zfs_btree_hdr_t *bt_root;
 	zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading
-} zfs_btree_t;
+};
+
+/*
+ * Implementation of Shar's algorithm designed to accelerate binary search by
+ * eliminating impossible to predict branches. For optimality, this should be
+ * used to generate the search function in the same file as the comparator and
+ * the comparator must be marked `__attribute__((always_inline)) inline` so
+ * that the compiler can inline the search function.
+ *
+ * Consumers must opt into using this function by setting ->bt_find_in_buf
+ * after creating the btree.
+ */
+#define	ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP)			\
+static void *								\
+NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,			\
+    const void *value, zfs_btree_index_t *where)			\
+{									\
+	T *i = (T *)buf;						\
+	(void) tree;							\
+	while (nelems > 1) {						\
+		uint32_t half = nelems / 2;				\
+		nelems -= half;						\
+		i += (COMP(&i[half - 1], value) < 0) * half;		\
+	}								\
+									\
+	int comp = COMP(i, value);					\
+	where->bti_offset = (i - (T *)buf) + (comp < 0);		\
+	where->bti_before = (comp != 0);				\
+									\
+	if (comp == 0) {						\
+		return (i);						\
+	}								\
+									\
+	return (NULL);							\
+}
 
 /*
  * Allocate and deallocate caches for btree nodes.
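
A hypothetical consumer (not from this commit) would wire the generated search function up roughly as follows. The names example_compare, example_find_in_buf, and example_tree_init are placeholders; TREE_CMP() is the existing ZFS comparison macro also used by the real consumers below.

/* Hypothetical consumer sketch, for illustration only. */
__attribute__((always_inline)) inline
static int
example_compare(const void *x1, const void *x2)
{
	const uint64_t *a = x1, *b = x2;

	return (TREE_CMP(*a, *b));
}

/* Generates example_find_in_buf() with the comparator inlined. */
ZFS_BTREE_FIND_IN_BUF_FUNC(example_find_in_buf, uint64_t, example_compare)

static void
example_tree_init(zfs_btree_t *t)
{
	zfs_btree_create(t, example_compare, sizeof (uint64_t));
	/* Opt into the faster generated search. */
	t->bt_find_in_buf = example_find_in_buf;
}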

module/Kbuild.in

Lines changed: 11 additions & 0 deletions
@@ -34,6 +34,17 @@ ifeq ($(CONFIG_KASAN),y)
 ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than=
 endif
 
+# Generated binary search code is particularly bad with this optimization.
+# Oddly, range_tree.c is not affected.
+# Disable it on the other two until the following upstream issue is resolved:
+# https://github.com/llvm/llvm-project/issues/62790
+ifeq ($(CONFIG_X86_64),y)
+ifeq ($(CONFIG_CC_IS_CLANG),y)
+CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false
+CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false
+endif
+endif
+
 ifneq ($(KBUILD_EXTMOD),)
 @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include
 @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@

module/Makefile.bsd

Lines changed: 9 additions & 0 deletions
@@ -400,6 +400,15 @@ beforeinstall:
 
 .include <bsd.kmod.mk>
 
+# Generated binary search code is particularly bad with this optimization.
+# Oddly, range_tree.c is not affected.
+# Disable it on the other two until the following upstream issue is resolved:
+# https://github.com/llvm/llvm-project/issues/62790
+.if ${CC} == "clang"
+CFLAGS.dsl_scan.c= -mllvm -x86-cmov-converter=false
+CFLAGS.metaslab.c= -mllvm -x86-cmov-converter=false
+.endif
+
 CFLAGS.sysctl_os.c= -include ../zfs_config.h
 CFLAGS.xxhash.c+= -include ${SYSDIR}/sys/_null.h

module/zfs/btree.c

Lines changed: 10 additions & 5 deletions
@@ -198,6 +198,10 @@ zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
 	zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE);
 }
 
+static void *
+zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
+    const void *value, zfs_btree_index_t *where);
+
 void
 zfs_btree_create_custom(zfs_btree_t *tree,
     int (*compar) (const void *, const void *),
@@ -208,6 +212,7 @@ zfs_btree_create_custom(zfs_btree_t *tree,
 	ASSERT3U(size, <=, esize / 2);
 	memset(tree, 0, sizeof (*tree));
 	tree->bt_compar = compar;
+	tree->bt_find_in_buf = zfs_btree_find_in_buf;
 	tree->bt_elem_size = size;
 	tree->bt_leaf_size = lsize;
 	tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
@@ -303,7 +308,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
 		 * element in the last leaf, it's in the last leaf or
 		 * it's not in the tree.
 		 */
-		void *d = zfs_btree_find_in_buf(tree,
+		void *d = tree->bt_find_in_buf(tree,
 		    last_leaf->btl_elems +
 		    last_leaf->btl_hdr.bth_first * size,
 		    last_leaf->btl_hdr.bth_count, value, &idx);
@@ -327,7 +332,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
 	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
 	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
 		ASSERT3P(node, !=, NULL);
-		void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
+		void *d = tree->bt_find_in_buf(tree, node->btc_elems,
 		    node->btc_hdr.bth_count, value, &idx);
 		EQUIV(d != NULL, !idx.bti_before);
 		if (d != NULL) {
@@ -347,7 +352,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
 	 */
 	zfs_btree_leaf_t *leaf = (depth == 0 ?
 	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
-	void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems +
+	void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
 	    leaf->btl_hdr.bth_first * size,
 	    leaf->btl_hdr.bth_count, value, &idx);
 
@@ -671,7 +676,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
 	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
 	zfs_btree_index_t idx;
 	ASSERT(zfs_btree_is_core(par_hdr));
-	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
 	    par_hdr->bth_count, buf, &idx), ==, NULL);
 	ASSERT(idx.bti_before);
 	uint32_t offset = idx.bti_offset;
@@ -897,7 +902,7 @@ zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
 	}
 	zfs_btree_index_t idx;
 	zfs_btree_core_t *parent = hdr->bth_parent;
-	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
 	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
 	ASSERT(idx.bti_before);
 	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);

module/zfs/dsl_scan.c

Lines changed: 5 additions & 0 deletions
@@ -4195,6 +4195,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
  * with single operation. Plus it makes scrubs more sequential and reduces
  * chances that minor extent change move it within the B-tree.
  */
+__attribute__((always_inline)) inline
 static int
 ext_size_compare(const void *x, const void *y)
 {
@@ -4203,13 +4204,17 @@ ext_size_compare(const void *x, const void *y)
 	return (TREE_CMP(*a, *b));
 }
 
+ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
+    ext_size_compare)
+
 static void
 ext_size_create(range_tree_t *rt, void *arg)
 {
 	(void) rt;
 	zfs_btree_t *size_tree = arg;
 
 	zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t));
+	size_tree->bt_find_in_buf = ext_size_find_in_buf;
 }
 
 static void

module/zfs/metaslab.c

Lines changed: 18 additions & 7 deletions
@@ -1342,6 +1342,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
  * Comparison function for the private size-ordered tree using 32-bit
  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
  */
+__attribute__((always_inline)) inline
 static int
 metaslab_rangesize32_compare(const void *x1, const void *x2)
 {
@@ -1352,16 +1353,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2)
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
 	int cmp = TREE_CMP(rs_size1, rs_size2);
-	if (likely(cmp))
-		return (cmp);
 
-	return (TREE_CMP(r1->rs_start, r2->rs_start));
+	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
 }
 
 /*
  * Comparison function for the private size-ordered tree using 64-bit
  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
  */
+__attribute__((always_inline)) inline
 static int
 metaslab_rangesize64_compare(const void *x1, const void *x2)
 {
@@ -1372,11 +1372,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2)
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
 	int cmp = TREE_CMP(rs_size1, rs_size2);
-	if (likely(cmp))
-		return (cmp);
 
-	return (TREE_CMP(r1->rs_start, r2->rs_start));
+	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
 }
+
 typedef struct metaslab_rt_arg {
 	zfs_btree_t *mra_bt;
 	uint32_t mra_floor_shift;
@@ -1412,6 +1411,13 @@ metaslab_size_tree_full_load(range_tree_t *rt)
 	range_tree_walk(rt, metaslab_size_sorted_add, &arg);
 }
 
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
+    range_seg32_t, metaslab_rangesize32_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
+    range_seg64_t, metaslab_rangesize64_compare)
+
 /*
  * Create any block allocator specific components. The current allocators
  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
@@ -1428,15 +1434,20 @@ metaslab_rt_create(range_tree_t *rt, void *arg)
 	case RANGE_SEG32:
 		size = sizeof (range_seg32_t);
 		compare = metaslab_rangesize32_compare;
+		zfs_btree_create(size_tree, compare, size);
+		size_tree->bt_find_in_buf =
+		    metaslab_rt_find_rangesize32_in_buf;
 		break;
 	case RANGE_SEG64:
 		size = sizeof (range_seg64_t);
 		compare = metaslab_rangesize64_compare;
+		zfs_btree_create(size_tree, compare, size);
+		size_tree->bt_find_in_buf =
+		    metaslab_rt_find_rangesize64_in_buf;
 		break;
 	default:
 		panic("Invalid range seg type %d", rt->rt_type);
 	}
-	zfs_btree_create(size_tree, compare, size);
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 }
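
One detail of the metaslab hunks above: the tie-break changes from an early return to an arithmetic expression so the always_inline comparators remain branch-free inside the generated search. A minimal sketch of why the two forms are equivalent (tie_break is an illustrative name, not code from the commit):

/*
 * Illustrative sketch only. TREE_CMP() yields -1, 0, or 1, so:
 *   cmp != 0  ->  !cmp == 0  ->  result is cmp (secondary key ignored)
 *   cmp == 0  ->  !cmp == 1  ->  result is the secondary comparison
 * which matches the previous `if (likely(cmp)) return (cmp);` version.
 */
static int
tie_break(int cmp, int secondary)
{
	return (cmp + !cmp * secondary);
}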

module/zfs/range_tree.c

Lines changed: 18 additions & 1 deletion
@@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
 	rt->rt_histogram[idx]--;
 }
 
+__attribute__((always_inline)) inline
 static int
 range_tree_seg32_compare(const void *x1, const void *x2)
 {
@@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2)
 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
 }
 
+__attribute__((always_inline)) inline
 static int
 range_tree_seg64_compare(const void *x1, const void *x2)
 {
@@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2)
 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
 }
 
+__attribute__((always_inline)) inline
 static int
 range_tree_seg_gap_compare(const void *x1, const void *x2)
 {
@@ -187,6 +190,15 @@ range_tree_seg_gap_compare(const void *x1, const void *x2)
 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
 }
 
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t,
+    range_tree_seg32_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t,
+    range_tree_seg64_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t,
+    range_tree_seg_gap_compare)
+
 range_tree_t *
 range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
     void *arg, uint64_t start, uint64_t shift, uint64_t gap)
@@ -201,19 +213,24 @@ range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
 	case RANGE_SEG32:
 		size = sizeof (range_seg32_t);
 		compare = range_tree_seg32_compare;
+		zfs_btree_create(&rt->rt_root, compare, size);
+		rt->rt_root.bt_find_in_buf = range_tree_seg32_find_in_buf;
 		break;
 	case RANGE_SEG64:
 		size = sizeof (range_seg64_t);
 		compare = range_tree_seg64_compare;
+		zfs_btree_create(&rt->rt_root, compare, size);
+		rt->rt_root.bt_find_in_buf = range_tree_seg64_find_in_buf;
 		break;
 	case RANGE_SEG_GAP:
 		size = sizeof (range_seg_gap_t);
 		compare = range_tree_seg_gap_compare;
+		zfs_btree_create(&rt->rt_root, compare, size);
+		rt->rt_root.bt_find_in_buf = range_tree_seg_gap_find_in_buf;
 		break;
 	default:
 		panic("Invalid range seg type %d", type);
 	}
-	zfs_btree_create(&rt->rt_root, compare, size);
 
 	rt->rt_ops = ops;
 	rt->rt_gap = gap;
