Skip to content

Commit 0fe0b07

Browse files
committed
dnode_next_offset: backtrack if lower level does not match
This changes the basic search algorithm from a single search up and down the tree to a full depth-first traversal to handle conditions where the tree matches at a higher level but not a lower level. Normally higher level blocks always point to matching blocks, but there are cases where this does not happen: 1. Racing block pointer updates from dbuf_write_ready. Before f664f1e (#8946), both dbuf_write_ready and dnode_next_offset held dn_struct_rwlock which protected against pointer writes from concurrent syncs. This no longer applies, so sync context can, e.g., clear or fill all L1->L0 BPs before the L2->L1 BP and higher BPs are updated. dnode_free_range in particular can reach this case and skip over L1 blocks that need to be dirtied. Later, sync will panic in free_children when trying to clear a non-dirty indirect block. This case was found with ztest. 2. txg > 0, non-hole case. This is #11196. Freeing blocks/dnodes breaks the assumption that a match at a higher level implies a match at a lower level when filtering txg > 0. Whenever some but not all L0 blocks are freed, the parent L1 block is rewritten. Its updated L2->L1 BP reflects a newer birth txg. Later when searching by txg, if the L1 block matches since the txg is newer, it is possible that none of the remaining L1->L0 BPs match if none have been updated. The same behavior is possible with dnode search at L0. This is reachable from dsl_destroy_head for synchronous freeing. When this happens, open context fails to free objects, leaving sync context stuck freeing potentially many objects. This is also reachable from traverse_pool for extreme rewind, where it is theoretically possible that datasets not dirtied after txg are skipped if the MOS has high enough indirection to trigger this case. In both of these cases, without backtracking the search ends prematurely, as an ESRCH result implies no more matches in the entire object. Signed-off-by: Robert Evans <[email protected]>
1 parent c28f94f commit 0fe0b07

File tree

1 file changed

+101
-40
lines changed

1 file changed

+101
-40
lines changed

module/zfs/dnode.c

Lines changed: 101 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2498,13 +2498,18 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
24982498
* If we don't find what we are looking for in the block, we return ESRCH.
24992499
* Otherwise, return with *offset pointing to the beginning (if searching
25002500
* forwards) or end (if searching backwards) of the range covered by the
2501-
* block pointer we matched on (or dnode).
2501+
* block pointer we matched on (or dnode) but never less (or greater) than
2502+
* the starting offset.
25022503
*
2503-
* The basic search algorithm used below by dnode_next_offset() is to
2504-
* use this function to search up the block tree (widen the search) until
2505-
* we find something (i.e., we don't return ESRCH) and then search back
2506-
* down the tree (narrow the search) until we reach our original search
2507-
* level.
2504+
* For ESRCH, *offset is set to the first byte offset after (or before) the
2505+
* searched block unless the block is a hole or the resulting offset would
2506+
* underflow or overflow (in both cases the starting *offset is unchanged).
2507+
*
2508+
* The basic search algorithm used below by dnode_next_offset() uses this
2509+
* function to perform a block-order tree traversal. We search up the block
2510+
* tree (widen the search) until we find something (i.e., we don't return
2511+
* ESRCH) and then search back down the tree (narrow the search) until we
2512+
* reach our original search level or backtrack up because nothing matches.
25082513
*/
25092514
static int
25102515
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
@@ -2519,6 +2524,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
25192524
int i, inc, error, span;
25202525

25212526
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2527+
ASSERT3U(dn->dn_nlevels, >, 0);
25222528

25232529
hole = ((flags & DNODE_FIND_HOLE) != 0);
25242530
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
@@ -2569,24 +2575,29 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
25692575

25702576
ASSERT(dn->dn_type == DMU_OT_DNODE);
25712577
ASSERT(!(flags & DNODE_FIND_BACKWARDS));
2578+
ASSERT3U(P2PHASE(*offset, DNODE_SHIFT), ==, 0);
2579+
ASSERT(ISP2(blkfill));
25722580

2573-
for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
2581+
for (i = P2PHASE(*offset >> DNODE_SHIFT, blkfill);
25742582
i < blkfill; i += dnp[i].dn_extra_slots + 1) {
25752583
if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
25762584
break;
2585+
ASSERT3S(i + dnp[i].dn_extra_slots, <, blkfill);
25772586
}
25782587

2579-
if (i == blkfill)
2588+
if (i >= blkfill)
25802589
error = SET_ERROR(ESRCH);
25812590

2582-
*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
2591+
*offset = P2ALIGN(*offset, DNODE_BLOCK_SIZE) +
25832592
(i << DNODE_SHIFT);
25842593
} else {
25852594
blkptr_t *bp = data;
2586-
uint64_t start = *offset;
2595+
uint64_t blkid, limit;
25872596
span = (lvl - 1) * epbs + dn->dn_datablkshift;
25882597
minfill = 0;
25892598
maxfill = blkfill << ((lvl - 1) * epbs);
2599+
ASSERT3S(span, >, 0);
2600+
ASSERT3U(maxfill, >, 0);
25902601

25912602
if (hole)
25922603
maxfill--;
@@ -2595,40 +2606,46 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
25952606

25962607
if (span >= 8 * sizeof (*offset)) {
25972608
/* This only happens on the highest indirection level */
2598-
ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
2599-
*offset = 0;
2600-
} else {
2601-
*offset = *offset >> span;
2609+
ASSERT3U(lvl, ==, dn->dn_nlevels);
2610+
goto out;
26022611
}
26032612

2604-
for (i = BF64_GET(*offset, 0, epbs);
2613+
blkid = *offset >> span;
2614+
limit = 1ULL << (8 * sizeof (*offset) - span);
2615+
epb = MIN(epb, limit); /* don't overflow *offset */
2616+
ASSERT3U(P2ALIGN(blkid, 1ULL << epbs) + epb, <=, limit);
2617+
2618+
if (inc < 0 && lvl == dn->dn_nlevels)
2619+
blkid = MIN(epb - 1, blkid);
2620+
2621+
for (i = BF64_GET(blkid, 0, epbs);
26052622
i >= 0 && i < epb; i += inc) {
26062623
if (BP_GET_FILL(&bp[i]) >= minfill &&
26072624
BP_GET_FILL(&bp[i]) <= maxfill &&
26082625
(hole || bp[i].blk_birth > txg))
26092626
break;
2610-
if (inc > 0 || *offset > 0)
2611-
*offset += inc;
2627+
if (inc > 0 || blkid > 0)
2628+
blkid += inc;
26122629
}
26132630

2614-
if (span >= 8 * sizeof (*offset)) {
2615-
*offset = start;
2616-
} else {
2617-
*offset = *offset << span;
2631+
ASSERT(i >= 0 || inc < 0);
2632+
ASSERT(blkid < limit || (inc > 0 && i >= epb));
2633+
2634+
/* set *offset unless matched same block or under/overflow */
2635+
if (blkid != (*offset >> span) && blkid < limit &&
2636+
(i >= 0 || blkid > 0)) {
2637+
/* position offset at end if traversing backwards */
2638+
uint64_t endoff = inc < 0 ? 1 : 0;
2639+
uint64_t result = ((blkid + endoff) << span) - endoff;
2640+
ASSERT(inc > 0 ? result > *offset : result < *offset);
2641+
*offset = result;
26182642
}
26192643

2620-
if (inc < 0) {
2621-
/* traversing backwards; position offset at the end */
2622-
if (span < 8 * sizeof (*offset))
2623-
*offset = MIN(*offset + (1ULL << span) - 1,
2624-
start);
2625-
} else if (*offset < start) {
2626-
*offset = start;
2627-
}
26282644
if (i < 0 || i >= epb)
26292645
error = SET_ERROR(ESRCH);
26302646
}
26312647

2648+
out:
26322649
if (db != NULL) {
26332650
rw_exit(&db->db_rwlock);
26342651
dbuf_rele(db, FTAG);
@@ -2637,6 +2654,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
26372654
return (error);
26382655
}
26392656

2657+
/*
2658+
* Adjust *offset to the next (or previous) block byte offset at lvl.
2659+
* Returns FALSE if *offset would overflow or underflow.
2660+
*/
2661+
static boolean_t
2662+
dnode_next_block(dnode_t *dn, boolean_t back, uint64_t *offset, int lvl)
2663+
{
2664+
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2665+
int span = lvl * epbs + dn->dn_datablkshift;
2666+
uint64_t blkid, limit;
2667+
2668+
if (span >= 8 * sizeof (uint64_t))
2669+
return (B_FALSE);
2670+
2671+
blkid = *offset >> span;
2672+
limit = 1ULL << (8 * sizeof (*offset) - span);
2673+
if (!back && blkid + 1 < limit)
2674+
*offset = (blkid + 1) << span;
2675+
else if (back && blkid > 0)
2676+
*offset = (blkid << span) - 1;
2677+
else
2678+
return (B_FALSE);
2679+
2680+
return (B_TRUE);
2681+
}
2682+
26402683
/*
26412684
* Find the next hole, data, or sparse region at or after *offset.
26422685
* The value 'blkfill' tells us how many items we expect to find
@@ -2664,9 +2707,10 @@ int
26642707
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
26652708
int minlvl, uint64_t blkfill, uint64_t txg)
26662709
{
2667-
uint64_t initial_offset = *offset;
2710+
uint64_t matched = *offset;
26682711
int lvl, maxlvl;
26692712
int error = 0;
2713+
boolean_t back = ((flags & DNODE_FIND_BACKWARDS) != 0);
26702714

26712715
if (!(flags & DNODE_FIND_HAVELOCK))
26722716
rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -2688,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
26882732

26892733
maxlvl = dn->dn_phys->dn_nlevels;
26902734

2691-
for (lvl = minlvl; lvl <= maxlvl; lvl++) {
2735+
for (lvl = minlvl; lvl <= maxlvl; ) {
26922736
error = dnode_next_offset_level(dn,
26932737
flags, offset, lvl, blkfill, txg);
2694-
if (error != ESRCH)
2738+
if (error == 0 && lvl > minlvl) {
2739+
--lvl;
2740+
matched = *offset;
2741+
} else if (error == ESRCH && lvl < maxlvl &&
2742+
dnode_next_block(dn, back, &matched, lvl)) {
2743+
/*
2744+
* Continue search at next/prev offset in lvl+1 block.
2745+
*
2746+
* Usually we only search upwards at the start of the
2747+
* search as higher level blocks point at a matching
2748+
* minlvl block in most cases, but we backtrack if not.
2749+
*
2750+
* This can happen for txg > 0 searches if the block
2751+
* contains only BPs/dnodes freed at that txg. It also
2752+
* happens if we are still syncing out the tree, and
2753+
* some BP's at higher levels are not updated yet.
2754+
*
2755+
* We must adjust offset to avoid coming back to the
2756+
* same offset and getting stuck looping forever. This
2757+
* also deals with the case where offset is already at
2758+
* the beginning or end of the object.
2759+
*/
2760+
++lvl;
2761+
*offset = matched;
2762+
} else {
26952763
break;
2696-
}
2697-
2698-
while (error == 0 && --lvl >= minlvl) {
2699-
error = dnode_next_offset_level(dn,
2700-
flags, offset, lvl, blkfill, txg);
2764+
}
27012765
}
27022766

27032767
/*
@@ -2709,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
27092773
error = 0;
27102774
}
27112775

2712-
if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
2713-
initial_offset < *offset : initial_offset > *offset))
2714-
error = SET_ERROR(ESRCH);
27152776
out:
27162777
if (!(flags & DNODE_FIND_HAVELOCK))
27172778
rw_exit(&dn->dn_struct_rwlock);

0 commit comments

Comments
 (0)