Skip to content

Commit a33e429

Browse files
committed
dnode_next_offset: backtrack if lower level does not match
This changes the basic search algorithm from a single search up and down the tree to a full depth-first traversal to handle conditions where the tree matches at a higher level but not a lower level.

Normally higher-level blocks always point to matching blocks, but there are cases where this does not happen:

1. Racing block pointer updates from dbuf_write_ready. Before f664f1e (#8946), both dbuf_write_ready and dnode_next_offset held dn_struct_rwlock, which protected against pointer writes from concurrent syncs. This no longer applies, so sync context can, e.g., clear or fill all L1->L0 BPs before the L2->L1 BP and higher BPs are updated. dnode_free_range in particular can reach this case and skip over L1 blocks that need to be dirtied. Later, sync will panic in free_children when trying to clear a non-dirty indirect block. This case was found with ztest.

2. txg > 0, non-hole case. This is #11196. Freeing blocks/dnodes breaks the assumption that a match at a higher level implies a match at a lower level when filtering txg > 0. Whenever some but not all L0 blocks are freed, the parent L1 block is rewritten. Its updated L2->L1 BP reflects a newer birth txg. Later, when searching by txg, if the L1 block matches since the txg is newer, it is possible that none of the remaining L1->L0 BPs match if none have been updated. The same behavior is possible with dnode search at L0. This is reachable from dsl_destroy_head for synchronous freeing. When this happens, open context fails to free objects, leaving sync context stuck freeing potentially many objects. This is also reachable from traverse_pool for extreme rewind, where it is theoretically possible that datasets not dirtied after txg are skipped if the MOS has high enough indirection to trigger this case.

In both of these cases, without backtracking the search ends prematurely, as an ESRCH result implies no more matches in the entire object. Signed-off-by: Robert Evans <[email protected]>
1 parent 1acd246 commit a33e429

File tree

1 file changed

+102
-40
lines changed

1 file changed

+102
-40
lines changed

module/zfs/dnode.c

Lines changed: 102 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2528,13 +2528,18 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
25282528
* If we don't find what we are looking for in the block, we return ESRCH.
25292529
* Otherwise, return with *offset pointing to the beginning (if searching
25302530
* forwards) or end (if searching backwards) of the range covered by the
2531-
* block pointer we matched on (or dnode).
2531+
* block pointer we matched on (or dnode) but never less (or greater) than
2532+
* the starting offset.
25322533
*
2533-
* The basic search algorithm used below by dnode_next_offset() is to
2534-
* use this function to search up the block tree (widen the search) until
2535-
* we find something (i.e., we don't return ESRCH) and then search back
2536-
* down the tree (narrow the search) until we reach our original search
2537-
* level.
2534+
* For ESRCH, *offset is set to the first byte offset after (or before) the
2535+
* searched block unless the block is a hole or the resulting offset would
2536+
* underflow or overflow (in both cases the starting *offset is unchanged).
2537+
*
2538+
* The basic search algorithm used below by dnode_next_offset() uses this
2539+
* function to perform a block-order tree traversal. We search up the block
2540+
* tree (widen the search) until we find something (i.e., we don't return
2541+
* ESRCH) and then search back down the tree (narrow the search) until we
2542+
* reach our original search level or backtrack up because nothing matches.
25382543
*/
25392544
static int
25402545
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
@@ -2549,6 +2554,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
25492554
int i, inc, error, span;
25502555

25512556
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2557+
ASSERT3U(dn->dn_nlevels, >, 0);
25522558

25532559
hole = ((flags & DNODE_FIND_HOLE) != 0);
25542560
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
@@ -2599,24 +2605,29 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
25992605

26002606
ASSERT(dn->dn_type == DMU_OT_DNODE);
26012607
ASSERT(!(flags & DNODE_FIND_BACKWARDS));
2608+
ASSERT3U(P2PHASE(*offset, DNODE_SHIFT), ==, 0);
2609+
ASSERT(ISP2(blkfill));
26022610

2603-
for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
2611+
for (i = P2PHASE(*offset >> DNODE_SHIFT, blkfill);
26042612
i < blkfill; i += dnp[i].dn_extra_slots + 1) {
26052613
if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
26062614
break;
2615+
ASSERT3S(i + dnp[i].dn_extra_slots, <, blkfill);
26072616
}
26082617

2609-
if (i == blkfill)
2618+
if (i >= blkfill)
26102619
error = SET_ERROR(ESRCH);
26112620

2612-
*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
2621+
*offset = P2ALIGN_TYPED(*offset, DNODE_BLOCK_SIZE, uint64_t) +
26132622
(i << DNODE_SHIFT);
26142623
} else {
26152624
blkptr_t *bp = data;
2616-
uint64_t start = *offset;
2625+
uint64_t blkid, limit;
26172626
span = (lvl - 1) * epbs + dn->dn_datablkshift;
26182627
minfill = 0;
26192628
maxfill = blkfill << ((lvl - 1) * epbs);
2629+
ASSERT3S(span, >, 0);
2630+
ASSERT3U(maxfill, >, 0);
26202631

26212632
if (hole)
26222633
maxfill--;
@@ -2625,40 +2636,47 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
26252636

26262637
if (span >= 8 * sizeof (*offset)) {
26272638
/* This only happens on the highest indirection level */
2628-
ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
2629-
*offset = 0;
2630-
} else {
2631-
*offset = *offset >> span;
2639+
ASSERT3U(lvl, ==, dn->dn_nlevels);
2640+
goto out;
26322641
}
26332642

2634-
for (i = BF64_GET(*offset, 0, epbs);
2643+
blkid = *offset >> span;
2644+
limit = 1ULL << (8 * sizeof (*offset) - span);
2645+
epb = MIN(epb, limit); /* don't overflow *offset */
2646+
ASSERT3U(P2ALIGN_TYPED(blkid, 1ULL << epbs, uint64_t) + epb,
2647+
<=, limit);
2648+
2649+
if (inc < 0 && lvl == dn->dn_nlevels)
2650+
blkid = MIN(epb - 1, blkid);
2651+
2652+
for (i = BF64_GET(blkid, 0, epbs);
26352653
i >= 0 && i < epb; i += inc) {
26362654
if (BP_GET_FILL(&bp[i]) >= minfill &&
26372655
BP_GET_FILL(&bp[i]) <= maxfill &&
26382656
(hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
26392657
break;
2640-
if (inc > 0 || *offset > 0)
2641-
*offset += inc;
2658+
if (inc > 0 || blkid > 0)
2659+
blkid += inc;
26422660
}
26432661

2644-
if (span >= 8 * sizeof (*offset)) {
2645-
*offset = start;
2646-
} else {
2647-
*offset = *offset << span;
2662+
ASSERT(i >= 0 || inc < 0);
2663+
ASSERT(blkid < limit || (inc > 0 && i >= epb));
2664+
2665+
/* set *offset unless matched same block or under/overflow */
2666+
if (blkid != (*offset >> span) && blkid < limit &&
2667+
(i >= 0 || blkid > 0)) {
2668+
/* position offset at end if traversing backwards */
2669+
uint64_t endoff = inc < 0 ? 1 : 0;
2670+
uint64_t result = ((blkid + endoff) << span) - endoff;
2671+
ASSERT(inc > 0 ? result > *offset : result < *offset);
2672+
*offset = result;
26482673
}
26492674

2650-
if (inc < 0) {
2651-
/* traversing backwards; position offset at the end */
2652-
if (span < 8 * sizeof (*offset))
2653-
*offset = MIN(*offset + (1ULL << span) - 1,
2654-
start);
2655-
} else if (*offset < start) {
2656-
*offset = start;
2657-
}
26582675
if (i < 0 || i >= epb)
26592676
error = SET_ERROR(ESRCH);
26602677
}
26612678

2679+
out:
26622680
if (db != NULL) {
26632681
rw_exit(&db->db_rwlock);
26642682
dbuf_rele(db, FTAG);
@@ -2667,6 +2685,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
26672685
return (error);
26682686
}
26692687

2688+
/*
2689+
* Adjust *offset to the next (or previous) block byte offset at lvl.
2690+
* Returns FALSE if *offset would overflow or underflow.
2691+
*/
2692+
static boolean_t
2693+
dnode_next_block(dnode_t *dn, boolean_t back, uint64_t *offset, int lvl)
2694+
{
2695+
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2696+
int span = lvl * epbs + dn->dn_datablkshift;
2697+
uint64_t blkid, limit;
2698+
2699+
if (span >= 8 * sizeof (uint64_t))
2700+
return (B_FALSE);
2701+
2702+
blkid = *offset >> span;
2703+
limit = 1ULL << (8 * sizeof (*offset) - span);
2704+
if (!back && blkid + 1 < limit)
2705+
*offset = (blkid + 1) << span;
2706+
else if (back && blkid > 0)
2707+
*offset = (blkid << span) - 1;
2708+
else
2709+
return (B_FALSE);
2710+
2711+
return (B_TRUE);
2712+
}
2713+
26702714
/*
26712715
* Find the next hole, data, or sparse region at or after *offset.
26722716
* The value 'blkfill' tells us how many items we expect to find
@@ -2694,9 +2738,10 @@ int
26942738
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
26952739
int minlvl, uint64_t blkfill, uint64_t txg)
26962740
{
2697-
uint64_t initial_offset = *offset;
2741+
uint64_t matched = *offset;
26982742
int lvl, maxlvl;
26992743
int error = 0;
2744+
boolean_t back = ((flags & DNODE_FIND_BACKWARDS) != 0);
27002745

27012746
if (!(flags & DNODE_FIND_HAVELOCK))
27022747
rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -2718,16 +2763,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
27182763

27192764
maxlvl = dn->dn_phys->dn_nlevels;
27202765

2721-
for (lvl = minlvl; lvl <= maxlvl; lvl++) {
2766+
for (lvl = minlvl; lvl <= maxlvl; ) {
27222767
error = dnode_next_offset_level(dn,
27232768
flags, offset, lvl, blkfill, txg);
2724-
if (error != ESRCH)
2769+
if (error == 0 && lvl > minlvl) {
2770+
--lvl;
2771+
matched = *offset;
2772+
} else if (error == ESRCH && lvl < maxlvl &&
2773+
dnode_next_block(dn, back, &matched, lvl)) {
2774+
/*
2775+
* Continue search at next/prev offset in lvl+1 block.
2776+
*
2777+
* Usually we only search upwards at the start of the
2778+
* search as higher level blocks point at a matching
2779+
* minlvl block in most cases, but we backtrack if not.
2780+
*
2781+
* This can happen for txg > 0 searches if the block
2782+
* contains only BPs/dnodes freed at that txg. It also
2783+
* happens if we are still syncing out the tree, and
2784+
* some BP's at higher levels are not updated yet.
2785+
*
2786+
* We must adjust offset to avoid coming back to the
2787+
* same offset and getting stuck looping forever. This
2788+
* also deals with the case where offset is already at
2789+
* the beginning or end of the object.
2790+
*/
2791+
++lvl;
2792+
*offset = matched;
2793+
} else {
27252794
break;
2726-
}
2727-
2728-
while (error == 0 && --lvl >= minlvl) {
2729-
error = dnode_next_offset_level(dn,
2730-
flags, offset, lvl, blkfill, txg);
2795+
}
27312796
}
27322797

27332798
/*
@@ -2739,9 +2804,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
27392804
error = 0;
27402805
}
27412806

2742-
if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
2743-
initial_offset < *offset : initial_offset > *offset))
2744-
error = SET_ERROR(ESRCH);
27452807
out:
27462808
if (!(flags & DNODE_FIND_HAVELOCK))
27472809
rw_exit(&dn->dn_struct_rwlock);

0 commit comments

Comments
 (0)