diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 6ce995d0a086..d826c7e1f0c0 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -77,11 +77,14 @@ typedef enum trace_alloc_type { #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_WEIGHT_CLAIM (1ULL << 61) -#define METASLAB_WEIGHT_TYPE (1ULL << 60) +#define METASLAB_WEIGHT_MASK ((1ULL << 60) | (1ULL << 59)) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ METASLAB_WEIGHT_CLAIM) +#define METASLAB_WEIGHT_MAX_IDX 58 +#define METASLAB_WEIGHT_MAX ((1ULL << (METASLAB_WEIGHT_MAX_IDX + 1)) - 1) + /* * The metaslab weight is used to encode the amount of free space in a * metaslab, such that the "best" metaslab appears first when sorting the @@ -103,18 +106,30 @@ typedef enum trace_alloc_type { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PSC1| weighted-free space | + * |PSC10| weighted-free space | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation * C - indicates activation for claimed block zio * space - the fragmentation-weighted space * + * Space-based weight v2: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |PSC11| weighted-free space | idx | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * PS - indicates primary and secondary activation + * C - indicates activation for claimed block zio + * idx - index for the highest bucket in the histogram + * space - the fragmentation-weighted space + * * Segment-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PSC0| idx| count of segments in region | + * |PSC00| idx| count of segments in region | * 
+-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation @@ -125,17 +140,22 @@ typedef enum trace_alloc_type { #define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) #define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) +#define WEIGHT_GET_TYPE(weight) BF64_GET((weight), 59, 2) +#define WEIGHT_SET_TYPE(weight, x) BF64_SET((weight), 59, 2, x) #define WEIGHT_IS_SPACEBASED(weight) \ - ((weight) == 0 || BF64_GET((weight), 60, 1)) -#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) + ((weight) == 0 || WEIGHT_GET_TYPE((weight))) +#define WEIGHT_SET_SPACEBASED(weight) WEIGHT_SET_TYPE((weight), 2) +#define WEIGHT_IS_SPACEBASED_V2(weight) \ + ((weight) == 0 || WEIGHT_GET_TYPE((weight)) == 3) +#define WEIGHT_SET_SPACEBASED_V2(weight) WEIGHT_SET_TYPE((weight), 3) /* * These macros are only applicable to segment-based weighting. */ -#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) -#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) -#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) -#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) +#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 53, 6) +#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 53, 6, x) +#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 53) +#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 53, x) /* * Per-allocator data structure. diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 60ec56b4d1f6..c47fc7c70a21 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -326,6 +326,14 @@ Prevent log spacemaps from being destroyed during pool exports and destroys. .It Sy zfs_metaslab_segment_weight_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable segment-based metaslab selection. . +.It Sy zfs_metaslab_space_weight_v2_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enable/disable the new space-based metaslab selection. 
+.Pp +Note that this algorithm is only enabled if +.Sy zfs_metaslab_segment_weight_enabled +is set to false. +It also requires space map histograms to be enabled. +. .It Sy zfs_metaslab_switch_threshold Ns = Ns Sy 2 Pq int When using segment-based metaslab selection, continue allocating from the active metaslab until this option's diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 3f649ffb44e4..70007b65885b 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -258,6 +258,16 @@ static const boolean_t zfs_remap_blkptr_enable = B_TRUE; */ static int zfs_metaslab_segment_weight_enabled = B_TRUE; +/* + * Enable/disable the new space-based metaslab selection algorithm. + * + * The new space-based algorithm attempts to take into account not only the + * largest free segment, as the segment-based weight does, but other segments + * that are almost as large. This can improve metaslab selection and reduce the + * number of metaslab loads needed to satisfy a given set of allocations. + */ +static int zfs_metaslab_space_weight_v2_enabled = B_TRUE; + /* * When using segment-based metaslab selection, we will continue * allocating from the active metaslab until we have exhausted @@ -2339,7 +2349,7 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) uint64_t weight = msp->ms_weight; uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; - boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); + uint8_t type = WEIGHT_GET_TYPE(msp->ms_weight); uint64_t frag = msp->ms_fragmentation; uint64_t max_segsize = msp->ms_max_size; @@ -2367,8 +2377,7 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) * If the weight type changed then there is no point in doing * verification. Revert fields to their original values. 
*/ - if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || - (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { + if (type != WEIGHT_GET_TYPE(msp->ms_weight)) { msp->ms_fragmentation = frag; msp->ms_weight = weight; return; @@ -2753,6 +2762,7 @@ metaslab_unload(metaslab_t *msp) return; zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); + msp->ms_loaded = B_FALSE; msp->ms_unload_time = gethrtime(); @@ -3072,6 +3082,152 @@ metaslab_fini(metaslab_t *msp) kmem_free(msp, sizeof (metaslab_t)); } +/* + * Return the weight of the specified metaslab, according to the new space-based + * weighting algorithm. The metaslab must be loaded. This function can + * be called within a sync pass since it relies only on the metaslab's + * range tree which is always accurate when the metaslab is loaded. + */ +static uint64_t +metaslab_space_weight_from_range_tree(metaslab_t *msp) +{ + uint64_t weight = 0; + uint8_t vd_shift = msp->ms_group->mg_vd->vdev_ashift; + ASSERT3U(vd_shift, >=, 3); + + ASSERT(msp->ms_loaded); + ASSERT3U(vd_shift, >=, SPA_MINBLOCKSHIFT); + + for (int i = ZFS_RANGE_TREE_HISTOGRAM_SIZE - 1; i >= vd_shift; + i--) { + uint8_t seg_shift = 2 * (i - (vd_shift - 3)); + uint64_t segments = msp->ms_allocatable->rt_histogram[i]; + if (segments == 0) + continue; + if (weight == 0) + weight = i; + /* Prevent overflow using log_2 math */ + if (seg_shift + highbit64(segments) > METASLAB_WEIGHT_MAX_IDX) + return (METASLAB_WEIGHT_MAX); + weight = MIN(METASLAB_WEIGHT_MAX, + weight + (segments << seg_shift)); + } + return (weight); + } + +/* + * Calculate the new space-based weight based on the on-disk histogram. + * Should be applied only to unloaded metaslabs (i.e. no incoming allocations) + * in order to give results consistent with the on-disk state. 
+ */ +static uint64_t +metaslab_space_weight_from_spacemap(metaslab_t *msp) +{ + space_map_t *sm = msp->ms_sm; + ASSERT(!msp->ms_loaded); + ASSERT(sm != NULL); + ASSERT3U(space_map_object(sm), !=, 0); + ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); + uint8_t vd_shift = msp->ms_group->mg_vd->vdev_ashift; + ASSERT3U(vd_shift, >=, 3); + + /* + * Create a joint histogram from all the segments that have made + * it to the metaslab's space map histogram, that are not yet + * available for allocation because they are still in the freeing + * pipeline (e.g. freeing, freed, and defer trees). Then subtract + * these segments from the space map's histogram to get a more + * accurate weight. + */ + uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + deferspace_histogram[i] += msp->ms_synchist[i]; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + deferspace_histogram[i] += msp->ms_deferhist[t][i]; + } + } + + uint64_t weight = 0; + for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { + uint8_t seg_shift = 2 * (i + sm->sm_shift - vd_shift + 3); + uint64_t segments = + sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; + if (segments == 0) + continue; + if (weight == 0) + weight = i + sm->sm_shift; + /* Prevent overflow using log_2 math */ + if (seg_shift + highbit64(segments) > METASLAB_WEIGHT_MAX_IDX) + return (METASLAB_WEIGHT_MAX); + weight = MIN(METASLAB_WEIGHT_MAX, + weight + (segments << seg_shift)); + } + return (weight); +} + +/* + * The space weight v2 algorithm uses information from the free space + * histograms to provide a more useful weighting of the free space in + * the metaslab. Rather than simply using the fragmentation metric, we + * actually use the number of segments in each bucket to determine the + * weight. 
The weight is calculated as follows: + * + * sum from i = 0 to 29 of N(i) * 2^{2i}, where N(i) is the number of free + * segments of size 2^{i + shift} + * + * N(i) * 2^i is just the space used by the segments in a bucket divided + * by the shift, and the additional factor of 2^i weights the larger + * segments more heavily. If there are any segments of size larger than + * (28 + shift), we just max out the weight. That metaslab is free enough + * for any purpose. + */ +static uint64_t +metaslab_space_weight_v2(metaslab_t *msp) +{ + metaslab_group_t *mg = msp->ms_group; + uint64_t weight = 0; + uint8_t shift = mg->mg_vd->vdev_ashift; + if (metaslab_allocated_space(msp) == 0) { + int idx = highbit64(msp->ms_size) - shift - 1 + 3; + weight = 1ULL << MIN(METASLAB_WEIGHT_MAX_IDX, 2 * idx); + weight += highbit64(msp->ms_size) - 1; + WEIGHT_SET_SPACEBASED_V2(weight); + return (weight); + } + + ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); + + /* + * If the metaslab is fully allocated then just make the weight 0. + */ + if (metaslab_allocated_space(msp) == msp->ms_size) { + WEIGHT_SET_SPACEBASED_V2(weight); + return (weight); + } + + /* + * If the metaslab is already loaded, then use the range tree to + * determine the weight. Otherwise, we rely on the space map information + * to generate the weight. + */ + if (msp->ms_loaded) + weight = metaslab_space_weight_from_range_tree(msp); + else + weight = metaslab_space_weight_from_spacemap(msp); + ASSERT3U(weight, <=, METASLAB_WEIGHT_MAX); + + /* + * If the metaslab was active the last time we calculated its weight + * then keep it active. We want to consume the entire region that + * is associated with this weight. 
+ */ + if (msp->ms_activation_weight != 0 && weight != 0) + WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); + WEIGHT_SET_SPACEBASED_V2(weight); + return (weight); +} + /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done @@ -3209,10 +3365,18 @@ metaslab_space_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (zfs_metaslab_space_weight_v2_enabled && + spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && + (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == + sizeof (space_map_phys_t))) { + return (metaslab_space_weight_v2(msp)); + } + /* * The baseline weight is the metaslab's free space. */ @@ -3463,9 +3627,12 @@ metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) */ should_allocate = (asize < 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); + } else if (WEIGHT_IS_SPACEBASED_V2(msp->ms_weight)) { + should_allocate = (asize < + 1ULL << ((msp->ms_weight & 0x3f) + 1)); } else { should_allocate = (asize <= - (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); + (msp->ms_weight & ~METASLAB_WEIGHT_MASK)); } return (should_allocate); @@ -3678,14 +3845,15 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { - uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE; + uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_MASK; /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. 
*/ - ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || + ASSERT(!(WEIGHT_IS_SPACEBASED(msp->ms_weight) && + !WEIGHT_IS_SPACEBASED_V2(msp->ms_weight)) || size >= SPA_MINBLOCKSIZE || zfs_range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); @@ -6401,6 +6569,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, perf_bias, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, ZMOD_RW, "Enable segment-based metaslab selection"); +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, space_weight_v2_enabled, INT, + ZMOD_RW, "Enable new space-based metaslab selection"); + ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching");