diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 60ec56b4d1f6..a7e105a9537e 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -771,6 +771,12 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list. This batch-style operation prevents entire sub-lists from being evicted at once but comes at a cost of additional unlocking and locking. . +.It Sy zfs_arc_evict_batches_limit Ns = Ns Sy 5 Pq uint +Number of +.Sy zfs_arc_evict_batch_limit +batches to process per parallel eviction task under heavy load to reduce the number +of context switches. +. .It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int Sets the number of ARC eviction threads to be used. .Pp diff --git a/module/zfs/arc.c b/module/zfs/arc.c index b864d9035974..8d6554a1c86e 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -371,6 +371,12 @@ static uint_t zfs_arc_eviction_pct = 200; */ static uint_t zfs_arc_evict_batch_limit = 10; +/* + * Number of batches to process per parallel eviction task under heavy load to + * reduce the number of context switches. + */ +static uint_t zfs_arc_evict_batches_limit = 5; + /* number of seconds before growing cache again */ uint_t arc_grow_retry = 5; @@ -3900,7 +3906,7 @@ arc_set_need_free(void) static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, - uint64_t spa, uint64_t bytes) + uint64_t spa, uint64_t bytes, boolean_t *more) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0, real_evicted = 0; @@ -3984,6 +3990,10 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, multilist_sublist_unlock(mls); + /* Indicate if another iteration may be productive. */ + if (more) + *more = (hdr != NULL); + /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. 
Since the list is @@ -4004,21 +4014,12 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); - cv_broadcast(&aw->aew_cv); + cv_signal(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); - /* - * If the ARC size is reduced from arc_c_max to arc_c_min (especially - * if the average cached block is small), eviction can be on-CPU for - * many seconds. To ensure that other threads that may be bound to - * this CPU are able to make progress, make a voluntary preemption - * call here. - */ - kpreempt(KPREEMPT_SYNC); - return (bytes_evicted); } @@ -4079,8 +4080,18 @@ static void arc_evict_task(void *arg) { evict_arg_t *eva = arg; - eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx, - eva->eva_marker, eva->eva_spa, eva->eva_bytes); + uint64_t total_evicted = 0; + boolean_t more; + uint_t batches = zfs_arc_evict_batches_limit; + + /* Process multiple batches to amortize taskq dispatch overhead. 
*/ + do { + total_evicted += arc_evict_state_impl(eva->eva_ml, + eva->eva_idx, eva->eva_marker, eva->eva_spa, + eva->eva_bytes - total_evicted, &more); + } while (total_evicted < eva->eva_bytes && --batches > 0 && more); + + eva->eva_evicted = total_evicted; } static void @@ -4221,18 +4232,19 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, if (bytes == ARC_EVICT_ALL) { evict = bytes; - } else if (left > ntasks * MIN_EVICT_SIZE) { + } else if (left >= ntasks * MIN_EVICT_SIZE) { evict = DIV_ROUND_UP(left, ntasks); } else { - ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE); - if (ntasks == 1) + ntasks = left / MIN_EVICT_SIZE; + if (ntasks < 2) use_evcttq = B_FALSE; + else + evict = DIV_ROUND_UP(left, ntasks); } } for (int i = 0; sublists_left > 0; i++, sublist_idx++, sublists_left--) { - uint64_t bytes_remaining; uint64_t bytes_evicted; /* we've reached the end, wrap to the beginning */ @@ -4254,16 +4266,17 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, continue; } - if (total_evicted < bytes) - bytes_remaining = bytes - total_evicted; - else - break; - bytes_evicted = arc_evict_state_impl(ml, sublist_idx, - markers[sublist_idx], spa, bytes_remaining); + markers[sublist_idx], spa, bytes - total_evicted, + NULL); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; + + if (total_evicted < bytes) + kpreempt(KPREEMPT_SYNC); + else + break; } if (use_evcttq) { @@ -4889,7 +4902,7 @@ arc_evict_cb(void *arg, zthr_t *zthr) */ arc_evict_waiter_t *aw; while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { - cv_broadcast(&aw->aew_cv); + cv_signal(&aw->aew_cv); } arc_set_need_free(); } @@ -5170,9 +5183,8 @@ arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve) uint64_t last_count = 0; mutex_enter(&arc_evict_lock); - if (!list_is_empty(&arc_evict_waiters)) { - arc_evict_waiter_t *last = - list_tail(&arc_evict_waiters); + arc_evict_waiter_t *last; + if ((last = 
list_tail(&arc_evict_waiters)) != NULL) { last_count = last->aew_count; } else if (!arc_evict_needed) { arc_evict_needed = B_TRUE; @@ -11290,6 +11302,9 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batches_limit, UINT, ZMOD_RW, + "The number of batches to run per parallel eviction task"); + ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW, "Number of arc_prune threads");