diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index c609dd405..d7a950812 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -64,6 +64,8 @@ extern uint64_t zfs_wrlog_data_max; extern uint_t zfs_dirty_data_max_percent; extern uint_t zfs_dirty_data_max_max_percent; extern uint_t zfs_delay_min_dirty_percent; +extern uint_t zfs_vdev_async_write_active_min_dirty_percent; +extern uint_t zfs_vdev_async_write_active_max_dirty_percent; extern uint64_t zfs_delay_scale; /* These macros are for indexing into the zfs_all_blkstats_t. */ diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index a87d817ed..c0844dac9 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -75,18 +75,13 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); /* * metaslab alloc flags */ -#define METASLAB_HINTBP_FAVOR 0x0 -#define METASLAB_HINTBP_AVOID 0x1 +#define METASLAB_ZIL 0x1 #define METASLAB_GANG_HEADER 0x2 #define METASLAB_GANG_CHILD 0x4 #define METASLAB_ASYNC_ALLOC 0x8 -#define METASLAB_DONT_THROTTLE 0x10 -#define METASLAB_MUST_RESERVE 0x20 -#define METASLAB_ZIL 0x80 -int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, - int); +int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, + uint64_t, blkptr_t *, int, zio_alloc_list_t *, int, const void *); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); @@ -103,15 +98,17 @@ void metaslab_stat_fini(void); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); -metaslab_class_t *metaslab_class_create(spa_t *, const metaslab_ops_t *); +metaslab_class_t *metaslab_class_create(spa_t *, const metaslab_ops_t *, + boolean_t); void metaslab_class_destroy(metaslab_class_t *); -int metaslab_class_validate(metaslab_class_t *); +void metaslab_class_validate(metaslab_class_t *); +void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); -boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, - zio_t *, int); -void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); +boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, zio_t *, + boolean_t, boolean_t *); +boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *); void metaslab_class_evict_old(metaslab_class_t *, uint64_t); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); @@ -130,9 +127,8 @@ uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); -void metaslab_group_alloc_decrement(spa_t *, uint64_t, const void *, int, int, - boolean_t); -void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, const void *, int); +void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t, + const void *); void metaslab_recalculate_weight_and_sort(metaslab_t *); void metaslab_disable(metaslab_t *); void metaslab_enable(metaslab_t *, boolean_t, boolean_t); diff --git a/include/sys/metaslab_impl.h 
b/include/sys/metaslab_impl.h index 5f999c02b..4408dcfdd 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -141,23 +141,24 @@ typedef enum trace_alloc_type { * Per-allocator data structure. */ typedef struct metaslab_class_allocator { + kmutex_t mca_lock; + avl_tree_t mca_tree; + metaslab_group_t *mca_rotor; uint64_t mca_aliquot; /* * The allocation throttle works on a reservation system. Whenever * an asynchronous zio wants to perform an allocation it must - * first reserve the number of blocks that it wants to allocate. + * first reserve the number of bytes that it wants to allocate. * If there aren't sufficient slots available for the pending zio * then that I/O is throttled until more slots free up. The current - * number of reserved allocations is maintained by the mca_alloc_slots - * refcount. The mca_alloc_max_slots value determines the maximum - * number of allocations that the system allows. Gang blocks are - * allowed to reserve slots even if we've reached the maximum - * number of allocations allowed. + * size of reserved allocations is maintained by mca_reserved. + * The maximum total size of reserved allocations is determined by + * mc_alloc_max in the metaslab_class_t. Gang blocks are allowed + * to reserve for their headers even if we've reached the maximum. */ - uint64_t mca_alloc_max_slots; - zfs_refcount_t mca_alloc_slots; + uint64_t mca_reserved; } ____cacheline_aligned metaslab_class_allocator_t; /* @@ -190,10 +191,10 @@ struct metaslab_class { */ uint64_t mc_groups; - /* - * Toggle to enable/disable the allocation throttle. - */ + boolean_t mc_is_log; boolean_t mc_alloc_throttle_enabled; + uint64_t mc_alloc_io_size; + uint64_t mc_alloc_max; uint64_t mc_alloc_groups; /* # of allocatable groups */ @@ -216,11 +217,10 @@ struct metaslab_class { * Per-allocator data structure. */ typedef struct metaslab_group_allocator { - uint64_t mga_cur_max_alloc_queue_depth; - zfs_refcount_t mga_alloc_queue_depth; + zfs_refcount_t mga_queue_depth; metaslab_t *mga_primary; metaslab_t *mga_secondary; -} metaslab_group_allocator_t; +} ____cacheline_aligned metaslab_group_allocator_t; /* * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) @@ -235,6 +235,7 @@ struct metaslab_group { kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; + uint64_t mg_queue_target; boolean_t mg_allocatable; /* can we allocate? */ uint64_t mg_ms_ready; @@ -246,40 +247,12 @@ struct metaslab_group { */ boolean_t mg_initialized; - uint64_t mg_free_capacity; /* percentage free */ - int64_t mg_bias; int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; - /* - * In order for the allocation throttle to function properly, we cannot - * have too many IOs going to each disk by default; the throttle - * operates by allocating more work to disks that finish quickly, so - * allocating larger chunks to each disk reduces its effectiveness. - * However, if the number of IOs going to each allocator is too small, - * we will not perform proper aggregation at the vdev_queue layer, - * also resulting in decreased performance. Therefore, we will use a - * ramp-up strategy. - * - * Each allocator in each metaslab group has a current queue depth - * (mg_alloc_queue_depth[allocator]) and a current max queue depth - * (mga_cur_max_alloc_queue_depth[allocator]), and each metaslab group - * has an absolute max queue depth (mg_max_alloc_queue_depth). 
We - * add IOs to an allocator until the mg_alloc_queue_depth for that - * allocator hits the cur_max. Every time an IO completes for a given - * allocator on a given metaslab group, we increment its cur_max until - * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to - * help protect against disks that decrease in performance over time. - * - * It's possible for an allocator to handle more allocations than - * its max. This can occur when gang blocks are required or when other - * groups are unable to handle their share of allocations. - */ - uint64_t mg_max_alloc_queue_depth; - /* * A metalab group that can no longer allocate the minimum block * size will set mg_no_free_space. Once a metaslab group is out @@ -288,8 +261,6 @@ struct metaslab_group { */ boolean_t mg_no_free_space; - uint64_t mg_allocations; - uint64_t mg_failed_allocations; uint64_t mg_fragmentation; uint64_t mg_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE]; @@ -508,7 +479,7 @@ struct metaslab { */ hrtime_t ms_load_time; /* time last loaded */ hrtime_t ms_unload_time; /* time last unloaded */ - hrtime_t ms_selected_time; /* time last allocated from */ + uint64_t ms_selected_time; /* time last allocated from (secs) */ uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index a3fbf8504..8c52f751a 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -59,11 +59,6 @@ extern "C" { #endif -typedef struct spa_alloc { - kmutex_t spaa_lock; - avl_tree_t spaa_tree; -} ____cacheline_aligned spa_alloc_t; - typedef struct spa_allocs_use { kmutex_t sau_lock; uint_t sau_rotor; @@ -273,12 +268,6 @@ struct spa { uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ - /* - * spa_allocs is an array, whose lengths is stored in spa_alloc_count. - * There is one tree and one lock for each allocator, to help improve - * allocation performance in write-heavy workloads. - */ - spa_alloc_t *spa_allocs; spa_allocs_use_t *spa_allocs_use; int spa_alloc_count; int spa_active_allocator; /* selectable allocator */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 58a6cdcdc..a2a3e25d1 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -60,10 +60,6 @@ extern "C" { typedef struct vdev_queue vdev_queue_t; struct abd; -extern uint_t zfs_vdev_queue_depth_pct; -extern uint_t zfs_vdev_def_queue_depth; -extern uint_t zfs_vdev_async_write_max_active; - /* * Virtual device operations */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index e0f29f375..e166efe86 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -245,16 +245,26 @@ For L2ARC devices less than 1 GiB, the amount of data evicts is significant compared to the amount of restored L2ARC data. In this case, do not write log blocks in L2ARC in order not to waste space. . -.It Sy metaslab_aliquot Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 -Metaslab granularity, in bytes. +.It Sy metaslab_aliquot Ns = Ns Sy 2097152 Ns B Po 2 MiB Pc Pq u64 +Metaslab group's per child vdev allocation granularity, in bytes. This is roughly similar to what would be referred to as the "stripe size" in traditional RAID arrays. -In normal operation, ZFS will try to write this amount of data to each disk -before moving on to the next top-level vdev. 
+In normal operation, ZFS will try to write this amount of data to each child +of a top-level vdev before moving on to the next top-level vdev. . .It Sy metaslab_bias_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int -Enable metaslab group biasing based on their vdevs' over- or under-utilization -relative to the pool. +Enable biasing of metaslab groups based on their over- or under-utilization +relative to the metaslab class average. +If disabled, each metaslab group will receive allocations proportional to its +capacity. +. +.It Sy metaslab_perf_bias Ns = Ns Sy 1 Ns | Ns 0 Ns | Ns 2 Pq int +Controls biasing of metaslab groups based on their write performance. +Setting to 0 makes all metaslab groups receive fixed amounts of allocations. +Setting to 2 allows faster metaslab groups to allocate more. +Setting to 1 behaves like 2 if the pool is write-bound and like 0 otherwise. +That is, if the pool is limited by write throughput, then allocate more from +faster metaslab groups, but if not, try to evenly distribute the allocations. . .It Sy metaslab_force_ganging Ns = Ns Sy 16777217 Ns B Po 16 MiB + 1 B Pc Pq u64 Make some blocks above a certain size be gang blocks. @@ -1527,23 +1537,6 @@ This enforced wait ensures the HDD services the interactive I/O within a reasonable amount of time. .No See Sx ZFS I/O SCHEDULER . . -.It Sy zfs_vdev_queue_depth_pct Ns = Ns Sy 1000 Ns % Pq uint -Maximum number of queued allocations per top-level vdev expressed as -a percentage of -.Sy zfs_vdev_async_write_max_active , -which allows the system to detect devices that are more capable -of handling allocations and to allocate more blocks to those devices. -This allows for dynamic allocation distribution when devices are imbalanced, -as fuller devices will tend to be slower than empty devices. -.Pp -Also see -.Sy zio_dva_throttle_enabled . -. -.It Sy zfs_vdev_def_queue_depth Ns = Ns Sy 32 Pq uint -Default queue depth for each vdev IO allocator. -Higher values allow for better coalescing of sequential writes before sending -them to the disk, but can increase transaction commit times. -. .It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint Defines if the driver should retire on a given error type. The following options may be bitwise-ored together: @@ -2488,10 +2481,7 @@ Slow I/O counters can be seen with . .It Sy zio_dva_throttle_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Throttle block allocations in the I/O pipeline. -This allows for dynamic allocation distribution when devices are imbalanced. -When enabled, the maximum number of pending allocations per top-level vdev -is limited by -.Sy zfs_vdev_queue_depth_pct . +This allows for dynamic allocation distribution based on device performance. . .It Sy zfs_xattr_compat Ns = Ns 0 Ns | Ns 1 Pq int Control the naming scheme used when setting new xattrs in the user namespace. diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index d738eda60..e723d7640 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -45,12 +45,12 @@ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) /* - * Metaslab granularity, in bytes. This is roughly similar to what would be - * referred to as the "stripe size" in traditional RAID arrays. In normal - * operation, we will try to write this amount of data to each disk before - * moving on to the next top-level vdev. + * Metaslab group's per child vdev granularity, in bytes. This is roughly + * similar to what would be referred to as the "stripe size" in traditional + * RAID arrays.
In normal operation, we will try to write this amount of + * data to each disk before moving on to the next top-level vdev. */ -static uint64_t metaslab_aliquot = 1024 * 1024; +static uint64_t metaslab_aliquot = 2 * 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. @@ -238,10 +238,15 @@ static int metaslab_fragmentation_factor_enabled = B_TRUE; static int metaslab_lba_weighting_enabled = B_TRUE; /* - * Enable/disable metaslab group biasing. + * Enable/disable space-based metaslab group biasing. */ static int metaslab_bias_enabled = B_TRUE; +/* + * Control performance-based metaslab group biasing. + */ +static int metaslab_perf_bias = 1; + /* * Enable/disable remapping of indirect DVAs to their concrete vdevs. */ @@ -406,7 +411,7 @@ metaslab_stat_fini(void) * ========================================================================== */ metaslab_class_t * -metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops) +metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops, boolean_t is_log) { metaslab_class_t *mc; @@ -415,13 +420,19 @@ metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops) mc->mc_spa = spa; mc->mc_ops = ops; + mc->mc_is_log = is_log; + mc->mc_alloc_io_size = SPA_OLD_MAXBLOCKSIZE; + mc->mc_alloc_max = UINT64_MAX; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; + mutex_init(&mca->mca_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&mca->mca_tree, zio_bookmark_compare, + sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); mca->mca_rotor = NULL; - zfs_refcount_create_tracked(&mca->mca_alloc_slots); + mca->mca_reserved = 0; } return (mc); @@ -439,8 +450,10 @@ metaslab_class_destroy(metaslab_class_t *mc) for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; + avl_destroy(&mca->mca_tree); + mutex_destroy(&mca->mca_lock); ASSERT(mca->mca_rotor == NULL); - zfs_refcount_destroy(&mca->mca_alloc_slots); + ASSERT0(mca->mca_reserved); } mutex_destroy(&mc->mc_lock); multilist_destroy(&mc->mc_metaslab_txg_list); @@ -448,11 +461,52 @@ metaslab_class_destroy(metaslab_class_t *mc) mc_allocator[spa->spa_alloc_count])); } -int +void metaslab_class_validate(metaslab_class_t *mc) { - metaslab_group_t *mg; - vdev_t *vd; +#ifdef ZFS_DEBUG + spa_t *spa = mc->mc_spa; + + /* + * Must hold one of the spa_config locks. + */ + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) || + spa_config_held(spa, SCL_ALL, RW_WRITER)); + + for (int i = 0; i < spa->spa_alloc_count; i++) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; + metaslab_group_t *mg, *rotor; + + ASSERT0(avl_numnodes(&mca->mca_tree)); + ASSERT0(mca->mca_reserved); + + if ((mg = rotor = mca->mca_rotor) == NULL) + continue; + do { + metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; + vdev_t *vd = mg->mg_vd; + + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT(vd->vdev_mg == mg || vd->vdev_log_mg == mg); + ASSERT3P(mg->mg_class, ==, mc); + ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); + ASSERT0(zfs_refcount_count(&mga->mga_queue_depth)); + } while ((mg = mg->mg_next) != rotor); + } +#endif +} + +/* + * For each metaslab group in a class pre-calculate allocation quota and + * target queue depth to balance their space usage and write performance. 
+ * Based on those pre-calculate class allocation throttle threshold for + * optimal saturation. onsync is true once per TXG to enable/disable + * allocation throttling and update moving average of maximum I/O size. + */ +void +metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync) +{ + metaslab_group_t *mg, *first; /* * Must hold one of the spa_config locks. @@ -460,18 +514,168 @@ metaslab_class_validate(metaslab_class_t *mc) ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - if ((mg = mc->mc_allocator[0].mca_rotor) == NULL) - return (0); + if (onsync) + metaslab_class_validate(mc); + if (mc->mc_groups == 0) { + if (onsync) + mc->mc_alloc_throttle_enabled = B_FALSE; + mc->mc_alloc_max = UINT64_MAX; + return; + } + + if (onsync) { + /* + * Moving average of maximum allocation size, in absence of + * large allocations shrinking to 1/8 of metaslab_aliquot. + */ + mc->mc_alloc_io_size = (3 * mc->mc_alloc_io_size + + metaslab_aliquot / 8) / 4; + mc->mc_alloc_throttle_enabled = mc->mc_is_log ? 0 : + zio_dva_throttle_enabled; + } + + mg = first = mc->mc_allocator[0].mca_rotor; + uint64_t children = 0; do { - vd = mg->mg_vd; - ASSERT(vd->vdev_mg != NULL); - ASSERT3P(vd->vdev_top, ==, vd); - ASSERT3P(mg->mg_class, ==, mc); - ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); - } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor); + children += vdev_get_ndisks(mg->mg_vd) - + vdev_get_nparity(mg->mg_vd); + } while ((mg = mg->mg_next) != first); - return (0); + uint64_t sum_aliquot = 0; + do { + vdev_stat_t *vs = &mg->mg_vd->vdev_stat; + uint_t ratio; + + /* + * Scale allocations per iteration with average number of + * children. Wider vdevs need more sequential allocations + * to keep decent per-child I/O size. + */ + uint64_t mg_aliquot = MAX(metaslab_aliquot * children / + mc->mc_groups, mc->mc_alloc_io_size * 4); + + /* + * Scale allocations per iteration with the vdev capacity, + * relative to average. Bigger vdevs should get more to + * fill up at the same time as smaller ones. + */ + if (mc->mc_space > 0 && vs->vs_space > 0) { + ratio = vs->vs_space / (mc->mc_space / (mc->mc_groups * + 256) + 1); + mg_aliquot = mg_aliquot * ratio / 256; + } + + /* + * Scale allocations per iteration with the vdev's free space + * fraction, relative to average. Despite the above, vdevs free + * space fractions may get imbalanced, for example due to new + * vdev addition or different performance. We want free space + * fractions to be similar to postpone fragmentation. + * + * But same time we don't want to throttle vdevs still having + * plenty of free space, that appear faster than others, even + * if that cause temporary imbalance. Allow them to allocate + * more by keeping their allocation queue depth equivalent to + * 2.5 full iteration, even if they repeatedly drain it. Later + * with the free space reduction gradually reduce the target + * queue depth, stronger enforcing the free space balance. + */ + if (metaslab_bias_enabled && + mc->mc_space > 0 && vs->vs_space > 0) { + uint64_t vs_free = vs->vs_space > vs->vs_alloc ? + vs->vs_space - vs->vs_alloc : 0; + uint64_t mc_free = mc->mc_space > mc->mc_alloc ? + mc->mc_space - mc->mc_alloc : 0; + /* + * vs_fr is 16 bit fixed-point free space fraction. + * mc_fr is 8 bit fixed-point free space fraction. + * ratio as their quotient is 8 bit fixed-point. 
+ */ + uint_t vs_fr = vs_free / (vs->vs_space / 65536 + 1); + uint_t mc_fr = mc_free / (mc->mc_space / 256 + 1); + ratio = vs_fr / (mc_fr + 1); + mg->mg_aliquot = mg_aliquot * ratio / 256; + /* From 2.5x at 25% full to 1x at 75%. */ + ratio = MIN(163840, vs_fr * 3 + 16384); + mg->mg_queue_target = MAX(mg->mg_aliquot, + mg->mg_aliquot * ratio / 65536); + } else { + mg->mg_aliquot = mg_aliquot; + mg->mg_queue_target = mg->mg_aliquot * 2; + } + sum_aliquot += mg->mg_aliquot; + } while ((mg = mg->mg_next) != first); + + /* + * Set per-class allocation throttle threshold to 4 iterations through + * all the vdevs. This should keep all vdevs busy even if some are + * allocating more than we planned for them due to bigger blocks or + * better performance. + */ + mc->mc_alloc_max = sum_aliquot * 4; +} + +static void +metaslab_class_rotate(metaslab_group_t *mg, int allocator, uint64_t psize, + boolean_t success) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + + /* + * Exit fast if there is nothing to rotate, we are not following + * the rotor (copies, gangs, etc) or somebody already rotated it. + */ + if (mc->mc_groups < 2 || mca->mca_rotor != mg) + return; + + /* + * Always rotate in case of allocation error or a log class. + */ + if (!success || mc->mc_is_log) + goto rotate; + + /* + * Allocate from this group if we expect next I/O of the same size to + * mostly fit within the allocation quota. Rotate if we expect it to + * mostly go over the target queue depth. Meanwhile, to stripe between + * groups in configured amounts per child even if we can't reach the + * target queue depth, i.e. can't saturate the group write performance, + * always rotate after allocating the queue target bytes. + */ + uint64_t naq = atomic_add_64_nv(&mca->mca_aliquot, psize) + psize / 2; + if (naq < mg->mg_aliquot) + return; + if (naq >= mg->mg_queue_target) + goto rotate; + if (zfs_refcount_count(&mga->mga_queue_depth) + psize + psize / 2 >= + mg->mg_queue_target) + goto rotate; + + /* + * When the pool is not too busy, prefer restoring the vdev free space + * balance instead of getting maximum speed we might not need, so that + * we could have more flexibility during more busy times later. + */ + if (metaslab_perf_bias <= 0) + goto rotate; + if (metaslab_perf_bias >= 2) + return; + spa_t *spa = mc->mc_spa; + dsl_pool_t *dp = spa_get_dsl(spa); + if (dp == NULL) + return; + uint64_t busy_thresh = zfs_dirty_data_max * + (zfs_vdev_async_write_active_min_dirty_percent + + zfs_vdev_async_write_active_max_dirty_percent) / 200; + if (dp->dp_dirty_total > busy_thresh || spa_has_pending_synctask(spa)) + return; + +rotate: + mca->mca_rotor = mg->mg_next; + mca->mca_aliquot = 0; } static void @@ -640,7 +844,9 @@ void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; - hrtime_t now = gethrtime(); + uint64_t now = gethrestime_sec(); + /* Round delay up to next second. 
*/ + uint_t delay = (metaslab_unload_delay_ms + 999) / 1000; for (int i = 0; i < multilist_get_num_sublists(ml); i++) { multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); @@ -664,8 +870,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && - now > msp->ms_selected_time + - MSEC2NSEC(metaslab_unload_delay_ms) && + now > msp->ms_selected_time + delay && (msp->ms_allocator == -1 || !metaslab_preload_enabled)) { metaslab_evict(msp, txg); @@ -753,7 +958,7 @@ metaslab_group_alloc_update(metaslab_group_t *mg) was_allocatable = mg->mg_allocatable; was_initialized = mg->mg_initialized; - mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / + uint64_t free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); mutex_enter(&mc->mc_lock); @@ -783,7 +988,7 @@ metaslab_group_alloc_update(metaslab_group_t *mg) * fragmentation metric (i.e. a value between 0 and 100). */ mg->mg_allocatable = (mg->mg_activation_count > 0 && - mg->mg_free_capacity > zfs_mg_noalloc_threshold && + free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); @@ -851,7 +1056,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) for (int i = 0; i < allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; - zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); + zfs_refcount_create_tracked(&mga->mga_queue_depth); } return (mg); @@ -876,7 +1081,7 @@ metaslab_group_destroy(metaslab_group_t *mg) for (int i = 0; i < mg->mg_allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; - zfs_refcount_destroy(&mga->mga_alloc_queue_depth); + zfs_refcount_destroy(&mga->mga_queue_depth); } kmem_free(mg, offsetof(metaslab_group_t, mg_allocator[mg->mg_allocators])); @@ -898,8 +1103,6 @@ metaslab_group_activate(metaslab_group_t *mg) if (++mg->mg_activation_count <= 0) return; - mg->mg_aliquot = metaslab_aliquot * MAX(1, - vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd)); metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { @@ -916,6 +1119,7 @@ metaslab_group_activate(metaslab_group_t *mg) mc->mc_allocator[i].mca_rotor = mg; mg = mg->mg_next; } + metaslab_class_balance(mc, B_FALSE); } /* @@ -996,6 +1200,7 @@ metaslab_group_passivate(metaslab_group_t *mg) mg->mg_prev = NULL; mg->mg_next = NULL; + metaslab_class_balance(mc, B_FALSE); } boolean_t @@ -1214,127 +1419,6 @@ metaslab_group_fragmentation(metaslab_group_t *mg) return (fragmentation); } -/* - * Determine if a given metaslab group should skip allocations. A metaslab - * group should avoid allocations if its free capacity is less than the - * zfs_mg_noalloc_threshold or its fragmentation metric is greater than - * zfs_mg_fragmentation_threshold and there is at least one metaslab group - * that can still handle allocations. If the allocation throttle is enabled - * then we skip allocations to devices that have reached their maximum - * allocation queue depth unless the selected metaslab group is the only - * eligible group remaining. 
- */ -static boolean_t -metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, - int flags, uint64_t psize, int allocator, int d) -{ - spa_t *spa = mg->mg_vd->vdev_spa; - metaslab_class_t *mc = mg->mg_class; - - /* - * We can only consider skipping this metaslab group if it's - * in the normal metaslab class and there are other metaslab - * groups to select from. Otherwise, we always consider it eligible - * for allocations. - */ - if ((mc != spa_normal_class(spa) && - mc != spa_special_class(spa) && - mc != spa_dedup_class(spa)) || - mc->mc_groups <= 1) - return (B_TRUE); - - /* - * If the metaslab group's mg_allocatable flag is set (see comments - * in metaslab_group_alloc_update() for more information) and - * the allocation throttle is disabled then allow allocations to this - * device. However, if the allocation throttle is enabled then - * check if we have reached our allocation limit (mga_alloc_queue_depth) - * to determine if we should allow allocations to this metaslab group. - * If all metaslab groups are no longer considered allocatable - * (mc_alloc_groups == 0) or we're trying to allocate the smallest - * gang block size then we allow allocations on this metaslab group - * regardless of the mg_allocatable or throttle settings. - */ - if (mg->mg_allocatable) { - metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; - int64_t qdepth; - uint64_t qmax = mga->mga_cur_max_alloc_queue_depth; - - if (!mc->mc_alloc_throttle_enabled) - return (B_TRUE); - - /* - * If this metaslab group does not have any free space, then - * there is no point in looking further. - */ - if (mg->mg_no_free_space) - return (B_FALSE); - - /* - * Some allocations (e.g., those coming from device removal - * where the * allocations are not even counted in the - * metaslab * allocation queues) are allowed to bypass - * the throttle. - */ - if (flags & METASLAB_DONT_THROTTLE) - return (B_TRUE); - - /* - * Relax allocation throttling for ditto blocks. Due to - * random imbalances in allocation it tends to push copies - * to one vdev, that looks a bit better at the moment. - */ - qmax = qmax * (4 + d) / 4; - - qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth); - - /* - * If this metaslab group is below its qmax or it's - * the only allocatable metaslab group, then attempt - * to allocate from it. - */ - if (qdepth < qmax || mc->mc_alloc_groups == 1) - return (B_TRUE); - ASSERT3U(mc->mc_alloc_groups, >, 1); - - /* - * Since this metaslab group is at or over its qmax, we - * need to determine if there are metaslab groups after this - * one that might be able to handle this allocation. This is - * racy since we can't hold the locks for all metaslab - * groups at the same time when we make this check. - */ - for (metaslab_group_t *mgp = mg->mg_next; - mgp != rotor; mgp = mgp->mg_next) { - metaslab_group_allocator_t *mgap = - &mgp->mg_allocator[allocator]; - qmax = mgap->mga_cur_max_alloc_queue_depth; - qmax = qmax * (4 + d) / 4; - qdepth = - zfs_refcount_count(&mgap->mga_alloc_queue_depth); - - /* - * If there is another metaslab group that - * might be able to handle the allocation, then - * we return false so that we skip this group. - */ - if (qdepth < qmax && !mgp->mg_no_free_space) - return (B_FALSE); - } - - /* - * We didn't find another group to handle the allocation - * so we can't skip this metaslab group even though - * we are at or over our qmax. 
- */ - return (B_TRUE); - - } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { - return (B_TRUE); - } - return (B_FALSE); -} - /* * ========================================================================== * Range tree callbacks @@ -2615,15 +2699,15 @@ metaslab_unload(metaslab_t *msp) spa_t *spa = msp->ms_group->mg_vd->vdev_spa; zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, weight %llx, " - "selected txg %llu (%llu ms ago), alloc_txg %llu, " + "selected txg %llu (%llu s ago), alloc_txg %llu, " "loaded %llu ms ago, max_size %llu", (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_weight, (u_longlong_t)msp->ms_selected_txg, - (u_longlong_t)(msp->ms_unload_time - - msp->ms_selected_time) / 1000 / 1000, + (u_longlong_t)(NSEC2SEC(msp->ms_unload_time) - + msp->ms_selected_time), (u_longlong_t)msp->ms_alloc_txg, (u_longlong_t)(msp->ms_unload_time - msp->ms_load_time) / 1000 / 1000, @@ -2679,7 +2763,7 @@ metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); msp->ms_selected_txg = txg; - msp->ms_selected_time = gethrtime(); + msp->ms_selected_time = gethrestime_sec(); multilist_sublist_insert_tail(mls, msp); multilist_sublist_unlock(mls); } @@ -4607,11 +4691,10 @@ metaslab_trace_fini(zio_alloc_list_t *zal) */ static void -metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag, - int flags, int allocator) +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator, + int flags, uint64_t psize, const void *tag) { - if (!(flags & METASLAB_ASYNC_ALLOC) || - (flags & METASLAB_DONT_THROTTLE)) + if (!(flags & METASLAB_ASYNC_ALLOC)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; @@ -4619,33 +4702,14 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag, return; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; - (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag); -} - -static void -metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) -{ - metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; - metaslab_class_allocator_t *mca = - &mg->mg_class->mc_allocator[allocator]; - uint64_t max = mg->mg_max_alloc_queue_depth; - uint64_t cur = mga->mga_cur_max_alloc_queue_depth; - while (cur < max) { - if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, - cur, cur + 1) == cur) { - atomic_inc_64(&mca->mca_alloc_max_slots); - return; - } - cur = mga->mga_cur_max_alloc_queue_depth; - } + (void) zfs_refcount_add_many(&mga->mga_queue_depth, psize, tag); } void -metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag, - int flags, int allocator, boolean_t io_complete) +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, int allocator, + int flags, uint64_t psize, const void *tag) { - if (!(flags & METASLAB_ASYNC_ALLOC) || - (flags & METASLAB_DONT_THROTTLE)) + if (!(flags & METASLAB_ASYNC_ALLOC)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; @@ -4653,26 +4717,7 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag, return; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; - (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag); - if (io_complete) - metaslab_group_increment_qdepth(mg, allocator); -} - -void -metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag, - 
int allocator) -{ -#ifdef ZFS_DEBUG - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - - for (int d = 0; d < ndvas; d++) { - uint64_t vdev = DVA_GET_VDEV(&dva[d]); - metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; - VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag)); - } -#endif + (void) zfs_refcount_remove_many(&mga->mga_queue_depth, psize, tag); } static uint64_t @@ -4731,7 +4776,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, - dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, + dva_t *dva, int d, uint64_t asize, int allocator, boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) { @@ -4775,13 +4820,14 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; - for (i = 0; i < d; i++) { - if (want_unique && - !metaslab_is_unique(msp, &dva[i])) - break; /* try another metaslab */ + if (!try_hard) { + for (i = 0; i < d; i++) { + if (!metaslab_is_unique(msp, &dva[i])) + break; /* try another metaslab */ + } + if (i == d) + break; } - if (i == d) - break; } if (msp != NULL) { @@ -4829,9 +4875,9 @@ metaslab_active_mask_verify(metaslab_t *msp) } static uint64_t -metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, - int allocator, boolean_t try_hard) +metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, + uint64_t asize, uint64_t txg, dva_t *dva, int d, int allocator, + boolean_t try_hard) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; @@ -4907,15 +4953,13 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, - want_unique, asize, allocator, try_hard, zal, - search, &was_active); + asize, allocator, try_hard, zal, search, + &was_active); } mutex_exit(&mg->mg_lock); - if (msp == NULL) { - kmem_free(search, sizeof (*search)); - return (-1ULL); - } + if (msp == NULL) + break; mutex_enter(&msp->ms_lock); metaslab_active_mask_verify(msp); @@ -5056,6 +5100,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, /* Proactively passivate the metaslab, if needed */ if (activated) metaslab_segment_may_passivate(msp); + mutex_exit(&msp->ms_lock); break; } next: @@ -5129,46 +5174,73 @@ next: mutex_exit(&msp->ms_lock); } - mutex_exit(&msp->ms_lock); kmem_free(search, sizeof (*search)); - return (offset); -} -static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, - int allocator, boolean_t try_hard) -{ - uint64_t offset; - - offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, - dva, d, allocator, try_hard); - - mutex_enter(&mg->mg_lock); if (offset == -1ULL) { - mg->mg_failed_allocations++; metaslab_trace_add(zal, mg, NULL, asize, d, TRACE_GROUP_FAILURE, allocator); - if (asize == SPA_GANGBLOCKSIZE) { + if (asize <= vdev_get_min_alloc(mg->mg_vd)) { /* * This metaslab group was unable to allocate - * the minimum gang block size so it must be out of - * space. 
We must notify the allocation throttle - * to start skipping allocation attempts to this - * metaslab group until more space becomes available. - * Note: this failure cannot be caused by the - * allocation throttle since the allocation throttle - * is only responsible for skipping devices and - * not failing block allocations. + * the minimum block size so it must be out of + * space. Notify the allocation throttle to + * skip allocation attempts to this group until + * more space becomes available. */ mg->mg_no_free_space = B_TRUE; } } - mg->mg_allocations++; - mutex_exit(&mg->mg_lock); return (offset); } +static boolean_t +metaslab_group_allocatable(spa_t *spa, metaslab_group_t *mg, uint64_t psize, + int d, int flags, boolean_t try_hard, zio_alloc_list_t *zal, int allocator) +{ + metaslab_class_t *mc = mg->mg_class; + vdev_t *vd = mg->mg_vd; + boolean_t allocatable; + + /* + * Don't allocate from faulted devices. + */ + if (try_hard) + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + allocatable = vdev_allocatable(vd); + if (try_hard) + spa_config_exit(spa, SCL_ZIO, FTAG); + if (!allocatable) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_NOT_ALLOCATABLE, allocator); + return (B_FALSE); + } + + if (!try_hard) { + /* + * Avoid vdevs with too little space or too fragmented. + */ + if (!GANG_ALLOCATION(flags) && (mg->mg_no_free_space || + (!mg->mg_allocatable && mc->mc_alloc_groups > 0))) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_NOT_ALLOCATABLE, allocator); + return (B_FALSE); + } + + /* + * Avoid writing single-copy data to an unhealthy, + * non-redundant vdev. + */ + if (d == 0 && vd->vdev_state < VDEV_STATE_HEALTHY && + vd->vdev_children == 0) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_VDEV_ERROR, allocator); + return (B_FALSE); + } + } + + return (B_TRUE); +} + /* * Allocate a block for the specified i/o. */ @@ -5178,7 +5250,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, zio_alloc_list_t *zal, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; - metaslab_group_t *mg, *rotor; + metaslab_group_t *mg = NULL, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; @@ -5217,170 +5289,50 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * only two adjacent top-level vdev failures will result in data loss. * * If we are doing gang blocks (hintdva is non-NULL), try to keep - * ourselves on the same vdev as our gang block header. That - * way, we can hope for locality in vdev_cache, plus it makes our + * ourselves on the same vdev as our gang block header. It makes our * fault domains something tractable. */ - if (hintdva) { + if (hintdva && DVA_IS_VALID(&hintdva[d])) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - - /* - * It's possible the vdev we're using as the hint no - * longer exists or its mg has been closed (e.g. by - * device removal). Consult the rotor when - * all else fails. 
- */ - if (vd != NULL && vd->vdev_mg != NULL) { - mg = vdev_get_mg(vd, mc); - - if (flags & METASLAB_HINTBP_AVOID) - mg = mg->mg_next; - } else { - mg = mca->mca_rotor; - } - } else if (d != 0) { + mg = vdev_get_mg(vd, mc); + } + if (mg == NULL && d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); - mg = vd->vdev_mg->mg_next; - } else { + mg = vdev_get_mg(vd, mc)->mg_next; + } + if (mg == NULL || mg->mg_class != mc || mg->mg_activation_count <= 0) { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; } - /* - * If the hint put us into the wrong metaslab class, or into a - * metaslab group that has been passivated, just follow the rotor. - */ - if (mg->mg_class != mc || mg->mg_activation_count <= 0) - mg = mca->mca_rotor; - rotor = mg; top: do { - boolean_t allocatable; - ASSERT(mg->mg_activation_count == 1); - vd = mg->mg_vd; - - /* - * Don't allocate from faulted devices. - */ - if (try_hard) { - spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); - allocatable = vdev_allocatable(vd); - spa_config_exit(spa, SCL_ZIO, FTAG); - } else { - allocatable = vdev_allocatable(vd); - } - - /* - * Determine if the selected metaslab group is eligible - * for allocations. If we're ganging then don't allow - * this metaslab group to skip allocations since that would - * inadvertently return ENOSPC and suspend the pool - * even though space is still available. - */ - if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { - allocatable = metaslab_group_allocatable(mg, rotor, - flags, psize, allocator, d); - } - - if (!allocatable) { - metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_NOT_ALLOCATABLE, allocator); - goto next; - } - - /* - * Avoid writing single-copy data to an unhealthy, - * non-redundant vdev, unless we've already tried all - * other vdevs. - */ - if (vd->vdev_state < VDEV_STATE_HEALTHY && - d == 0 && !try_hard && vd->vdev_children == 0) { - metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_VDEV_ERROR, allocator); - goto next; - } - ASSERT(mg->mg_class == mc); + if (!metaslab_group_allocatable(spa, mg, psize, d, flags, + try_hard, zal, allocator)) + goto next; + + vd = mg->mg_vd; uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - - /* - * If we don't need to try hard, then require that the - * block be on a different metaslab from any other DVAs - * in this BP (unique=true). If we are trying hard, then - * allow any metaslab to be used (unique=false). - */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - !try_hard, dva, d, allocator, try_hard); + dva, d, allocator, try_hard); if (offset != -1ULL) { - /* - * If we've just selected this metaslab group, - * figure out whether the corresponding vdev is - * over- or under-used relative to the pool, - * and set an allocation bias to even it out. - * - * Bias is also used to compensate for unequally - * sized vdevs so that space is allocated fairly. - */ - if (mca->mca_aliquot == 0 && metaslab_bias_enabled) { - vdev_stat_t *vs = &vd->vdev_stat; - int64_t vs_free = vs->vs_space - vs->vs_alloc; - int64_t mc_free = mc->mc_space - mc->mc_alloc; - int64_t ratio; - - /* - * Calculate how much more or less we should - * try to allocate from this device during - * this iteration around the rotor. - * - * This basically introduces a zero-centered - * bias towards the devices with the most - * free space, while compensating for vdev - * size differences. 
- * - * Examples: - * vdev V1 = 16M/128M - * vdev V2 = 16M/128M - * ratio(V1) = 100% ratio(V2) = 100% - * - * vdev V1 = 16M/128M - * vdev V2 = 64M/128M - * ratio(V1) = 127% ratio(V2) = 72% - * - * vdev V1 = 16M/128M - * vdev V2 = 64M/512M - * ratio(V1) = 40% ratio(V2) = 160% - */ - ratio = (vs_free * mc->mc_alloc_groups * 100) / - (mc_free + 1); - mg->mg_bias = ((ratio - 100) * - (int64_t)mg->mg_aliquot) / 100; - } else if (!metaslab_bias_enabled) { - mg->mg_bias = 0; - } - - if ((flags & METASLAB_ZIL) || - atomic_add_64_nv(&mca->mca_aliquot, asize) >= - mg->mg_aliquot + mg->mg_bias) { - mca->mca_rotor = mg->mg_next; - mca->mca_aliquot = 0; - } + metaslab_class_rotate(mg, allocator, psize, B_TRUE); DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); DVA_SET_GANG(&dva[d], ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); DVA_SET_ASIZE(&dva[d], asize); - return (0); } next: - mca->mca_rotor = mg->mg_next; - mca->mca_aliquot = 0; + metaslab_class_rotate(mg, allocator, psize, B_FALSE); } while ((mg = mg->mg_next) != rotor); /* @@ -5388,7 +5340,7 @@ next: */ if (!try_hard && (zfs_metaslab_try_hard_before_gang || GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 || - psize <= 1 << spa->spa_min_ashift)) { + psize <= spa->spa_min_alloc)) { METASLABSTAT_BUMP(metaslabstat_try_hard); try_hard = B_TRUE; goto top; @@ -5700,41 +5652,44 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) * the reservation. */ boolean_t -metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, - zio_t *zio, int flags) +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, + boolean_t must, boolean_t *more) { - metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; - uint64_t max = mca->mca_alloc_max_slots; + metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) || - zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) { + if (mc->mc_alloc_io_size < zio->io_size) { + mc->mc_alloc_io_size = zio->io_size; + metaslab_class_balance(mc, B_FALSE); + } + if (must || mca->mca_reserved <= mc->mc_alloc_max) { /* - * The potential race between _count() and _add() is covered - * by the allocator lock in most cases, or irrelevant due to - * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others. + * The potential race between compare and add is covered by the + * allocator lock in most cases, or irrelevant due to must set. * But even if we assume some other non-existing scenario, the * worst that can happen is few more I/Os get to allocation * earlier, that is not a problem. - * - * We reserve the slots individually so that we can unreserve - * them individually when an I/O completes. 
*/ - zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio); + int64_t delta = slots * zio->io_size; + *more = (atomic_add_64_nv(&mca->mca_reserved, delta) <= + mc->mc_alloc_max); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } + *more = B_FALSE; return (B_FALSE); } -void +boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, - int allocator, zio_t *zio) + zio_t *zio) { - metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; + metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio); + int64_t delta = slots * zio->io_size; + return (atomic_add_64_nv(&mca->mca_reserved, -delta) <= + mc->mc_alloc_max); } static int @@ -5876,7 +5831,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, - zio_alloc_list_t *zal, zio_t *zio, int allocator) + zio_alloc_list_t *zal, int allocator, const void *tag) { dva_t *dva = bp->blk_dva; dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; @@ -5905,8 +5860,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, - DVA_GET_VDEV(&dva[d]), zio, flags, - allocator, B_FALSE); + DVA_GET_VDEV(&dva[d]), allocator, flags, + psize, tag); memset(&dva[d], 0, sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -5917,7 +5872,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, - DVA_GET_VDEV(&dva[d]), zio, flags, allocator); + DVA_GET_VDEV(&dva[d]), allocator, flags, psize, + tag); } } ASSERT(error == 0); @@ -6280,7 +6236,10 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, "Prefer metaslabs with lower LBAs"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW, - "Enable metaslab group biasing"); + "Enable space-based metaslab group biasing"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, perf_bias, INT, ZMOD_RW, + "Enable performance-based metaslab group biasing"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, ZMOD_RW, "Enable segment-based metaslab selection"); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 093456347..cbe903030 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1686,11 +1686,11 @@ spa_activate(spa_t *spa, spa_mode_t mode) spa->spa_mode = mode; spa->spa_read_spacemaps = spa_mode_readable_spacemaps; - spa->spa_normal_class = metaslab_class_create(spa, msp); - spa->spa_log_class = metaslab_class_create(spa, msp); - spa->spa_embedded_log_class = metaslab_class_create(spa, msp); - spa->spa_special_class = metaslab_class_create(spa, msp); - spa->spa_dedup_class = metaslab_class_create(spa, msp); + spa->spa_normal_class = metaslab_class_create(spa, msp, B_FALSE); + spa->spa_log_class = metaslab_class_create(spa, msp, B_TRUE); + spa->spa_embedded_log_class = metaslab_class_create(spa, msp, B_TRUE); + spa->spa_special_class = metaslab_class_create(spa, msp, B_FALSE); + spa->spa_dedup_class = metaslab_class_create(spa, msp, B_FALSE); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); @@ -9883,60 +9883,9 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) { ASSERT(spa_writeable(spa)); - vdev_t *rvd = 
spa->spa_root_vdev; - uint32_t max_queue_depth = zfs_vdev_async_write_max_active * - zfs_vdev_queue_depth_pct / 100; - metaslab_class_t *normal = spa_normal_class(spa); - metaslab_class_t *special = spa_special_class(spa); - metaslab_class_t *dedup = spa_dedup_class(spa); - - uint64_t slots_per_allocator = 0; - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - - metaslab_group_t *mg = tvd->vdev_mg; - if (mg == NULL || !metaslab_group_initialized(mg)) - continue; - - metaslab_class_t *mc = mg->mg_class; - if (mc != normal && mc != special && mc != dedup) - continue; - - /* - * It is safe to do a lock-free check here because only async - * allocations look at mg_max_alloc_queue_depth, and async - * allocations all happen from spa_sync(). - */ - for (int i = 0; i < mg->mg_allocators; i++) { - ASSERT0(zfs_refcount_count( - &(mg->mg_allocator[i].mga_alloc_queue_depth))); - } - mg->mg_max_alloc_queue_depth = max_queue_depth; - - for (int i = 0; i < mg->mg_allocators; i++) { - mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = - zfs_vdev_def_queue_depth; - } - slots_per_allocator += zfs_vdev_def_queue_depth; - } - - for (int i = 0; i < spa->spa_alloc_count; i++) { - ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. - mca_alloc_slots)); - ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. - mca_alloc_slots)); - ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. - mca_alloc_slots)); - normal->mc_allocator[i].mca_alloc_max_slots = - slots_per_allocator; - special->mc_allocator[i].mca_alloc_max_slots = - slots_per_allocator; - dedup->mc_allocator[i].mca_alloc_max_slots = - slots_per_allocator; - } - normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + metaslab_class_balance(spa_normal_class(spa), B_TRUE); + metaslab_class_balance(spa_special_class(spa), B_TRUE); + metaslab_class_balance(spa_dedup_class(spa), B_TRUE); } static void @@ -10156,12 +10105,6 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; - for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_allocs[i].spaa_lock); - VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); - mutex_exit(&spa->spa_allocs[i].spaa_lock); - } - /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. @@ -10274,12 +10217,6 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); - for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_allocs[i].spaa_lock); - VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); - mutex_exit(&spa->spa_allocs[i].spaa_lock); - } - /* * Update usable space statistics. 
*/ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 3d98a0449..f2b029778 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -759,14 +759,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_alloc_count = MAX(MIN(spa_num_allocators, boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1); - spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * - sizeof (spa_alloc_t), KM_SLEEP); - for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, - NULL); - avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); - } if (spa->spa_alloc_count > 1) { spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t, sau_inuse[spa->spa_alloc_count]), KM_SLEEP); @@ -862,12 +854,6 @@ spa_remove(spa_t *spa) kmem_free(dp, sizeof (spa_config_dirent_t)); } - for (int i = 0; i < spa->spa_alloc_count; i++) { - avl_destroy(&spa->spa_allocs[i].spaa_tree); - mutex_destroy(&spa->spa_allocs[i].spaa_lock); - } - kmem_free(spa->spa_allocs, spa->spa_alloc_count * - sizeof (spa_alloc_t)); if (spa->spa_alloc_count > 1) { mutex_destroy(&spa->spa_allocs_use->sau_lock); kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t, @@ -1318,11 +1304,11 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, /* * Verify the metaslab classes. */ - ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); + metaslab_class_validate(spa_normal_class(spa)); + metaslab_class_validate(spa_log_class(spa)); + metaslab_class_validate(spa_embedded_log_class(spa)); + metaslab_class_validate(spa_special_class(spa)); + metaslab_class_validate(spa_dedup_class(spa)); spa_config_exit(spa, SCL_ALL, spa); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 393fb9515..aa41f7066 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -149,7 +149,7 @@ static uint_t zfs_vdev_sync_write_max_active = 10; static uint_t zfs_vdev_async_read_min_active = 1; /* */ uint_t zfs_vdev_async_read_max_active = 3; static uint_t zfs_vdev_async_write_min_active = 2; -/* */ uint_t zfs_vdev_async_write_max_active = 10; +static uint_t zfs_vdev_async_write_max_active = 10; static uint_t zfs_vdev_scrub_min_active = 1; static uint_t zfs_vdev_scrub_max_active = 3; static uint_t zfs_vdev_removal_min_active = 1; @@ -204,31 +204,6 @@ static uint_t zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; static uint_t zfs_vdev_read_gap_limit = 32 << 10; static uint_t zfs_vdev_write_gap_limit = 4 << 10; -/* - * Define the queue depth percentage for each top-level. This percentage is - * used in conjunction with zfs_vdev_async_max_active to determine how many - * allocations a specific top-level vdev should handle. Once the queue depth - * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 - * then allocator will stop allocating blocks on that top-level device. - * The default kernel setting is 1000% which will yield 100 allocations per - * device. For userland testing, the default setting is 300% which equates - * to 30 allocations per device. 
- */ -#ifdef _KERNEL -uint_t zfs_vdev_queue_depth_pct = 1000; -#else -uint_t zfs_vdev_queue_depth_pct = 300; -#endif - -/* - * When performing allocations for a given metaslab, we want to make sure that - * there are enough IOs to aggregate together to improve throughput. We want to - * ensure that there are at least 128k worth of IOs that can be aggregated, and - * we assume that the average allocation size is 4k, so we need the queue depth - * to be 32 per allocator to get good aggregation of sequential writes. - */ -uint_t zfs_vdev_def_queue_depth = 32; - static int vdev_queue_offset_compare(const void *x1, const void *x2) { @@ -1168,9 +1143,3 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, UINT, ZMOD_RW, "Number of non-interactive I/Os before _max_active"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW, - "Queue depth percentage for each top-level vdev"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW, - "Default queue depth for each allocator"); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 90d25555d..a94ac9e60 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1172,10 +1172,10 @@ spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs, if (mc->mc_groups == 0) mc = spa_normal_class(spa); int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, - METASLAB_DONT_THROTTLE, zal, 0); + 0, zal, 0); if (error == ENOSPC && mc != spa_normal_class(spa)) { error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, - &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0); + &dst, 0, NULL, txg, 0, zal, 0); } if (error != 0) return (error); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 63f57cf26..5d39d28a7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3134,8 +3134,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; - uint64_t lsize; - int copies = gio->io_prop.zp_copies; + uint64_t psize; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); @@ -3150,47 +3149,18 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP); ASSERT(ZIO_HAS_ALLOCATOR(pio)); - int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; + int flags = METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; - VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator]. - mca_alloc_slots, pio)); - - /* - * The logical zio has already placed a reservation for - * 'copies' allocation slots but gang blocks may require - * additional copies. These additional copies - * (i.e. gbh_copies - copies) are guaranteed to succeed - * since metaslab_class_throttle_reserve() always allows - * additional reservations for gang blocks. - */ - ASSERT3U(gbh_copies, >=, copies); - VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, - pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio, pio->io_allocator); + &pio->io_alloc_list, pio->io_allocator, pio); if (error) { - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(has_data); - - /* - * If we failed to allocate the gang block header then - * we remove any additional allocation reservations that - * we placed here. The original reservation will - * be removed when the logical I/O goes to the ready - * stage. - */ - metaslab_class_throttle_unreserve(mc, - gbh_copies - copies, pio->io_allocator, pio); - } - pio->io_error = error; return (pio); } @@ -3215,14 +3185,20 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_gang_inherit_allocator(pio, zio); + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + boolean_t more; + VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies, + zio, B_TRUE, &more)); + } /* * Create and nowait the gang children. */ - for (int g = 0; resid != 0; resid -= lsize, g++) { - lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), - SPA_MINBLOCKSIZE); - ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); + for (int g = 0; resid != 0; resid -= psize, g++) { + psize = zio_roundup_alloc_size(spa, + resid / (SPA_GBH_NBLKPTRS - g)); + psize = MIN(resid, psize); + ASSERT3U(psize, >=, SPA_MINBLOCKSIZE); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; @@ -3243,25 +3219,20 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - - resid) : NULL, lsize, lsize, &zp, + resid) : NULL, psize, psize, &zp, zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_gang_inherit_allocator(zio, cio); + /* + * We do not reserve for the child writes, since we already + * reserved for the parent. Unreserve though will be called + * for individual children. We can do this since sum of all + * child's physical sizes is equal to parent's physical size. + * It would not work for potentially bigger allocation sizes. + */ - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(has_data); - - /* - * Gang children won't throttle but we should - * account for their work, so reserve an allocation - * slot for them here. - */ - VERIFY(metaslab_class_throttle_reserve(mc, - zp.zp_copies, cio->io_allocator, cio, flags)); - } zio_nowait(cio); } @@ -4029,15 +4000,17 @@ zio_ddt_free(zio_t *zio) */ static zio_t * -zio_io_to_allocate(spa_t *spa, int allocator) +zio_io_to_allocate(metaslab_class_allocator_t *mca, boolean_t *more) { zio_t *zio; - ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock)); + ASSERT(MUTEX_HELD(&mca->mca_lock)); - zio = avl_first(&spa->spa_allocs[allocator].spaa_tree); - if (zio == NULL) + zio = avl_first(&mca->mca_tree); + if (zio == NULL) { + *more = B_FALSE; return (NULL); + } ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(ZIO_HAS_ALLOCATOR(zio)); @@ -4046,15 +4019,16 @@ zio_io_to_allocate(spa_t *spa, int allocator) * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. 
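The rewritten child-sizing loop above spreads the remaining residual evenly across the gang header's still-unused block pointers and rounds each piece up to an allocatable size, clamping to whatever is left; that is also why the new comment can rely on the children's physical sizes summing to the parent's. A self-contained sketch under assumed constants (3 block pointers per gang header, a 4 KiB minimum allocation, and a simplified stand-in for zio_roundup_alloc_size()):

#include <stdio.h>
#include <stdint.h>

#define	GBH_NBLKPTRS	3	/* assumed blkptrs per gang header */
#define	MIN_ALLOC	4096	/* assumed minimum allocation size */

/* Simplified stand-in for zio_roundup_alloc_size(): next MIN_ALLOC multiple. */
static uint64_t
roundup_alloc(uint64_t size)
{
	return (((size + MIN_ALLOC - 1) / MIN_ALLOC) * MIN_ALLOC);
}

int
main(void)
{
	uint64_t resid = 1052672;	/* example residual: 1 MiB + 4 KiB */
	uint64_t psize;

	for (int g = 0; resid != 0; resid -= psize, g++) {
		psize = roundup_alloc(resid / (GBH_NBLKPTRS - g));
		if (psize > resid)	/* same clamp as MIN(resid, psize) */
			psize = resid;
		printf("child %d: %llu bytes\n", g, (unsigned long long)psize);
	}
	return (0);
}

Run as-is this prints children of 352256, 352256 and 348160 bytes, which sum back to the 1052672-byte residual, matching the "sum of all children's physical sizes equals the parent's" property the reservation comment depends on.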
*/ - ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, - zio->io_prop.zp_copies, allocator, zio, 0)) { + zio->io_prop.zp_copies, zio, B_FALSE, more)) { return (NULL); } - avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio); + avl_remove(&mca->mca_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); + if (avl_is_empty(&mca->mca_tree)) + *more = B_FALSE; return (zio); } @@ -4064,9 +4038,14 @@ zio_dva_throttle(zio_t *zio) spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; + boolean_t more; - /* locate an appropriate allocation class */ - mc = spa_preferred_class(spa, zio); + /* + * If not already chosen, choose an appropriate allocation class. + */ + mc = zio->io_metaslab_class; + if (mc == NULL) + mc = spa_preferred_class(spa, zio); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || @@ -4081,29 +4060,33 @@ zio_dva_throttle(zio_t *zio) ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - int allocator = zio->io_allocator; zio->io_metaslab_class = mc; - mutex_enter(&spa->spa_allocs[allocator].spaa_lock); - avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); - nio = zio_io_to_allocate(spa, allocator); - mutex_exit(&spa->spa_allocs[allocator].spaa_lock); + metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; + mutex_enter(&mca->mca_lock); + avl_add(&mca->mca_tree, zio); + nio = zio_io_to_allocate(mca, &more); + mutex_exit(&mca->mca_lock); return (nio); } static void -zio_allocate_dispatch(spa_t *spa, int allocator) +zio_allocate_dispatch(metaslab_class_t *mc, int allocator) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; zio_t *zio; + boolean_t more; - mutex_enter(&spa->spa_allocs[allocator].spaa_lock); - zio = zio_io_to_allocate(spa, allocator); - mutex_exit(&spa->spa_allocs[allocator].spaa_lock); - if (zio == NULL) - return; + do { + mutex_enter(&mca->mca_lock); + zio = zio_io_to_allocate(mca, &more); + mutex_exit(&mca->mca_lock); + if (zio == NULL) + return; - ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); - ASSERT0(zio->io_error); - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); + ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); + ASSERT0(zio->io_error); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); + } while (more); } static zio_t * @@ -4126,15 +4109,13 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - if (zio->io_flags & ZIO_FLAG_NODATA) - flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* - * if not already chosen, locate an appropriate allocation class + * If not already chosen, choose an appropriate allocation class. */ mc = zio->io_metaslab_class; if (mc == NULL) { @@ -4143,6 +4124,7 @@ zio_dva_allocate(zio_t *zio) } ZIOSTAT_BUMP(ziostat_total_allocations); +again: /* * Try allocating the block in the usual metaslab class. * If that's full, allocate it in the normal class. 
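The dispatch side now drains its per-allocator queue in a loop, admitting waiters only while the byte reservation has room and stopping as soon as zio_io_to_allocate() reports there is nothing more to admit. A toy, single-threaded model of that reserve-then-pop pattern (the queue, budget and names below are illustrative rather than the ZFS API; the real code serializes this under mca_lock and keeps the waiters in an AVL tree):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	BUDGET	(1024 * 1024)		/* assumed cap on reserved bytes */

static uint64_t reserved;		/* models mca_reserved */
static uint64_t pending[8] = { 300000, 300000, 300000, 300000 };
static int npending = 4;		/* models the per-allocator tree */

/* Try to admit the head of the queue; clear *more when the caller must stop. */
static bool
admit_one(bool *more)
{
	if (npending == 0 || reserved + pending[0] > BUDGET) {
		*more = false;
		return (false);
	}
	reserved += pending[0];
	printf("dispatch %llu bytes (reserved now %llu)\n",
	    (unsigned long long)pending[0], (unsigned long long)reserved);
	for (int i = 1; i < npending; i++)	/* pop the head */
		pending[i - 1] = pending[i];
	npending--;
	*more = (npending != 0);
	return (true);
}

int
main(void)
{
	bool more = true;

	/* Drain like zio_allocate_dispatch(): keep going while there is room. */
	while (more && admit_one(&more))
		;
	return (0);
}

With a 1 MiB budget this admits three 300000-byte waiters and leaves the fourth queued until a later unreserve frees room, which is the behaviour the new do/while (more) loop in zio_allocate_dispatch() is built around.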
@@ -4157,7 +4139,7 @@ zio_dva_allocate(zio_t *zio) ASSERT(ZIO_HAS_ALLOCATOR(zio)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, - &zio->io_alloc_list, zio, zio->io_allocator); + &zio->io_alloc_list, zio->io_allocator, zio); /* * Fallback to normal class when an alloc class is full @@ -4184,36 +4166,42 @@ zio_dva_allocate(zio_t *zio) } /* - * If throttling, transfer reservation over to normal class. - * The io_allocator slot can remain the same even though we - * are switching classes. + * If we are holding old class reservation, drop it. + * Dispatch the next ZIO(s) there if some are waiting. */ - if (mc->mc_alloc_throttle_enabled && - (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { - metaslab_class_throttle_unreserve(mc, - zio->io_prop.zp_copies, zio->io_allocator, zio); + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + if (metaslab_class_throttle_unreserve(mc, + zio->io_prop.zp_copies, zio)) { + zio_allocate_dispatch(zio->io_metaslab_class, + zio->io_allocator); + } zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; - - VERIFY(metaslab_class_throttle_reserve( - spa_normal_class(spa), - zio->io_prop.zp_copies, zio->io_allocator, zio, - flags | METASLAB_MUST_RESERVE)); } - zio->io_metaslab_class = mc = spa_normal_class(spa); + if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying normal class: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } - + zio->io_metaslab_class = mc = spa_normal_class(spa); ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); - error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, flags, - &zio->io_alloc_list, zio, zio->io_allocator); + + /* + * If normal class uses throttling, return to that pipeline + * stage. Otherwise just do another allocation attempt. + */ + if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && + mc->mc_alloc_throttle_enabled && + zio->io_child_type != ZIO_CHILD_GANG && + !(zio->io_flags & ZIO_FLAG_NODATA)) { + zio->io_stage = ZIO_STAGE_DVA_THROTTLE >> 1; + return (zio); + } + goto again; } - if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { + if (error == ENOSPC && zio->io_size > spa->spa_min_alloc) { if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying ganging: zio %px, size %llu, error %d", @@ -4316,18 +4304,18 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, % spa->spa_alloc_count; ZIOSTAT_BUMP(ziostat_total_allocations); error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, NULL, flags, &io_alloc_list, NULL, allocator); + txg, NULL, flags, &io_alloc_list, allocator, NULL); *slog = (error == 0); if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, - new_bp, 1, txg, NULL, flags, - &io_alloc_list, NULL, allocator); + new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, + NULL); } if (error != 0) { ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, NULL, flags, - &io_alloc_list, NULL, allocator); + new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, + NULL); } metaslab_trace_fini(&io_alloc_list); @@ -5155,10 +5143,12 @@ zio_ready(zio_t *zio) * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. 
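The class-fallback path above re-enters the allocation throttle by parking io_stage one bit below ZIO_STAGE_DVA_THROTTLE. That works because pipeline stages are one-hot bits and the executor advances a zio by shifting its current stage left until it finds a bit set in io_pipeline; the sketch below is a simplified rendering of that advance (from memory, not the actual zio_execute() code) and assumes DVA_THROTTLE is part of the pipeline on this path:

#include <stdint.h>

/* Simplified stage advance: step past cur_stage to the next pipeline bit. */
static uint32_t
next_stage(uint32_t cur_stage, uint32_t pipeline)
{
	uint32_t stage = cur_stage;

	do {
		stage <<= 1;			/* next stage bit */
	} while ((stage & pipeline) == 0);	/* skip stages not in pipeline */

	return (stage);
}

With cur_stage set to ZIO_STAGE_DVA_THROTTLE >> 1, the first shift lands exactly on ZIO_STAGE_DVA_THROTTLE, so the zio is queued through the normal class's per-allocator tree and reservation before the allocation is retried, instead of looping straight back to metaslab_alloc() the way the unthrottled goto again path does.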
*/ - metaslab_class_throttle_unreserve( + if (metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, - zio->io_allocator, zio); - zio_allocate_dispatch(zio->io_spa, zio->io_allocator); + zio)) { + zio_allocate_dispatch(zio->io_metaslab_class, + zio->io_allocator); + } } } @@ -5201,10 +5191,10 @@ zio_ready(zio_t *zio) static void zio_dva_throttle_done(zio_t *zio) { - zio_t *lio __maybe_unused = zio->io_logical; zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; + const void *tag = pio; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); @@ -5215,48 +5205,33 @@ zio_dva_throttle_done(zio_t *zio) ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); - ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); - ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); /* - * Parents of gang children can have two flavors -- ones that - * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) - * and ones that allocated the constituent blocks. The allocation - * throttle needs to know the allocating parent zio so we must find - * it here. + * Parents of gang children can have two flavors -- ones that allocated + * the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that + * allocated the constituent blocks. The first use their parent as tag. */ - if (pio->io_child_type == ZIO_CHILD_GANG) { - /* - * If our parent is a rewrite gang child then our grandparent - * would have been the one that performed the allocation. - */ - if (pio->io_flags & ZIO_FLAG_IO_REWRITE) - pio = zio_unique_parent(pio); - flags |= METASLAB_GANG_CHILD; - } + if (pio->io_child_type == ZIO_CHILD_GANG && + (pio->io_flags & ZIO_FLAG_IO_REWRITE)) + tag = zio_unique_parent(pio); - ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG && + (pio->io_flags & ZIO_FLAG_IO_REWRITE))); ASSERT(ZIO_HAS_ALLOCATOR(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); ASSERT(zio->io_metaslab_class != NULL); + ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); - mutex_enter(&pio->io_lock); - metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, - pio->io_allocator, B_TRUE); - mutex_exit(&pio->io_lock); + metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, + pio->io_allocator, flags, pio->io_size, tag); - metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, - pio->io_allocator, pio); - - /* - * Call into the pipeline to see if there is more work that - * needs to be done. If there is work to be done it will be - * dispatched to another taskq thread. - */ - zio_allocate_dispatch(zio->io_spa, pio->io_allocator); + if (metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio)) { + zio_allocate_dispatch(zio->io_metaslab_class, + pio->io_allocator); + } } static zio_t * @@ -5285,28 +5260,8 @@ zio_done(zio_t *zio) * by the logical I/O but the actual write is done by child I/Os. 
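The tag selection above matters because the group's queue-depth accounting is a holder-tagged refcount: whatever pointer was named when the depth was charged at allocation time must be named again to drain it, and for a parent that rewrote the gang header that pointer is its own parent. A minimal toy of holder-tagged accounting, using invented tc_* names rather than the zfs_refcount_*() interface:

struct tagged_count {
	const void	*tc_holders[16];
	int		tc_n;
};

static void
tc_add(struct tagged_count *tc, const void *tag)
{
	tc->tc_holders[tc->tc_n++] = tag;
}

static int
tc_remove(struct tagged_count *tc, const void *tag)
{
	for (int i = 0; i < tc->tc_n; i++) {
		if (tc->tc_holders[i] == tag) {
			tc->tc_holders[i] = tc->tc_holders[--tc->tc_n];
			return (0);
		}
	}
	return (-1);	/* no such holder: the charge would never drain */
}

Passing the wrong zio as the tag would leave the original charge stranded and leak queue depth, which is why both the removed code (by reassigning pio) and the new code (by computing tag) walk up to the grandparent for rewrite gang parents.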
*/ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && - zio->io_child_type == ZIO_CHILD_VDEV) { - ASSERT(zio->io_metaslab_class != NULL); - ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); + zio->io_child_type == ZIO_CHILD_VDEV) zio_dva_throttle_done(zio); - } - - /* - * If the allocation throttle is enabled, verify that - * we have decremented the refcounts for every I/O that was throttled. - */ - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(zio->io_bp != NULL); - ASSERT(ZIO_HAS_ALLOCATOR(zio)); - - metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, - zio->io_allocator); - VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class-> - mc_allocator[zio->io_allocator].mca_alloc_slots, zio)); - } - for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++)