From f2ab5b82da20d1c49d1e8884cb79c4623916de1a Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 18 Feb 2025 13:45:42 -0500 Subject: [PATCH] Fix metaslab group fragmentation math (#17037) Since we are calculating a free space fragmentation, we should weight metaslabs by the amount of their free space, not a full size. Fragmentation of full metaslabs may not matter in presence empty ones. The old algorithm did not differentiate metaslabs having only one free 4KB block from metaslabs having 50% of space free in 4KB blocks, reporting higher fragmentation. While there, move metaslab_group_alloc_update() call after setting mg_fragmentation, otherwise the effect may be delayed by one TXG. Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Reviewed-by: Paul Dagnelie Reviewed-by: Tony Nguyen Reviewed-by: Tony Hutter --- module/zfs/metaslab.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index e3c9afbd6..35bd968f6 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1176,9 +1176,8 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) } /* - * Calculate the fragmentation for a given metaslab group. We can use - * a simple average here since all metaslabs within the group must have - * the same size. The return value will be a value between 0 and 100 + * Calculate the fragmentation for a given metaslab group. Weight metaslabs + * on the amount of free space. The return value will be between 0 and 100 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this * group have a fragmentation metric. */ @@ -1187,24 +1186,29 @@ metaslab_group_fragmentation(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; uint64_t fragmentation = 0; - uint64_t valid_ms = 0; + uint64_t valid_ms = 0, total_ms = 0; + uint64_t free, total_free = 0; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - if (msp->ms_fragmentation == ZFS_FRAG_INVALID) - continue; if (msp->ms_group != mg) continue; + total_ms++; + if (msp->ms_fragmentation == ZFS_FRAG_INVALID) + continue; valid_ms++; - fragmentation += msp->ms_fragmentation; + free = (msp->ms_size - metaslab_allocated_space(msp)) / + SPA_MINBLOCKSIZE; /* To prevent overflows. */ + total_free += free; + fragmentation += msp->ms_fragmentation * free; } - if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) + if (valid_ms < (total_ms + 1) / 2 || total_free == 0) return (ZFS_FRAG_INVALID); - fragmentation /= valid_ms; + fragmentation /= total_free; ASSERT3U(fragmentation, <=, 100); return (fragmentation); } @@ -4469,8 +4473,8 @@ metaslab_sync_reassess(metaslab_group_t *mg) spa_t *spa = mg->mg_class->mc_spa; spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - metaslab_group_alloc_update(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); + metaslab_group_alloc_update(mg); /* * Preload the next potential metaslabs but only on active