Fix metaslab group fragmentation math (#17037)

Since we are calculating a free space fragmentation, we should
weight metaslabs by the amount of their free space, not a full
size.  Fragmentation of full metaslabs may not matter in presence
empty ones.  The old algorithm did not differentiate metaslabs
having only one free 4KB block from metaslabs having 50% of space
free in 4KB blocks, reporting higher fragmentation.

While there, move metaslab_group_alloc_update() call after setting
mg_fragmentation, otherwise the effect may be delayed by one TXG.

Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
This commit is contained in:
Alexander Motin 2025-02-18 13:45:42 -05:00 committed by Ameer Hamza
parent 1bdce0410c
commit f2ab5b82da

View File

@ -1176,9 +1176,8 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
} }
/* /*
* Calculate the fragmentation for a given metaslab group. We can use * Calculate the fragmentation for a given metaslab group. Weight metaslabs
* a simple average here since all metaslabs within the group must have * on the amount of free space. The return value will be between 0 and 100
* the same size. The return value will be a value between 0 and 100
* (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
* group have a fragmentation metric. * group have a fragmentation metric.
*/ */
@ -1187,24 +1186,29 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
{ {
vdev_t *vd = mg->mg_vd; vdev_t *vd = mg->mg_vd;
uint64_t fragmentation = 0; uint64_t fragmentation = 0;
uint64_t valid_ms = 0; uint64_t valid_ms = 0, total_ms = 0;
uint64_t free, total_free = 0;
for (int m = 0; m < vd->vdev_ms_count; m++) { for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m]; metaslab_t *msp = vd->vdev_ms[m];
if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
continue;
if (msp->ms_group != mg) if (msp->ms_group != mg)
continue; continue;
total_ms++;
if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
continue;
valid_ms++; valid_ms++;
fragmentation += msp->ms_fragmentation; free = (msp->ms_size - metaslab_allocated_space(msp)) /
SPA_MINBLOCKSIZE; /* To prevent overflows. */
total_free += free;
fragmentation += msp->ms_fragmentation * free;
} }
if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) if (valid_ms < (total_ms + 1) / 2 || total_free == 0)
return (ZFS_FRAG_INVALID); return (ZFS_FRAG_INVALID);
fragmentation /= valid_ms; fragmentation /= total_free;
ASSERT3U(fragmentation, <=, 100); ASSERT3U(fragmentation, <=, 100);
return (fragmentation); return (fragmentation);
} }
@ -4469,8 +4473,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
spa_t *spa = mg->mg_class->mc_spa; spa_t *spa = mg->mg_class->mc_spa;
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
metaslab_group_alloc_update(mg);
mg->mg_fragmentation = metaslab_group_fragmentation(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg);
metaslab_group_alloc_update(mg);
/* /*
* Preload the next potential metaslabs but only on active * Preload the next potential metaslabs but only on active