mirror of https://git.proxmox.com/git/mirror_zfs
Fix deadlock on I/O errors during device removal
spa_vdev_remove_thread() should not hold svr_lock while loading a
metaslab. It may block ZIO threads, required to handle metaslab
loading, at least in case of read errors causing recovery writes.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #17145
commit 676b7ef104
parent 83fa051ceb
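
The patch avoids the deadlock by loading the space map into a private range
tree while svr_lock is dropped, then re-taking svr_lock with a try-lock under
the metaslab locks; if the try-lock fails, it unwinds, blocks on svr_lock with
no other locks held, and retries. Below is a minimal userspace sketch of that
pattern, using pthreads in place of kernel mutexes; seg_list_t, load_segments()
and copy_one_metaslab() are hypothetical stand-ins for illustration, not ZFS
APIs.

#include <pthread.h>
#include <stddef.h>

/* Hypothetical stand-ins for svr_allocd_segs and the slow space map load. */
typedef struct seg_list { size_t count; } seg_list_t;

static pthread_mutex_t svr_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ms_sync_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ms_lock = PTHREAD_MUTEX_INITIALIZER;
static seg_list_t svr_allocd_segs;

static void
load_segments(seg_list_t *segs)         /* stands in for space_map_load() */
{
        segs->count = 42;
}

static void
copy_one_metaslab(void)
{
        seg_list_t segs = { 0 };

        pthread_mutex_lock(&svr_lock);
again:
        /* Drop the contended lock before the potentially blocking load. */
        pthread_mutex_unlock(&svr_lock);

        pthread_mutex_lock(&ms_sync_lock);
        pthread_mutex_lock(&ms_lock);

        /* The slow step runs without svr_lock, so its holders cannot stall. */
        load_segments(&segs);

        /*
         * Re-take svr_lock opportunistically; blocking for it here, under
         * the metaslab locks, would be a lock ordering violation.  On
         * failure unwind, take svr_lock with nothing else held, and retry.
         */
        if (pthread_mutex_trylock(&svr_lock) != 0) {
                pthread_mutex_unlock(&ms_lock);
                pthread_mutex_unlock(&ms_sync_lock);
                segs.count = 0;                 /* discard loaded state */
                pthread_mutex_lock(&svr_lock);
                goto again;
        }

        svr_allocd_segs = segs;                 /* publish under svr_lock */

        pthread_mutex_unlock(&ms_lock);
        pthread_mutex_unlock(&ms_sync_lock);
        pthread_mutex_unlock(&svr_lock);
}

int
main(void)
{
        copy_one_metaslab();
        return (0);
}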
@@ -1621,6 +1621,9 @@ spa_vdev_remove_thread(void *arg)
         vca.vca_read_error_bytes = 0;
         vca.vca_write_error_bytes = 0;
 
+        zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
+            NULL, 0, 0);
+
         mutex_enter(&svr->svr_lock);
 
         /*
@@ -1633,7 +1636,9 @@ spa_vdev_remove_thread(void *arg)
                 metaslab_t *msp = vd->vdev_ms[msi];
                 ASSERT3U(msi, <=, vd->vdev_ms_count);
 
+again:
                 ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));
+                mutex_exit(&svr->svr_lock);
 
                 mutex_enter(&msp->ms_sync_lock);
                 mutex_enter(&msp->ms_lock);
@@ -1646,18 +1651,32 @@ spa_vdev_remove_thread(void *arg)
                 }
 
                 /*
-                 * If the metaslab has ever been allocated from (ms_sm!=NULL),
+                 * If the metaslab has ever been synced (ms_sm != NULL),
                  * read the allocated segments from the space map object
                  * into svr_allocd_segs. Since we do this while holding
-                 * svr_lock and ms_sync_lock, concurrent frees (which
+                 * ms_lock and ms_sync_lock, concurrent frees (which
                  * would have modified the space map) will wait for us
                  * to finish loading the spacemap, and then take the
                  * appropriate action (see free_from_removing_vdev()).
                  */
-                if (msp->ms_sm != NULL) {
-                        VERIFY0(space_map_load(msp->ms_sm,
-                            svr->svr_allocd_segs, SM_ALLOC));
+                if (msp->ms_sm != NULL)
+                        VERIFY0(space_map_load(msp->ms_sm, segs, SM_ALLOC));
 
+                /*
+                 * We could not hold svr_lock while loading space map, or we
+                 * could hit deadlock in a ZIO pipeline, having to wait for
+                 * it. But we can not block for it here under metaslab locks,
+                 * or it would be a lock ordering violation.
+                 */
+                if (!mutex_tryenter(&svr->svr_lock)) {
+                        mutex_exit(&msp->ms_lock);
+                        mutex_exit(&msp->ms_sync_lock);
+                        zfs_range_tree_vacate(segs, NULL, NULL);
+                        mutex_enter(&svr->svr_lock);
+                        goto again;
+                }
+
+                zfs_range_tree_swap(&segs, &svr->svr_allocd_segs);
                 zfs_range_tree_walk(msp->ms_unflushed_allocs,
                     zfs_range_tree_add, svr->svr_allocd_segs);
                 zfs_range_tree_walk(msp->ms_unflushed_frees,
@@ -1665,16 +1684,15 @@ spa_vdev_remove_thread(void *arg)
                 zfs_range_tree_walk(msp->ms_freeing,
                     zfs_range_tree_remove, svr->svr_allocd_segs);
 
+                mutex_exit(&msp->ms_lock);
+                mutex_exit(&msp->ms_sync_lock);
+
                 /*
                  * When we are resuming from a paused removal (i.e.
                  * when importing a pool with a removal in progress),
                  * discard any state that we have already processed.
                  */
-                zfs_range_tree_clear(svr->svr_allocd_segs, 0,
-                    start_offset);
-                }
-                mutex_exit(&msp->ms_lock);
-                mutex_exit(&msp->ms_sync_lock);
+                zfs_range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
 
                 vca.vca_msp = msp;
                 zfs_dbgmsg("copying %llu segments for metaslab %llu",
@@ -1751,6 +1769,8 @@ spa_vdev_remove_thread(void *arg)
 
         spa_config_exit(spa, SCL_CONFIG, FTAG);
 
+        zfs_range_tree_destroy(segs);
+
         /*
          * Wait for all copies to finish before cleaning up the vca.
          */