Too many vdev probe errors should suspend pool
Similar to what we saw in #16569, a replacing vdev should not be
counted as fully contributing to the redundancy of a raidz vdev,
even though the current IO has enough redundancy.

When a failed vdev_probe() is faulting a disk, it now checks whether
that disk is required, and if so it suspends the pool until the
admin can return the missing disks.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #16864
commit 25565403aa (parent 47b7dc976b)
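
The user-visible effect is that a pool that has lost more disks than its
parity can absorb now suspends instead of staying online. A minimal sketch
of the observe-and-recover flow, assuming a pool named "tank" (the pool name
is illustrative; the steps mirror the test case added below):

    # The pool state is exported through a kstat; a suspended pool reports:
    cat /proc/spl/kstat/zfs/tank/state
    # SUSPENDED

    # After the missing disks are returned to service, clearing the vdev
    # error states reopens the vdevs and resumes pool I/O:
    zpool clear tank
    zpool wait -t resilver tank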
module/zfs/spa.c
@@ -8948,16 +8948,26 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
 }
 
 static void
-spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
+spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend)
 {
 	if (vd->vdev_fault_wanted) {
+		vdev_state_t newstate = VDEV_STATE_FAULTED;
 		vd->vdev_fault_wanted = B_FALSE;
-		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
-		    VDEV_AUX_ERR_EXCEEDED);
-	}
 
+		/*
+		 * If this device has the only valid copy of the data, then
+		 * back off and simply mark the vdev as degraded instead.
+		 */
+		if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL &&
+		    vdev_dtl_required(vd)) {
+			newstate = VDEV_STATE_DEGRADED;
+			/* A required disk is missing so suspend the pool */
+			*suspend = B_TRUE;
+		}
+		vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED);
+	}
 	for (int c = 0; c < vd->vdev_children; c++)
-		spa_async_fault_vdev(spa, vd->vdev_child[c]);
+		spa_async_fault_vdev(vd->vdev_child[c], suspend);
 }
 
 static void
@@ -9049,8 +9059,11 @@ spa_async_thread(void *arg)
 	 */
 	if (tasks & SPA_ASYNC_FAULT_VDEV) {
 		spa_vdev_state_enter(spa, SCL_NONE);
-		spa_async_fault_vdev(spa, spa->spa_root_vdev);
+		boolean_t suspend = B_FALSE;
+		spa_async_fault_vdev(spa->spa_root_vdev, &suspend);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
+		if (suspend)
+			zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
 	}
 
 	/*
tests/runfiles/linux.run
@@ -125,8 +125,8 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
     'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
     'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
     'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
-    'fault_limits', 'scrub_after_resilver', 'suspend_resume_single',
-    'zpool_status_-s']
+    'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors',
+    'suspend_resume_single', 'zpool_status_-s']
 tags = ['functional', 'fault']
 
 [tests/functional/features/large_dnode:Linux]
tests/zfs-tests/tests/Makefile.am
@@ -1531,6 +1531,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/fault/decrypt_fault.ksh \
 	functional/fault/fault_limits.ksh \
 	functional/fault/scrub_after_resilver.ksh \
+	functional/fault/suspend_on_probe_errors.ksh \
 	functional/fault/suspend_resume_single.ksh \
 	functional/fault/setup.ksh \
 	functional/fault/zpool_status_-s.ksh \
tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh (new executable file, 154 lines)
@@ -0,0 +1,154 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2024, Klara Inc.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/include/blkdev.shlib

#
# DESCRIPTION: Verify that 4 disks removed from a raidz3 will suspend the pool
#
# STRATEGY:
# 1. Disable ZED -- this test is focused on vdev_probe errors
# 2. Create a raidz3 pool where 4 disks can be removed (i.e., using scsi_debug)
# 3. Add some data to it for a resilver workload
# 4. Replace one of the child vdevs to start a replacing vdev
# 5. During the resilver, remove 4 disks including one from the replacing vdev
# 6. Verify that the pool is suspended (it used to remain online)
#

DEV_SIZE_MB=1024

FILE_VDEV_CNT=8
FILE_VDEV_SIZ=256M

function cleanup
{
	destroy_pool $TESTPOOL
	if [[ "$(cat /sys/block/$sd/device/state)" == "offline" ]]; then
		log_must eval "echo running > /sys/block/$sd/device/state"
	fi
	unload_scsi_debug
	rm -f $DATA_FILE
	for i in {0..$((FILE_VDEV_CNT - 1))}; do
		log_must rm -f "$TEST_BASE_DIR/dev-$i"
	done
	log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
	zed_start
}

log_onexit cleanup

log_assert "VDEV probe errors for more disks than parity should suspend a pool"

log_note "Stopping ZED process"
zed_stop
zpool events -c

# Make a debug device that we can "unplug" and lose 4 drives at once
unload_scsi_debug
load_scsi_debug $DEV_SIZE_MB 1 1 1 '512b'
sd=$(get_debug_device)

# Create 4 partitions that match the FILE_VDEV_SIZ
parted "/dev/${sd}" --script mklabel gpt
parted "/dev/${sd}" --script mkpart primary 0% 25%
parted "/dev/${sd}" --script mkpart primary 25% 50%
parted "/dev/${sd}" --script mkpart primary 50% 75%
parted "/dev/${sd}" --script mkpart primary 75% 100%
block_device_wait "/dev/${sd}"
blkdevs="/dev/${sd}1 /dev/${sd}2 /dev/${sd}3 /dev/${sd}4"

# Create 8 file vdevs
typeset -a filedevs
for i in {0..$((FILE_VDEV_CNT - 1))}; do
	device=$TEST_BASE_DIR/dev-$i
	log_must truncate -s $FILE_VDEV_SIZ $device
	# Use all but the last one for pool create
	if [[ $i -lt "7" ]]; then
		filedevs[${#filedevs[*]}+1]=$device
	fi
done

# Create a raidz-3 pool that we can pull 4 disks from
log_must zpool create -f $TESTPOOL raidz3 ${filedevs[@]} $blkdevs
sync_pool $TESTPOOL

# Add some data to the pool
log_must zfs create $TESTPOOL/fs
MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)"
SECONDS=0
log_must fill_fs $MNTPOINT 1 200 4096 10 Z
log_note "fill_fs took $SECONDS seconds"
sync_pool $TESTPOOL

# Start a replacing vdev, but suspend the resilver
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
log_must zpool replace -f $TESTPOOL /dev/${sd}4 $TEST_BASE_DIR/dev-7

# Remove 4 disks all at once
log_must eval "echo offline > /sys/block/${sd}/device/state"

log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0

# Add some writes to drive the vdev probe errors
log_must dd if=/dev/urandom of=$MNTPOINT/writes bs=1M count=1

# Wait until sync starts, and the pool suspends
log_note "waiting for pool to suspend"
typeset -i tries=30
until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
	if ((tries-- == 0)); then
		zpool status -s
		log_fail "UNEXPECTED -- pool did not suspend"
	fi
	sleep 1
done
log_note $(cat /proc/spl/kstat/zfs/$TESTPOOL/state)

# Put the missing disks back into service
log_must eval "echo running > /sys/block/$sd/device/state"

# Clear the vdev error states, which will reopen the vdevs and resume the pool
log_must zpool clear $TESTPOOL

# Wait until the pool resumes
log_note "waiting for pool to resume"
tries=30
until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) != "SUSPENDED" ]] ; do
	if ((tries-- == 0)); then
		log_fail "pool did not resume"
	fi
	sleep 1
done
log_must zpool wait -t resilver $TESTPOOL
sync_pool $TESTPOOL

# Make sure a pool scrub comes back clean
log_must zpool scrub -w $TESTPOOL
log_must zpool status -v $TESTPOOL
log_must check_pool_status $TESTPOOL "errors" "No known data errors"

log_pass "VDEV probe errors for more disks than parity should suspend a pool"
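
The test is wired into the fault group of the Linux runfile above, so it can
be invoked through the usual test-suite wrapper. A sketch, assuming a built
and configured ZFS source tree (option semantics can vary between releases):

    # Run just the new test case
    ./scripts/zfs-tests.sh -v -t suspend_on_probe_errors

    # Or run the whole functional/fault tag group it belongs to
    ./scripts/zfs-tests.sh -v -T fault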