arc: avoid possible deadlock in arc_read

In l2arc_evict(), the config lock may be acquired in reverse order
(e.g., first the config lock (writer), then a hash lock) unlike in
arc_read() during scenarios like L2ARC device removal. To avoid
deadlocks, if the attempt to acquire the config lock (reader) fails
in arc_read(), release the hash lock, wait for the config lock, and
retry from the beginning.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #17071
This commit is contained in:
Ameer Hamza 2025-02-26 00:32:12 +05:00
parent 7e72312eff
commit 637f918211
6 changed files with 40 additions and 16 deletions

View File

@ -9043,7 +9043,7 @@ zdb_read_block(char *thing, spa_t *spa)
const blkptr_t *b = (const blkptr_t *)(void *)
((uintptr_t)buf + (uintptr_t)blkptr_offset);
if (zfs_blkptr_verify(spa, b,
BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) {
BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY)) {
abd_return_buf_copy(pabd, buf, lsize);
borrowed = B_FALSE;
buf = lbuf;
@ -9052,7 +9052,7 @@ zdb_read_block(char *thing, spa_t *spa)
b = (const blkptr_t *)(void *)
((uintptr_t)buf + (uintptr_t)blkptr_offset);
if (lsize == -1 || zfs_blkptr_verify(spa, b,
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) {
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
printf("invalid block pointer at this DVA\n");
goto out;
}

View File

@ -545,6 +545,7 @@ enum blk_verify_flag {
enum blk_config_flag {
BLK_CONFIG_HELD, // SCL_VDEV held for writer
BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader
BLK_CONFIG_NEEDED_TRY, // Try with SCL_VDEV for reader
BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV
};
@ -662,7 +663,7 @@ extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t);
extern int zio_resume(spa_t *spa);
extern void zio_resume_wait(spa_t *spa);
extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
extern int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
enum blk_config_flag blk_config, enum blk_verify_flag blk_verify);
/*

View File

@ -5568,6 +5568,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
arc_buf_t *buf = NULL;
int rc = 0;
boolean_t bp_validation = B_FALSE;
ASSERT(!embedded_bp ||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
@ -5610,7 +5611,7 @@ top:
* should always be the case since the blkptr is protected by
* a checksum.
*/
if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP,
if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP,
BLK_VERIFY_LOG)) {
mutex_exit(hash_lock);
rc = SET_ERROR(ECKSUM);
@ -5762,6 +5763,8 @@ top:
abd_t *hdr_abd;
int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
int config_lock;
int error;
if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
if (hash_lock != NULL)
@ -5770,16 +5773,31 @@ top:
goto done;
}
if (zio_flags & ZIO_FLAG_CONFIG_WRITER) {
config_lock = BLK_CONFIG_HELD;
} else if (hash_lock != NULL) {
/*
* Prevent lock order reversal
*/
config_lock = BLK_CONFIG_NEEDED_TRY;
} else {
config_lock = BLK_CONFIG_NEEDED;
}
/*
* Verify the block pointer contents are reasonable. This
* should always be the case since the blkptr is protected by
* a checksum.
*/
if (!zfs_blkptr_verify(spa, bp,
(zio_flags & ZIO_FLAG_CONFIG_WRITER) ?
BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
if (!bp_validation && (error = zfs_blkptr_verify(spa, bp,
config_lock, BLK_VERIFY_LOG))) {
if (hash_lock != NULL)
mutex_exit(hash_lock);
if (error == EBUSY && !zfs_blkptr_verify(spa, bp,
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
bp_validation = B_TRUE;
goto top;
}
rc = SET_ERROR(ECKSUM);
goto done;
}

View File

@ -2305,7 +2305,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
DMU_USERUSED_OBJECT, tx);
}
arc_buf_destroy(buf, &buf);
} else if (!zfs_blkptr_verify(spa, bp,
} else if (zfs_blkptr_verify(spa, bp,
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
/*
* Sanity check the block pointer contents, this is handled

View File

@ -2778,7 +2778,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
* When damaged consider it to be a metadata error since we cannot
* trust the BP_GET_TYPE and BP_GET_LEVEL values.
*/
if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
atomic_inc_64(&sle->sle_meta_count);
return (0);
}

View File

@ -1164,7 +1164,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
* it only contains known object types, checksum/compression identifiers,
* block sizes within the maximum allowed limits, valid DVAs, etc.
*
* If everything checks out B_TRUE is returned. The zfs_blkptr_verify
* If everything checks out 0 is returned. The zfs_blkptr_verify
* argument controls the behavior when an invalid field is detected.
*
* Values for blk_verify_flag:
@ -1179,7 +1179,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
* BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
* performance
*/
boolean_t
int
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
{
@ -1211,7 +1211,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
"blkptr at %px has invalid PSIZE %llu",
bp, (longlong_t)BPE_GET_PSIZE(bp));
}
return (errors == 0);
return (errors ? ECKSUM : 0);
}
if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
@ -1229,7 +1229,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
* will be done once the zio is executed in vdev_mirror_map_alloc.
*/
if (unlikely(!spa->spa_trust_config))
return (errors == 0);
return (errors ? ECKSUM : 0);
switch (blk_config) {
case BLK_CONFIG_HELD:
@ -1238,8 +1238,12 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
case BLK_CONFIG_NEEDED:
spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
break;
case BLK_CONFIG_NEEDED_TRY:
if (!spa_config_tryenter(spa, SCL_VDEV, bp, RW_READER))
return (EBUSY);
break;
case BLK_CONFIG_SKIP:
return (errors == 0);
return (errors ? ECKSUM : 0);
default:
panic("invalid blk_config %u", blk_config);
}
@ -1294,10 +1298,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
bp, i, (longlong_t)offset);
}
}
if (blk_config == BLK_CONFIG_NEEDED)
if (blk_config == BLK_CONFIG_NEEDED || blk_config ==
BLK_CONFIG_NEEDED_TRY)
spa_config_exit(spa, SCL_VDEV, bp);
return (errors == 0);
return (errors ? ECKSUM : 0);
}
boolean_t