mirror of
				https://git.proxmox.com/git/mirror_zfs
				synced 2025-11-04 14:26:57 +00:00 
			
		
		
		
	This fixes an oversight in the Direct I/O PR. There is nothing that stops a process from manipulating the contents of a buffer for a Direct I/O read while the I/O is in flight. This can lead to checksum verify failures. However, the disk contents are still correct, and this would lead to false reporting of checksum validation failures. To remedy this, all Direct I/O reads that have a checksum verification failure are treated as suspicious. In the event a checksum validation failure occurs for a Direct I/O read, then the I/O request will be reissued through the ARC. This allows for actual validation to happen and removes any possibility of the buffer being manipulated after the I/O has been issued. Just as with Direct I/O write checksum validation failures, Direct I/O read checksum validation failures are reported through zpool status -d in the DIO column. Also the zevent has been updated to have both: 1. dio_verify_wr -> Checksum verification failure for writes 2. dio_verify_rd -> Checksum verification failure for reads. This allows for determining what I/O operation was the culprit for the checksum verification failure. All DIO errors are reported only on the top-level VDEV. Even though FreeBSD can write protect pages (stable pages) it still has the same issue as Linux with Direct I/O reads. This commit updates the following: 1. Propagates checksum failures for reads all the way up to the top-level VDEV. 2. Reports errors through zpool status -d as DIO. 3. Has two zevents for checksum verify errors with Direct I/O. One for read and one for write. 4. Updates FreeBSD ABD code to also check for ABD_FLAG_FROM_PAGES and handle ABD buffer contents validation the same as Linux. 5. Updated manipulate_user_buffer.c to also manipulate a buffer while a Direct I/O read is taking place. 6. Adds a new ZTS test case dio_read_verify that stress tests the new code. 7. Updated man pages. 8. Added an IMPLY statement to zio_checksum_verify() to make sure that Direct I/O reads are not issued as speculative. 9. Removed self healing through mirror, raidz, and dRAID VDEVs for Direct I/O reads. This issue was first observed when installing a Windows 11 VM on a ZFS dataset with the dataset property direct set to always. The zpool devices would report checksum failures, but running a subsequent zpool scrub would not repair any data and report no errors. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Brian Atkinson <batkinson@lanl.gov> Closes #16598
		
			
				
	
	
		
			281 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			281 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 | 
						|
 * CDDL HEADER START
 | 
						|
 *
 | 
						|
 * The contents of this file are subject to the terms of the
 | 
						|
 * Common Development and Distribution License (the "License").
 | 
						|
 * You may not use this file except in compliance with the License.
 | 
						|
 *
 | 
						|
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 | 
						|
 * or https://opensource.org/licenses/CDDL-1.0.
 | 
						|
 * See the License for the specific language governing permissions
 | 
						|
 * and limitations under the License.
 | 
						|
 *
 | 
						|
 * When distributing Covered Code, include this CDDL HEADER in each
 | 
						|
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 | 
						|
 * If applicable, add the following below this CDDL HEADER, with the
 | 
						|
 * fields enclosed by brackets "[]" replaced with your own identifying
 | 
						|
 * information: Portions Copyright [yyyy] [name of copyright owner]
 | 
						|
 *
 | 
						|
 * CDDL HEADER END
 | 
						|
 */
 | 
						|
 | 
						|
/*
 | 
						|
 * Copyright (c) 2024, Klara Inc.
 | 
						|
 */
 | 
						|
 | 
						|
#include <sys/fs/zfs.h>
 | 
						|
#include <sys/types.h>
 | 
						|
#include <sys/sysmacros.h>
 | 
						|
#include <sys/string.h>
 | 
						|
#include <sys/debug.h>
 | 
						|
#include "zfs_valstr.h"
 | 
						|
 | 
						|
/*
 * One table entry describes one bit of a bitfield. Every bit can be
 * rendered in three ways:
 * - as a single character
 * - as a two-character pair
 * - as its full name
 */
typedef struct {
	/* Single-character form. */
	const char	vb_bit;
	/* Two-character form; exactly two bytes, no NUL terminator. */
	const char	vb_pair[2];
	/* Full name; a NUL-terminated string. */
	const char	*vb_name;
} valstr_bit_t;
 | 
						|
 | 
						|
/*
 | 
						|
 * Emits a character for each bit in `bits`, up to the number of elements
 | 
						|
 * in the table. Set bits get the character in vb_bit, clear bits get a
 | 
						|
 * space. This results in all strings having the same width, for easier
 | 
						|
 * visual comparison.
 | 
						|
 */
 | 
						|
static size_t
 | 
						|
valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems,
 | 
						|
    uint64_t bits, char *out, size_t outlen)
 | 
						|
{
 | 
						|
	ASSERT(out);
 | 
						|
	size_t n = 0;
 | 
						|
	for (int b = 0; b < nelems; b++) {
 | 
						|
		if (n == outlen)
 | 
						|
			break;
 | 
						|
		uint64_t mask = (1ULL << b);
 | 
						|
		out[n++] = (bits & mask) ? table[b].vb_bit : ' ';
 | 
						|
	}
 | 
						|
	if (n < outlen)
 | 
						|
		out[n++] = '\0';
 | 
						|
	return (n);
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 * Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and
 | 
						|
 * separated by a `|` character. This gives a concise representation of the
 | 
						|
 * whole value.
 | 
						|
 */
 | 
						|
static size_t
 | 
						|
valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems,
 | 
						|
    uint64_t bits, char *out, size_t outlen)
 | 
						|
{
 | 
						|
	ASSERT(out);
 | 
						|
	size_t n = 0;
 | 
						|
	for (int b = 0; b < nelems; b++) {
 | 
						|
		ASSERT3U(n, <=, outlen);
 | 
						|
		if (n == outlen)
 | 
						|
			break;
 | 
						|
		uint64_t mask = (1ULL << b);
 | 
						|
		if (bits & mask) {
 | 
						|
			size_t len = (n > 0) ? 3 : 2;
 | 
						|
			if (n > outlen-len)
 | 
						|
				break;
 | 
						|
			if (n > 0)
 | 
						|
				out[n++] = '|';
 | 
						|
			out[n++] = table[b].vb_pair[0];
 | 
						|
			out[n++] = table[b].vb_pair[1];
 | 
						|
		}
 | 
						|
	}
 | 
						|
	if (n < outlen)
 | 
						|
		out[n++] = '\0';
 | 
						|
	return (n);
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 * Emits the full name for each bit set in `bits`, taken from vb_name, and
 | 
						|
 * separated by a space. This unambiguously shows the entire set of bits, but
 | 
						|
 * can get very long.
 | 
						|
 */
 | 
						|
static size_t
 | 
						|
valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems,
 | 
						|
    uint64_t bits, char *out, size_t outlen)
 | 
						|
{
 | 
						|
	ASSERT(out);
 | 
						|
	size_t n = 0;
 | 
						|
	for (int b = 0; b < nelems; b++) {
 | 
						|
		ASSERT3U(n, <=, outlen);
 | 
						|
		if (n == outlen)
 | 
						|
			break;
 | 
						|
		uint64_t mask = (1ULL << b);
 | 
						|
		if (bits & mask) {
 | 
						|
			size_t len = strlen(table[b].vb_name);
 | 
						|
			if (n > 0)
 | 
						|
				len++;
 | 
						|
			if (n > outlen-len)
 | 
						|
				break;
 | 
						|
			if (n > 0) {
 | 
						|
				out[n++] = ' ';
 | 
						|
				len--;
 | 
						|
			}
 | 
						|
			memcpy(&out[n], table[b].vb_name, len);
 | 
						|
			n += len;
 | 
						|
		}
 | 
						|
	}
 | 
						|
	if (n < outlen)
 | 
						|
		out[n++] = '\0';
 | 
						|
	return (n);
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 * Emits the name of the given enum value in the table.
 | 
						|
 */
 | 
						|
static size_t
 | 
						|
valstr_enum_str(const char **table, const size_t nelems,
 | 
						|
    int v, char *out, size_t outlen)
 | 
						|
{
 | 
						|
	ASSERT(out);
 | 
						|
	ASSERT3U(v, <, nelems);
 | 
						|
	if (v >= nelems)
 | 
						|
		return (0);
 | 
						|
	return (MIN(strlcpy(out, table[v], outlen), outlen));
 | 
						|
}
 | 
						|
 | 
						|
/*
 * These macros create the string tables for the given name, and implement
 * the public functions described in zfs_valstr.h.
 *
 * _VALSTR_BITFIELD_IMPL(name, entries...) defines a static table of
 * valstr_bit_t named valstr_<name>_table, plus three functions that render a
 * bitfield through it:
 *   zfs_valstr_<name>_bits()   one character per table entry
 *   zfs_valstr_<name>_pairs()  two-character pairs for the set bits
 *   zfs_valstr_<name>()        full names for the set bits
 */
#define	_VALSTR_BITFIELD_IMPL(name, ...)				\
static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ };\
									\
size_t									\
zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen)	\
{									\
	const size_t nelems = ARRAY_SIZE(valstr_ ## name ## _table);	\
	return (valstr_bitfield_bits(valstr_ ## name ## _table,		\
	    nelems, bits, out, outlen));				\
}									\
									\
size_t									\
zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen)	\
{									\
	const size_t nelems = ARRAY_SIZE(valstr_ ## name ## _table);	\
	return (valstr_bitfield_pairs(valstr_ ## name ## _table,	\
	    nelems, bits, out, outlen));				\
}									\
									\
size_t									\
zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen)		\
{									\
	const size_t nelems = ARRAY_SIZE(valstr_ ## name ## _table);	\
	return (valstr_bitfield_str(valstr_ ## name ## _table,		\
	    nelems, bits, out, outlen));				\
}
 | 
						|
 | 
						|
/*
 * _VALSTR_ENUM_IMPL(name, strings...) defines a static table of names called
 * valstr_<name>_table and the function zfs_valstr_<name>(), which looks up
 * the name stored at index `v` of that table.
 */
#define	_VALSTR_ENUM_IMPL(name, ...)					\
static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ };	\
									\
size_t									\
zfs_valstr_ ## name(int v, char *out, size_t outlen)			\
{									\
	const size_t nelems = ARRAY_SIZE(valstr_ ## name ## _table);	\
	return (valstr_enum_str(valstr_ ## name ## _table,		\
	    nelems, v, out, outlen));					\
}
 | 
						|
 | 
						|
 | 
						|
/* String tables */
 | 
						|
 | 
						|
/* ZIO flags: zio_flag_t, typically zio->io_flags */
 | 
						|
/* BEGIN CSTYLED */
 | 
						|
_VALSTR_BITFIELD_IMPL(zio_flag,
 | 
						|
	{ '.', "DA", "DONT_AGGREGATE" },
 | 
						|
	{ '.', "RP", "IO_REPAIR" },
 | 
						|
	{ '.', "SH", "SELF_HEAL" },
 | 
						|
	{ '.', "RS", "RESILVER" },
 | 
						|
	{ '.', "SC", "SCRUB" },
 | 
						|
	{ '.', "ST", "SCAN_THREAD" },
 | 
						|
	{ '.', "PH", "PHYSICAL" },
 | 
						|
	{ '.', "CF", "CANFAIL" },
 | 
						|
	{ '.', "SP", "SPECULATIVE" },
 | 
						|
	{ '.', "CW", "CONFIG_WRITER" },
 | 
						|
	{ '.', "DR", "DONT_RETRY" },
 | 
						|
	{ '?', "??", "[UNUSED 11]" },
 | 
						|
	{ '.', "ND", "NODATA" },
 | 
						|
	{ '.', "ID", "INDUCE_DAMAGE" },
 | 
						|
	{ '.', "AL", "IO_ALLOCATING" },
 | 
						|
	{ '.', "RE", "IO_RETRY" },
 | 
						|
	{ '.', "PR", "PROBE" },
 | 
						|
	{ '.', "TH", "TRYHARD" },
 | 
						|
	{ '.', "OP", "OPTIONAL" },
 | 
						|
	{ '.', "RD", "DIO_READ" },
 | 
						|
	{ '.', "DQ", "DONT_QUEUE" },
 | 
						|
	{ '.', "DP", "DONT_PROPAGATE" },
 | 
						|
	{ '.', "BY", "IO_BYPASS" },
 | 
						|
	{ '.', "RW", "IO_REWRITE" },
 | 
						|
	{ '.', "CM", "RAW_COMPRESS" },
 | 
						|
	{ '.', "EN", "RAW_ENCRYPT" },
 | 
						|
	{ '.', "GG", "GANG_CHILD" },
 | 
						|
	{ '.', "DD", "DDT_CHILD" },
 | 
						|
	{ '.', "GF", "GODFATHER" },
 | 
						|
	{ '.', "NP", "NOPWRITE" },
 | 
						|
	{ '.', "EX", "REEXECUTED" },
 | 
						|
	{ '.', "DG", "DELEGATED" },
 | 
						|
	{ '.', "DC", "DIO_CHKSUM_ERR" },
 | 
						|
)
 | 
						|
/* END CSTYLED */
 | 
						|
 | 
						|
/*
 | 
						|
 * ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or
 | 
						|
 *                        zio->io_pipeline.
 | 
						|
 */
 | 
						|
/* BEGIN CSTYLED */
 | 
						|
_VALSTR_BITFIELD_IMPL(zio_stage,
 | 
						|
	{ 'O', "O ", "OPEN" },
 | 
						|
	{ 'I', "RI", "READ_BP_INIT" },
 | 
						|
	{ 'I', "WI", "WRITE_BP_INIT" },
 | 
						|
	{ 'I', "FI", "FREE_BP_INIT" },
 | 
						|
	{ 'A', "IA", "ISSUE_ASYNC" },
 | 
						|
	{ 'W', "WC", "WRITE_COMPRESS" },
 | 
						|
	{ 'E', "EN", "ENCRYPT" },
 | 
						|
	{ 'C', "CG", "CHECKSUM_GENERATE" },
 | 
						|
	{ 'N', "NW", "NOP_WRITE" },
 | 
						|
	{ 'B', "BF", "BRT_FREE" },
 | 
						|
	{ 'd', "dS", "DDT_READ_START" },
 | 
						|
	{ 'd', "dD", "DDT_READ_DONE" },
 | 
						|
	{ 'd', "dW", "DDT_WRITE" },
 | 
						|
	{ 'd', "dF", "DDT_FREE" },
 | 
						|
	{ 'G', "GA", "GANG_ASSEMBLE" },
 | 
						|
	{ 'G', "GI", "GANG_ISSUE" },
 | 
						|
	{ 'D', "DT", "DVA_THROTTLE" },
 | 
						|
	{ 'D', "DA", "DVA_ALLOCATE" },
 | 
						|
	{ 'D', "DF", "DVA_FREE" },
 | 
						|
	{ 'D', "DC", "DVA_CLAIM" },
 | 
						|
	{ 'R', "R ", "READY" },
 | 
						|
	{ 'V', "VS", "VDEV_IO_START" },
 | 
						|
	{ 'V', "VD", "VDEV_IO_DONE" },
 | 
						|
	{ 'V', "VA", "VDEV_IO_ASSESS" },
 | 
						|
	{ 'C', "CV", "CHECKSUM_VERIFY" },
 | 
						|
	{ 'C', "DC", "DIO_CHECKSUM_VERIFY" },
 | 
						|
	{ 'X', "X ", "DONE" },
 | 
						|
)
 | 
						|
/* END CSTYLED */
 | 
						|
 | 
						|
/* ZIO priority: zio_priority_t, typically zio->io_priority */
 | 
						|
/* BEGIN CSTYLED */
 | 
						|
_VALSTR_ENUM_IMPL(zio_priority,
 | 
						|
	"SYNC_READ",
 | 
						|
	"SYNC_WRITE",
 | 
						|
	"ASYNC_READ",
 | 
						|
	"ASYNC_WRITE",
 | 
						|
	"SCRUB",
 | 
						|
	"REMOVAL",
 | 
						|
	"INITIALIZING",
 | 
						|
	"TRIM",
 | 
						|
	"REBUILD",
 | 
						|
	"[NUM_QUEUEABLE]",
 | 
						|
	"NOW",
 | 
						|
)
 | 
						|
/* END CSTYLED */
 | 
						|
 | 
						|
/* The table-building helper macros are not needed past this point. */
#undef _VALSTR_BITFIELD_IMPL
#undef _VALSTR_ENUM_IMPL
 |