From 3b5c3f52d2aa79075a96f18d63547cea2c79ac93 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 5 Feb 2025 00:47:50 +1100 Subject: [PATCH] zio: lock parent zios when updating wait counts on reexecute As zios are reexecuted after resume from suspension, their ready and wait states need to be propagated to wait counts on all their parents. It's possible for those parents to have active children passing through READY or DONE, which then end up in zio_notify_parent(), take their parent's lock, and decrement the wait count. Without also taking a lock here, it's possible for an increment race to occur, which leads to either there being no references left (tripping the assert in zio_notify_parent()), or a parent waiting forever for a nonexistent child to complete. To protect against this, we simply take the appropriate zio locks in zio_reexecute() before updating the wait counts. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Allan Jude Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #17016 --- module/zfs/zio.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index bd6752f00..ae5340da9 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -23,7 +23,7 @@ * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2019, 2023, 2024, Klara Inc. + * Copyright (c) 2019, 2023, 2024, 2025, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. * Copyright (c) 2021, 2024 by George Melikov. All rights reserved. @@ -2537,13 +2537,29 @@ zio_reexecute(void *arg) pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) || (pio->io_pipeline & ZIO_STAGE_READY) == 0; pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE); + + /* + * It's possible for a failed ZIO to be a descendant of more than one + * ZIO tree. When reexecuting it, we have to be sure to add its wait + * states to all parent wait counts. + * + * Those parents, in turn, may have other children that are currently + * active, usually because they've already been reexecuted after + * resuming. Those children may be executing and may call + * zio_notify_parent() at the same time as we're updating our parent's + * counts. To avoid races while updating the counts, we take + * gio->io_lock before each update. + */ zio_link_t *zl = NULL; while ((gio = zio_walk_parents(pio, &zl)) != NULL) { + mutex_enter(&gio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) { gio->io_children[pio->io_child_type][w] += !pio->io_state[w]; } + mutex_exit(&gio->io_lock); } + for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0;