zfs/cmd/zilstat.in
Rob Norris 72602f6ad9 ZIL: "crash" the ZIL if the pool suspends during fallback
If the ZIL runs into trouble, it calls txg_wait_synced(), which blocks
on suspend. We want it to not block on suspend, instead returning an
error. On the surface, this is simple: change all calls to
txg_wait_synced_flags(TXG_WAIT_SUSPEND), and then thread the error
return back to the zil_commit() caller.

Handling suspension means returning an error to all commit waiters. This
is relatively straightforward, as zil_commit_waiter_t already has
zcw_zio_error to hold the write IO error, which signals a fallback to
txg_wait_synced_flags(TXG_WAIT_SUSPEND), which will fail, and so the
waiter can now return an error from zil_commit().

However, commit waiters are normally signalled when their associated
write (LWB) completes. If the pool has suspended, those IOs may not
return for some time, or maybe not at all. We still want to signal those
waiters so they can return from zil_commit(). We have a list of those
in-flight LWBs on zl_lwb_list, so we can run through those, detach them
and signal them. The LWB itself is still in-flight, but no longer has
attached waiters, so when it returns there will be nothing to do.

(As an aside, ITXs can also supply completion callbacks, which are
called when they are destroyed. These are directly connected to LWBs
though, so are passed the error code and destroyed there too).

At this point, all ZIL waiters have been ejected, so we only have to
consider the internal state. We potentially still have ITXs that have
not been committed, LWBs still open, and LWBs in-flight. The on-disk ZIL
is in an unknown state; some writes may have been written but not
returned to us. We really can't rely on any of it; the best thing to do
is abandon it entirely and start over when the pool returns to service.
But, since we may have IO out that won't return until the pool resumes,
we need something for it to return to.

The simplest solution I could find, implemented here, is to "crash" the
ZIL: accept no new ITXs, make no further updates, and let it empty out
on its normal schedule, that is, as txgs complete and zil_sync() and
zil_clean() are called. We set a "restart txg" to three txgs in the
future (syncing + TXG_CONCURRENT_STATES), at which point all the
internal state will have been cleared out, and the ZIL can resume
operation (handled at the top of zil_clean()).

This commit adds zil_crash(), which handles all of the above:
 - sets the restart txg
 - capture and signal all waiters
 - zero the header

zil_crash() is called when txg_wait_synced_flags(TXG_WAIT_SUSPEND)
returns because the pool suspended (ESHUTDOWN).

The rest of the commit is just threading the errors through, and related
housekeeping.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17398
2025-08-08 16:43:26 -07:00

557 lines
16 KiB
Plaintext
Executable File

#!/usr/bin/env @PYTHON_SHEBANG@
# SPDX-License-Identifier: CDDL-1.0
#
# Print out statistics for all zil stats. This information is
# available through the zil kstat.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# This script must remain compatible with Python 3.6+.
#
import sys
import subprocess
import time
import copy
import os
import re
import signal
from collections import defaultdict
import argparse
from argparse import RawTextHelpFormatter
cols = {
# hdr: [size, scale, kstat name]
"time": [8, -1, "time"],
"pool": [12, -1, "pool"],
"ds": [12, -1, "dataset_name"],
"obj": [12, -1, "objset"],
"cc": [5, 1000, "zil_commit_count"],
"cwc": [5, 1000, "zil_commit_writer_count"],
"cec": [5, 1000, "zil_commit_error_count"],
"csc": [5, 1000, "zil_commit_stall_count"],
"cSc": [5, 1000, "zil_commit_suspend_count"],
"cCc": [5, 1000, "zil_commit_crash_count"],
"ic": [5, 1000, "zil_itx_count"],
"iic": [5, 1000, "zil_itx_indirect_count"],
"iib": [5, 1024, "zil_itx_indirect_bytes"],
"icc": [5, 1000, "zil_itx_copied_count"],
"icb": [5, 1024, "zil_itx_copied_bytes"],
"inc": [5, 1000, "zil_itx_needcopy_count"],
"inb": [5, 1024, "zil_itx_needcopy_bytes"],
"idc": [5, 1000, "icc+inc"],
"idb": [5, 1024, "icb+inb"],
"iwc": [5, 1000, "iic+idc"],
"iwb": [5, 1024, "iib+idb"],
"imnc": [6, 1000, "zil_itx_metaslab_normal_count"],
"imnb": [6, 1024, "zil_itx_metaslab_normal_bytes"],
"imnw": [6, 1024, "zil_itx_metaslab_normal_write"],
"imna": [6, 1024, "zil_itx_metaslab_normal_alloc"],
"imsc": [6, 1000, "zil_itx_metaslab_slog_count"],
"imsb": [6, 1024, "zil_itx_metaslab_slog_bytes"],
"imsw": [6, 1024, "zil_itx_metaslab_slog_write"],
"imsa": [6, 1024, "zil_itx_metaslab_slog_alloc"],
"imc": [5, 1000, "imnc+imsc"],
"imb": [5, 1024, "imnb+imsb"],
"imw": [5, 1024, "imnw+imsw"],
"ima": [5, 1024, "imna+imsa"],
"se%": [3, 100, "imb/ima"],
"sen%": [4, 100, "imnb/imna"],
"ses%": [4, 100, "imsb/imsa"],
"te%": [3, 100, "imb/imw"],
"ten%": [4, 100, "imnb/imnw"],
"tes%": [4, 100, "imsb/imsw"],
}
hdr = ["time", "ds", "cc", "ic", "idc", "idb", "iic", "iib",
"imnc", "imnw", "imsc", "imsw"]
ghdr = ["time", "cc", "ic", "idc", "idb", "iic", "iib",
"imnc", "imnw", "imsc", "imsw"]
cmd = ("Usage: zilstat [-hgdv] [-i interval] [-p pool_name]")
curr = {}
diff = {}
kstat = {}
ds_pairs = {}
pool_name = None
dataset_name = None
interval = 0
sep = " "
gFlag = True
dsFlag = False
def prettynum(sz, scale, num=0):
suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
index = 0
save = 0
if scale == -1:
return "%*s" % (sz, num)
# Rounding error, return 0
elif 0 < num < 1:
num = 0
while num > scale and index < 5:
save = num
num = num / scale
index += 1
if index == 0:
return "%*d" % (sz, num)
if (save / scale) < 10:
return "%*.1f%s" % (sz - 1, num, suffix[index])
else:
return "%*d%s" % (sz - 1, num, suffix[index])
def print_header():
global hdr
global sep
for col in hdr:
new_col = col
if interval > 0 and cols[col][1] > 100:
new_col += "/s"
sys.stdout.write("%*s%s" % (cols[col][0], new_col, sep))
sys.stdout.write("\n")
def print_values(v):
global hdr
global sep
for col in hdr:
val = v[cols[col][2]]
if interval > 0 and cols[col][1] > 100:
val = v[cols[col][2]] // interval
sys.stdout.write("%s%s" % (
prettynum(cols[col][0], cols[col][1], val), sep))
sys.stdout.write("\n")
def print_dict(d):
for pool in d:
for objset in d[pool]:
print_values(d[pool][objset])
def detailed_usage():
sys.stderr.write("%s\n" % cmd)
sys.stderr.write("Field definitions are as follows:\n")
for key in cols:
sys.stderr.write("%11s : %s\n" % (key, cols[key][2]))
sys.stderr.write("\n")
sys.exit(0)
def init():
global pool_name
global dataset_name
global interval
global hdr
global curr
global gFlag
global sep
curr = dict()
parser = argparse.ArgumentParser(description='Program to print zilstats',
add_help=True,
formatter_class=RawTextHelpFormatter,
epilog="\nUsage Examples\n"\
"Note: Global zilstats is shown by default,"\
" if none of a|p|d option is not provided\n"\
"\tzilstat -a\n"\
'\tzilstat -v\n'\
'\tzilstat -p tank\n'\
'\tzilstat -d tank/d1,tank/d2,tank/zv1\n'\
'\tzilstat -i 1\n'\
'\tzilstat -s \"***\"\n'\
'\tzilstat -f zcwc,zimnb,zimsb\n')
parser.add_argument(
"-v", "--verbose",
action="store_true",
help="List field headers and definitions"
)
pool_grp = parser.add_mutually_exclusive_group()
pool_grp.add_argument(
"-a", "--all",
action="store_true",
dest="all",
help="Print all dataset stats"
)
pool_grp.add_argument(
"-p", "--pool",
type=str,
help="Print stats for all datasets of a speicfied pool"
)
pool_grp.add_argument(
"-d", "--dataset",
type=str,
help="Print given dataset(s) (Comma separated)"
)
parser.add_argument(
"-f", "--columns",
type=str,
help="Specify specific fields to print (see -v)"
)
parser.add_argument(
"-s", "--separator",
type=str,
help="Override default field separator with custom "
"character or string"
)
parser.add_argument(
"-i", "--interval",
type=int,
dest="interval",
help="Print stats between specified interval"
" (in seconds)"
)
parsed_args = parser.parse_args()
if parsed_args.verbose:
detailed_usage()
if parsed_args.all:
gFlag = False
if parsed_args.interval:
interval = parsed_args.interval
if parsed_args.pool:
pool_name = parsed_args.pool
gFlag = False
if parsed_args.dataset:
dataset_name = parsed_args.dataset
gFlag = False
if parsed_args.separator:
sep = parsed_args.separator
if gFlag:
hdr = ghdr
if parsed_args.columns:
hdr = parsed_args.columns.split(",")
invalid = []
for ele in hdr:
if ele not in cols:
invalid.append(ele)
if len(invalid) > 0:
sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
sys.exit(1)
if pool_name and dataset_name:
print ("Error: Can not filter both dataset and pool")
sys.exit(1)
def FileCheck(fname):
try:
return (open(fname))
except IOError:
print ("Unable to open zilstat proc file: " + fname)
sys.exit(1)
if sys.platform.startswith('freebsd'):
# Requires py-sysctl on FreeBSD
import sysctl
def kstat_update(pool = None, objid = None):
global kstat
kstat = {}
if not pool:
file = "kstat.zfs.misc.zil"
k = [ctl for ctl in sysctl.filter(file) \
if ctl.type != sysctl.CTLTYPE_NODE]
kstat_process_str(k, file, "GLOBAL", len(file + "."))
elif objid:
file = "kstat.zfs." + pool + ".dataset.objset-" + objid
k = [ctl for ctl in sysctl.filter(file) if ctl.type \
!= sysctl.CTLTYPE_NODE]
kstat_process_str(k, file, objid, len(file + "."))
else:
file = "kstat.zfs." + pool + ".dataset"
zil_start = len(file + ".")
obj_start = len("kstat.zfs." + pool + ".")
k = [ctl for ctl in sysctl.filter(file)
if ctl.type != sysctl.CTLTYPE_NODE]
for s in k:
if not s or (s.name.find("zil") == -1 and \
s.name.find("dataset_name") == -1):
continue
name, value = s.name, s.value
objid = re.findall(r'0x[0-9A-F]+', \
name[obj_start:], re.I)[0]
if objid not in kstat:
kstat[objid] = dict()
zil_start = len(file + ".objset-" + \
objid + ".")
kstat[objid][name[zil_start:]] = value \
if (name.find("dataset_name")) \
else int(value)
def kstat_process_str(k, file, objset = "GLOBAL", zil_start = 0):
global kstat
if not k:
print("Unable to process kstat for: " + file)
sys.exit(1)
kstat[objset] = dict()
for s in k:
if not s or (s.name.find("zil") == -1 and \
s.name.find("dataset_name") == -1):
continue
name, value = s.name, s.value
kstat[objset][name[zil_start:]] = value \
if (name.find("dataset_name")) else int(value)
elif sys.platform.startswith('linux'):
def kstat_update(pool = None, objid = None):
global kstat
kstat = {}
if not pool:
k = [line.strip() for line in \
FileCheck("/proc/spl/kstat/zfs/zil")]
kstat_process_str(k, "/proc/spl/kstat/zfs/zil")
elif objid:
file = "/proc/spl/kstat/zfs/" + pool + "/objset-" + objid
k = [line.strip() for line in FileCheck(file)]
kstat_process_str(k, file, objid)
else:
if not os.path.exists(f"/proc/spl/kstat/zfs/{pool}"):
print("Pool \"" + pool + "\" does not exist, Exitting")
sys.exit(1)
objsets = os.listdir(f'/proc/spl/kstat/zfs/{pool}')
for objid in objsets:
if objid.find("objset-") == -1:
continue
file = "/proc/spl/kstat/zfs/" + pool + "/" + objid
k = [line.strip() for line in FileCheck(file)]
kstat_process_str(k, file, objid.replace("objset-", ""))
def kstat_process_str(k, file, objset = "GLOBAL", zil_start = 0):
global kstat
if not k:
print("Unable to process kstat for: " + file)
sys.exit(1)
kstat[objset] = dict()
for s in k:
if not s or (s.find("zil") == -1 and \
s.find("dataset_name") == -1):
continue
name, unused, value = s.split()
kstat[objset][name] = value \
if (name == "dataset_name") else int(value)
def zil_process_kstat():
global curr, pool_name, dataset_name, dsFlag, ds_pairs
curr.clear()
if gFlag == True:
kstat_update()
zil_build_dict()
else:
if pool_name:
kstat_update(pool_name)
zil_build_dict(pool_name)
elif dataset_name:
if dsFlag == False:
dsFlag = True
datasets = dataset_name.split(',')
ds_pairs = defaultdict(list)
for ds in datasets:
try:
objid = subprocess.check_output(['zfs',
'list', '-Hpo', 'objsetid', ds], \
stderr=subprocess.DEVNULL) \
.decode('utf-8').strip()
except subprocess.CalledProcessError as e:
print("Command: \"zfs list -Hpo objset "\
+ str(ds) + "\" failed with error code:"\
+ str(e.returncode))
print("Please make sure that dataset \""\
+ str(ds) + "\" exists")
sys.exit(1)
if not objid:
continue
ds_pairs[ds.split('/')[0]]. \
append(hex(int(objid)))
for pool, objids in ds_pairs.items():
for objid in objids:
kstat_update(pool, objid)
zil_build_dict(pool)
else:
try:
pools = subprocess.check_output(['zpool', 'list', '-Hpo',\
'name']).decode('utf-8').split()
except subprocess.CalledProcessError as e:
print("Command: \"zpool list -Hpo name\" failed with error"\
"code: " + str(e.returncode))
sys.exit(1)
for pool in pools:
kstat_update(pool)
zil_build_dict(pool)
def calculate_diff():
global curr, diff
prev = copy.deepcopy(curr)
zil_process_kstat()
diff = copy.deepcopy(curr)
for pool in curr:
for objset in curr[pool]:
for key in curr[pool][objset]:
if not isinstance(diff[pool][objset][key], int):
continue
# If prev is NULL, this is the
# first time we are here
if not prev:
diff[pool][objset][key] = 0
else:
diff[pool][objset][key] \
= curr[pool][objset][key] \
- prev[pool][objset][key]
def zil_build_dict(pool = "GLOBAL"):
global kstat
for objset in kstat:
for key in kstat[objset]:
val = kstat[objset][key]
if pool not in curr:
curr[pool] = dict()
if objset not in curr[pool]:
curr[pool][objset] = dict()
curr[pool][objset][key] = val
def zil_extend_dict():
global diff
for pool in diff:
for objset in diff[pool]:
diff[pool][objset]["pool"] = pool
diff[pool][objset]["objset"] = objset
diff[pool][objset]["time"] = time.strftime("%H:%M:%S", \
time.localtime())
diff[pool][objset]["icc+inc"] = \
diff[pool][objset]["zil_itx_copied_count"] + \
diff[pool][objset]["zil_itx_needcopy_count"]
diff[pool][objset]["icb+inb"] = \
diff[pool][objset]["zil_itx_copied_bytes"] + \
diff[pool][objset]["zil_itx_needcopy_bytes"]
diff[pool][objset]["iic+idc"] = \
diff[pool][objset]["zil_itx_indirect_count"] + \
diff[pool][objset]["zil_itx_copied_count"] + \
diff[pool][objset]["zil_itx_needcopy_count"]
diff[pool][objset]["iib+idb"] = \
diff[pool][objset]["zil_itx_indirect_bytes"] + \
diff[pool][objset]["zil_itx_copied_bytes"] + \
diff[pool][objset]["zil_itx_needcopy_bytes"]
diff[pool][objset]["imnc+imsc"] = \
diff[pool][objset]["zil_itx_metaslab_normal_count"] + \
diff[pool][objset]["zil_itx_metaslab_slog_count"]
diff[pool][objset]["imnb+imsb"] = \
diff[pool][objset]["zil_itx_metaslab_normal_bytes"] + \
diff[pool][objset]["zil_itx_metaslab_slog_bytes"]
diff[pool][objset]["imnw+imsw"] = \
diff[pool][objset]["zil_itx_metaslab_normal_write"] + \
diff[pool][objset]["zil_itx_metaslab_slog_write"]
diff[pool][objset]["imna+imsa"] = \
diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + \
diff[pool][objset]["zil_itx_metaslab_slog_alloc"]
if diff[pool][objset]["imna+imsa"] > 0:
diff[pool][objset]["imb/ima"] = 100 * \
diff[pool][objset]["imnb+imsb"] // \
diff[pool][objset]["imna+imsa"]
else:
diff[pool][objset]["imb/ima"] = 100
if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0:
diff[pool][objset]["imnb/imna"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_normal_alloc"]
else:
diff[pool][objset]["imnb/imna"] = 100
if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0:
diff[pool][objset]["imsb/imsa"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_slog_alloc"]
else:
diff[pool][objset]["imsb/imsa"] = 100
if diff[pool][objset]["imnw+imsw"] > 0:
diff[pool][objset]["imb/imw"] = 100 * \
diff[pool][objset]["imnb+imsb"] // \
diff[pool][objset]["imnw+imsw"]
else:
diff[pool][objset]["imb/imw"] = 100
if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0:
diff[pool][objset]["imnb/imnw"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_normal_write"]
else:
diff[pool][objset]["imnb/imnw"] = 100
if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0:
diff[pool][objset]["imsb/imsw"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_slog_write"]
else:
diff[pool][objset]["imsb/imsw"] = 100
def sign_handler_epipe(sig, frame):
print("Caught EPIPE signal: " + str(frame))
print("Exitting...")
sys.exit(0)
def main():
global interval
global curr, diff
hprint = False
init()
signal.signal(signal.SIGINT, signal.SIG_DFL)
signal.signal(signal.SIGPIPE, sign_handler_epipe)
zil_process_kstat()
if not curr:
print ("Error: No stats to show")
sys.exit(0)
print_header()
if interval > 0:
time.sleep(interval)
while True:
calculate_diff()
if not diff:
print ("Error: No stats to show")
sys.exit(0)
zil_extend_dict()
print_dict(diff)
time.sleep(interval)
else:
diff = curr
zil_extend_dict()
print_dict(diff)
if __name__ == '__main__':
main()