FreeBSD: zfs_putpages: don't undirty pages until after write completes

In syncing mode, zfs_putpages() would put the entire range of pages onto
the ZIL, then return VM_PAGER_OK for each page to the kernel. However,
an associated zil_commit() or txg sync had not happened at this point,
so the write may not actually be on disk.

So, we rework that case to use a ZIL commit callback, and do the
post-write work of undirtying the page and signaling completion there.
We return VM_PAGER_PEND to the kernel instead so it knows that we will
take care of it.

The original version of this (238eab7dc1) copied the Linux model and did
the cleanup in a ZIL callback for both async and sync. This was a
mistake, as FreeBSD does not have a separate "busy for writeback" flag
like Linux which keeps the page usable. The full sbusy flag locks the
entire page out until the itx callback fires, which for async is after
txg sync, which could be literal seconds in the future.

For the async case, the data is already on the DMU and the in-memory
ZIL, which is sufficient for async writeback, so the old method of
logging it without a callback, undirtying the page and returning is more
than sufficient and reclaims that lost performance.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Mark Johnston <markj@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17533
This commit is contained in:
Rob Norris 2025-07-01 09:24:23 +10:00 committed by Brian Behlendorf
parent ee2a2d941a
commit d323fbf49c
3 changed files with 75 additions and 14 deletions

View File

@ -35,6 +35,7 @@
extern const int zfs_vm_pagerret_bad;
extern const int zfs_vm_pagerret_error;
extern const int zfs_vm_pagerret_ok;
extern const int zfs_vm_pagerret_pend;
extern const int zfs_vm_pagerput_sync;
extern const int zfs_vm_pagerput_inval;

View File

@ -43,6 +43,7 @@
const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
const int zfs_vm_pagerret_ok = VM_PAGER_OK;
const int zfs_vm_pagerret_pend = VM_PAGER_PEND;
const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;

View File

@ -25,6 +25,7 @@
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2025, Klara, Inc.
*/
/* Portions Copyright 2007 Jeremy Teo */
@ -4299,6 +4300,33 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap)
ap->a_rahead));
}
typedef struct {
uint_t pca_npages;
vm_page_t pca_pages[];
} putpage_commit_arg_t;
static void
zfs_putpage_commit_cb(void *arg)
{
putpage_commit_arg_t *pca = arg;
vm_object_t object = pca->pca_pages[0]->object;
zfs_vmobject_wlock(object);
for (uint_t i = 0; i < pca->pca_npages; i++) {
vm_page_t pp = pca->pca_pages[i];
vm_page_undirty(pp);
vm_page_sunbusy(pp);
}
vm_object_pip_wakeupn(object, pca->pca_npages);
zfs_vmobject_wunlock(object);
kmem_free(pca,
offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages]));
}
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
int *rtvals)
@ -4400,10 +4428,12 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
}
if (zp->z_blksz < PAGE_SIZE) {
for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
vm_ooffset_t woff = off;
size_t wlen = len;
for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
tocopy = MIN(PAGE_SIZE, wlen);
va = zfs_map_page(ma[i], &sf);
dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx);
zfs_unmap_page(sf);
}
} else {
@ -4424,19 +4454,48 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
ASSERT0(err);
/*
* XXX we should be passing a callback to undirty
* but that would make the locking messier
*/
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
len, commit, B_FALSE, NULL, NULL);
zfs_vmobject_wlock(object);
for (i = 0; i < ncount; i++) {
rtvals[i] = zfs_vm_pagerret_ok;
vm_page_undirty(ma[i]);
if (commit) {
/*
* Caller requested that we commit immediately. We set
* a callback on the log entry, to be called once its
* on disk after the call to zil_commit() below. The
* pages will be undirtied and unbusied there.
*/
putpage_commit_arg_t *pca = kmem_alloc(
offsetof(putpage_commit_arg_t, pca_pages[ncount]),
KM_SLEEP);
pca->pca_npages = ncount;
memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);
for (i = 0; i < ncount; i++)
rtvals[i] = zfs_vm_pagerret_pend;
} else {
/*
* Caller just wants the page written back somewhere,
* but doesn't need it committed yet. We've already
* written it back to the DMU, so we just need to put
* it on the async log, then undirty the page and
* return.
*
* We cannot use a callback here, because it would keep
* the page busy (locked) until it is eventually
* written down at txg sync.
*/
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
B_FALSE, B_FALSE, NULL, NULL);
zfs_vmobject_wlock(object);
for (i = 0; i < ncount; i++) {
rtvals[i] = zfs_vm_pagerret_ok;
vm_page_undirty(ma[i]);
}
zfs_vmobject_wunlock(object);
}
zfs_vmobject_wunlock(object);
VM_CNT_INC(v_vnodeout);
VM_CNT_ADD(v_vnodepgsout, ncount);
}