Grab the rangelock unconditionally in zfs_getpages()

As a deadlock avoidance measure, zfs_getpages() would only try to
acquire a rangelock, falling back to a single-page read if this was not
possible.  However, this is incompatible with direct I/O.

Instead, release the busy lock before trying to acquire the rangelock in
blocking mode.  This means that it's possible for the page to be
replaced, so we have to re-lookup.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Mark Johnston <markj@FreeBSD.org>
Closes #16643
Author:    Mark Johnston
Date:      2024-10-13 16:17:08 +00:00
Committer: Brian Behlendorf
Parent:    7313c6e382
Commit:    37e8f3ae17
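The locking dance described in the commit message can be hard to visualize from the diff alone. Below is a minimal user-space analogue using POSIX threads, not the kernel code itself: the two mutexes, the page_valid flag, and getpages_analogue() are illustrative stand-ins for the page busy lock, the znode rangelock, and zfs_getpages().

/*
 * Illustrative sketch only: pthread mutexes stand in for the page busy
 * lock and the znode rangelock; page_valid stands in for
 * vm_page_all_valid().
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t range_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t busy_lock = PTHREAD_MUTEX_INITIALIZER;
static bool page_valid;

static void
getpages_analogue(void)
{
	/* The caller hands us a busied ("exclusively locked") page. */
	pthread_mutex_lock(&busy_lock);

	if (pthread_mutex_trylock(&range_lock) != 0) {
		/*
		 * Another thread (the update_pages() analogue) holds the
		 * range lock and may be waiting on our busy lock.  Blocking
		 * here while still holding the busy lock could deadlock, so
		 * drop the busy lock first, block, then re-acquire it.
		 */
		pthread_mutex_unlock(&busy_lock);
		pthread_mutex_lock(&range_lock);
		pthread_mutex_lock(&busy_lock);
		/*
		 * While we slept the page may have been filled or replaced,
		 * so it must be revalidated before issuing a read.
		 */
	}

	if (!page_valid) {
		/* Read from backing store, i.e. the dmu_read_pages() step. */
		page_valid = true;
	}

	pthread_mutex_unlock(&busy_lock);
	pthread_mutex_unlock(&range_lock);
}

int
main(void)
{
	getpages_analogue();
	printf("page valid: %d\n", (int)page_valid);
	return (0);
}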


@@ -3930,6 +3930,7 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
 	if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
 		return (zfs_vm_pagerret_error);
 
+	object = ma[0]->object;
 	start = IDX_TO_OFF(ma[0]->pindex);
 	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
@@ -3938,33 +3939,47 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
 	 * Note that we need to handle the case of the block size growing.
 	 */
 	for (;;) {
+		uint64_t len;
+
 		blksz = zp->z_blksz;
+		len = roundup(end, blksz) - rounddown(start, blksz);
+
 		lr = zfs_rangelock_tryenter(&zp->z_rangelock,
-		    rounddown(start, blksz),
-		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+		    rounddown(start, blksz), len, RL_READER);
 		if (lr == NULL) {
-			if (rahead != NULL) {
-				*rahead = 0;
-				rahead = NULL;
-			}
-			if (rbehind != NULL) {
-				*rbehind = 0;
-				rbehind = NULL;
-			}
-			break;
+			/*
+			 * Avoid a deadlock with update_pages().  We need to
+			 * hold the range lock when copying from the DMU, so
+			 * give up the busy lock to allow update_pages() to
+			 * proceed.  We might need to allocate new pages, which
+			 * isn't quite right since this allocation isn't subject
+			 * to the page fault handler's OOM logic, but this is
+			 * the best we can do for now.
+			 */
+			for (int i = 0; i < count; i++) {
+				ASSERT(vm_page_none_valid(ma[i]));
+				vm_page_xunbusy(ma[i]);
+			}
+
+			lr = zfs_rangelock_enter(&zp->z_rangelock,
+			    rounddown(start, blksz), len, RL_READER);
+
+			zfs_vmobject_wlock(object);
+			(void) vm_page_grab_pages(object, OFF_TO_IDX(start),
+			    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO,
+			    ma, count);
+			zfs_vmobject_wunlock(object);
 		}
 		if (blksz == zp->z_blksz)
 			break;
 		zfs_rangelock_exit(lr);
 	}
 
-	object = ma[0]->object;
 	zfs_vmobject_wlock(object);
 	obj_size = object->un_pager.vnp.vnp_size;
 	zfs_vmobject_wunlock(object);
 	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
-		if (lr != NULL)
-			zfs_rangelock_exit(lr);
+		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (zfs_vm_pagerret_bad);
 	}
@@ -3989,11 +4004,30 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
 	 * ZFS will panic if we request DMU to read beyond the end of the last
 	 * allocated block.
 	 */
-	error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b,
-	    &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE));
+	for (int i = 0; i < count; i++) {
+		int count1, j, last_size;
+
+		if (vm_page_any_valid(ma[i])) {
+			ASSERT(vm_page_all_valid(ma[i]));
+			continue;
+		}
+		for (j = i + 1; j < count; j++) {
+			if (vm_page_any_valid(ma[j])) {
+				ASSERT(vm_page_all_valid(ma[j]));
+				break;
+			}
+		}
+		count1 = j - i;
+		last_size = j == count ?
+		    MIN(end, obj_size) - (end - PAGE_SIZE) : PAGE_SIZE;
+		error = dmu_read_pages(zfsvfs->z_os, zp->z_id, &ma[i], count1,
+		    i == 0 ? &pgsin_b : NULL, j == count ? &pgsin_a : NULL,
+		    last_size);
+		if (error != 0)
+			break;
+	}
 
-	if (lr != NULL)
-		zfs_rangelock_exit(lr);
+	zfs_rangelock_exit(lr);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE);
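A note on the new read loop above: after the busy locks are dropped and re-acquired, some pages in ma[] may already have been made valid by another thread, so the single dmu_read_pages() call becomes one call per run of still-invalid pages. The fragment below is a stand-alone sketch of that segmentation only, assuming a plain bool array in place of vm_page_any_valid() and a hypothetical read_run() in place of dmu_read_pages().

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for one dmu_read_pages() call on pages [i, j). */
static void
read_run(bool *valid, int i, int j)
{
	printf("read pages [%d, %d)\n", i, j);
	for (int k = i; k < j; k++)
		valid[k] = true;	/* the read marks the pages valid */
}

/*
 * Walk the page array and issue one read per maximal run of invalid
 * pages, skipping pages that became valid while the locks were dropped.
 * Mirrors the structure of the loop added in this commit.
 */
static void
read_invalid_runs(bool *valid, int count)
{
	for (int i = 0; i < count; i++) {
		int j;

		if (valid[i])
			continue;
		for (j = i + 1; j < count; j++) {
			if (valid[j])
				break;
		}
		/* j - i pages in this run, like count1 in the diff. */
		read_run(valid, i, j);
	}
}

int
main(void)
{
	bool valid[] = { false, false, true, false, true, true, false };

	read_invalid_runs(valid, 7);
	return (0);
}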