mirror of
https://git.proxmox.com/git/mirror_ubuntu-kernels.git
synced 2025-11-25 20:41:33 +00:00
kernfs: Introduce separate rwsem to protect inode attributes.
Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple
kernfs operations. On a large system with few hundred CPUs and few
hundred applications simultaneoulsy trying to access sysfs, this
results in multiple sys_open(s) contending on kernfs_rwsem via
kernfs_iop_permission and kernfs_dop_revalidate.
For example on a system with 384 cores, if I run 200 instances of an
application which is mostly executing the following loop:
for (int loop = 0; loop <100 ; loop++)
{
for (int port_num = 1; port_num < 2; port_num++)
{
for (int gid_index = 0; gid_index < 254; gid_index++ )
{
char ret_buf[64], ret_buf_lo[64];
char gid_file_path[1024];
int ret_len;
int ret_fd;
ssize_t ret_rd;
ub4 i, saved_errno;
memset(ret_buf, 0, sizeof(ret_buf));
memset(gid_file_path, 0, sizeof(gid_file_path));
ret_len = snprintf(gid_file_path, sizeof(gid_file_path),
"/sys/class/infiniband/%s/ports/%d/gids/%d",
dev_name,
port_num,
gid_index);
ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC);
if (ret_fd < 0)
{
printf("Failed to open %s\n", gid_file_path);
continue;
}
/* Read the GID */
ret_rd = read(ret_fd, ret_buf, 40);
if (ret_rd == -1)
{
printf("Failed to read from file %s, errno: %u\n",
gid_file_path, saved_errno);
continue;
}
close(ret_fd);
}
}
I see contention around kernfs_rwsem as follows:
path_openat
|
|----link_path_walk.part.0.constprop.0
| |
| |--49.92%--inode_permission
| | |
| | --48.69%--kernfs_iop_permission
| | |
| | |--18.16%--down_read
| | |
| | |--15.38%--up_read
| | |
| | --14.58%--_raw_spin_lock
| | |
| | -----
| |
| |--29.08%--walk_component
| | |
| | --29.02%--lookup_fast
| | |
| | |--24.26%--kernfs_dop_revalidate
| | | |
| | | |--14.97%--down_read
| | | |
| | | --9.01%--up_read
| | |
| | --4.74%--__d_lookup
| | |
| | --4.64%--_raw_spin_lock
| | |
| | ----
Having a separate per-fs rwsem to protect kernfs inode attributes,
will avoid the above mentioned contention and result in better
performance as can bee seen below:
path_openat
|
|----link_path_walk.part.0.constprop.0
| |
| |
| |--27.06%--inode_permission
| | |
| | --25.84%--kernfs_iop_permission
| | |
| | |--9.29%--up_read
| | |
| | |--8.19%--down_read
| | |
| | --7.89%--_raw_spin_lock
| | |
| | ----
| |
| |--22.42%--walk_component
| | |
| | --22.36%--lookup_fast
| | |
| | |--16.07%--__d_lookup
| | | |
| | | --16.01%--_raw_spin_lock
| | | |
| | | ----
| | |
| | --6.28%--kernfs_dop_revalidate
| | |
| | |--3.76%--down_read
| | |
| | --2.26%--up_read
As can be seen from the above data the overhead due to both
kerfs_iop_permission and kernfs_dop_revalidate have gone down and
this also reduces overall run time of the earlier mentioned loop.
Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
parent
02fe26f253
commit
9caf696142
@ -770,12 +770,15 @@ int kernfs_add_one(struct kernfs_node *kn)
|
||||
goto out_unlock;
|
||||
|
||||
/* Update timestamps on the parent */
|
||||
down_write(&root->kernfs_iattr_rwsem);
|
||||
|
||||
ps_iattr = parent->iattr;
|
||||
if (ps_iattr) {
|
||||
ktime_get_real_ts64(&ps_iattr->ia_ctime);
|
||||
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
|
||||
}
|
||||
|
||||
up_write(&root->kernfs_iattr_rwsem);
|
||||
up_write(&root->kernfs_rwsem);
|
||||
|
||||
/*
|
||||
@ -940,6 +943,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
|
||||
|
||||
idr_init(&root->ino_idr);
|
||||
init_rwsem(&root->kernfs_rwsem);
|
||||
init_rwsem(&root->kernfs_iattr_rwsem);
|
||||
INIT_LIST_HEAD(&root->supers);
|
||||
|
||||
/*
|
||||
@ -1462,11 +1466,14 @@ static void __kernfs_remove(struct kernfs_node *kn)
|
||||
pos->parent ? pos->parent->iattr : NULL;
|
||||
|
||||
/* update timestamps on the parent */
|
||||
down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
|
||||
|
||||
if (ps_iattr) {
|
||||
ktime_get_real_ts64(&ps_iattr->ia_ctime);
|
||||
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
|
||||
}
|
||||
|
||||
up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
|
||||
kernfs_put(pos);
|
||||
}
|
||||
|
||||
|
||||
@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
|
||||
int ret;
|
||||
struct kernfs_root *root = kernfs_root(kn);
|
||||
|
||||
down_write(&root->kernfs_rwsem);
|
||||
down_write(&root->kernfs_iattr_rwsem);
|
||||
ret = __kernfs_setattr(kn, iattr);
|
||||
up_write(&root->kernfs_rwsem);
|
||||
up_write(&root->kernfs_iattr_rwsem);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
|
||||
return -EINVAL;
|
||||
|
||||
root = kernfs_root(kn);
|
||||
down_write(&root->kernfs_rwsem);
|
||||
down_write(&root->kernfs_iattr_rwsem);
|
||||
error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
|
||||
if (error)
|
||||
goto out;
|
||||
@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
|
||||
setattr_copy(&nop_mnt_idmap, inode, iattr);
|
||||
|
||||
out:
|
||||
up_write(&root->kernfs_rwsem);
|
||||
up_write(&root->kernfs_iattr_rwsem);
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -189,10 +189,10 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap,
|
||||
struct kernfs_node *kn = inode->i_private;
|
||||
struct kernfs_root *root = kernfs_root(kn);
|
||||
|
||||
down_read(&root->kernfs_rwsem);
|
||||
down_read(&root->kernfs_iattr_rwsem);
|
||||
kernfs_refresh_inode(kn, inode);
|
||||
generic_fillattr(&nop_mnt_idmap, inode, stat);
|
||||
up_read(&root->kernfs_rwsem);
|
||||
up_read(&root->kernfs_iattr_rwsem);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -285,10 +285,10 @@ int kernfs_iop_permission(struct mnt_idmap *idmap,
|
||||
kn = inode->i_private;
|
||||
root = kernfs_root(kn);
|
||||
|
||||
down_read(&root->kernfs_rwsem);
|
||||
down_read(&root->kernfs_iattr_rwsem);
|
||||
kernfs_refresh_inode(kn, inode);
|
||||
ret = generic_permission(&nop_mnt_idmap, inode, mask);
|
||||
up_read(&root->kernfs_rwsem);
|
||||
up_read(&root->kernfs_iattr_rwsem);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -47,6 +47,7 @@ struct kernfs_root {
|
||||
|
||||
wait_queue_head_t deactivate_waitq;
|
||||
struct rw_semaphore kernfs_rwsem;
|
||||
struct rw_semaphore kernfs_iattr_rwsem;
|
||||
};
|
||||
|
||||
/* +1 to avoid triggering overflow warning when negating it */
|
||||
|
||||
Loading…
Reference in New Issue
Block a user