mirror_ubuntu-kernels/include/uapi/linux/fcntl.h
Christian Brauner 820a185896 fcntl: add F_CREATED_QUERY
Systemd has a helper called openat_report_new() that returns whether a
file was created anew or it already existed before for cases where
O_CREAT has to be used without O_EXCL (cf. [1]). That apparently isn't
something that's specific to systemd but it's where I noticed it.

The current logic is that it first attempts to open the file without
O_CREAT | O_EXCL and if it gets ENOENT the helper tries again with both
flags. If that succeeds all is well. If it now reports EEXIST it
retries.

That works fairly well but some corner cases make this more involved. If
this operates on a dangling symlink the first openat() without O_CREAT |
O_EXCL will return ENOENT but the second openat() with O_CREAT | O_EXCL
will fail with EEXIST. The reason is that openat() without O_CREAT |
O_EXCL follows the symlink while O_CREAT | O_EXCL doesn't for security
reasons. So it's not something we can really change unless we add an
explicit opt-in via O_FOLLOW which seems really ugly.

The caller could try and use fanotify() to register to listen for
creation events in the directory before calling openat(). The caller
could then compare the returned tid to its own tid to ensure that even
in threaded environments it actually created the file. That might work
but is a lot of work for something that should be fairly simple and I'm
uncertain about it's reliability.

The caller could use a bpf lsm hook to hook into security_file_open() to
figure out whether they created the file. That also seems a bit wild.

So let's add F_CREATED_QUERY which allows the caller to check whether
they actually did create the file. That has caveats of course but I
don't think they are problematic:

* In multi-threaded environments a thread can only be sure that it did
  create the file if it calls openat() with O_CREAT. In other words,
  it's obviously not enough to just go through it's fdtable and check
  these fds because another thread could've created the file.

* If there's any codepaths where an openat() with O_CREAT would yield
  the same struct file as that of another thread it would obviously
  cause wrong results. I'm not aware of any such codepaths from openat()
  itself. Imho, that would be a bug.

* Related to the previous point, calling the new fcntl() on files created
  and opened via special-purpose system calls or ioctl()s would cause
  wrong results only if the affected subsystem a) raises FMODE_CREATED
  and b) may return the same struct file for two different calls. I'm
  not seeing anything outside of regular VFS code that raises
  FMODE_CREATED.

  There is code for b) in e.g., the drm layer where the same struct file
  is resurfaced but again FMODE_CREATED isn't used and it would be very
  misleading if it did.

Link: 11d5e2b5fb/src/basic/fs-util.c (L1078) [1]
Link: https://lore.kernel.org/r/20240724-work-fcntl-v1-1-e8153a2f1991@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-08-19 13:44:34 +02:00

129 lines
4.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_FCNTL_H
#define _UAPI_LINUX_FCNTL_H
#include <asm/fcntl.h>
#include <linux/openat2.h>
#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0)
#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1)
/*
* Request nofications on a directory.
* See below for events that may be notified.
*/
#define F_NOTIFY (F_LINUX_SPECIFIC_BASE + 2)
#define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3)
/* Was the file just created? */
#define F_CREATED_QUERY (F_LINUX_SPECIFIC_BASE + 4)
/*
* Cancel a blocking posix lock; internal use only until we expose an
* asynchronous lock api to userspace:
*/
#define F_CANCELLK (F_LINUX_SPECIFIC_BASE + 5)
/* Create a file descriptor with FD_CLOEXEC set. */
#define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6)
/*
* Set and get of pipe page size array
*/
#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
/*
* Set/Get seals
*/
#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
/*
* Types of seals
*/
#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
#define F_SEAL_GROW 0x0004 /* prevent file from growing */
#define F_SEAL_WRITE 0x0008 /* prevent writes */
#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */
/* (1U << 31) is reserved for signed error codes */
/*
* Set/Get write life time hints. {GET,SET}_RW_HINT operate on the
* underlying inode, while {GET,SET}_FILE_RW_HINT operate only on
* the specific file.
*/
#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11)
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
/*
* Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
* used to clear any hints previously set.
*/
#define RWH_WRITE_LIFE_NOT_SET 0
#define RWH_WRITE_LIFE_NONE 1
#define RWH_WRITE_LIFE_SHORT 2
#define RWH_WRITE_LIFE_MEDIUM 3
#define RWH_WRITE_LIFE_LONG 4
#define RWH_WRITE_LIFE_EXTREME 5
/*
* The originally introduced spelling is remained from the first
* versions of the patch set that introduced the feature, see commit
* v4.13-rc1~212^2~51.
*/
#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
/*
* Types of directory notifications that may be requested.
*/
#define DN_ACCESS 0x00000001 /* File accessed */
#define DN_MODIFY 0x00000002 /* File modified */
#define DN_CREATE 0x00000004 /* File created */
#define DN_DELETE 0x00000008 /* File removed */
#define DN_RENAME 0x00000010 /* File renamed */
#define DN_ATTRIB 0x00000020 /* File changed attibutes */
#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */
/*
* The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is
* meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to
* unlinkat. The two functions do completely different things and therefore,
* the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to
* faccessat would be undefined behavior and thus treating it equivalent to
* AT_EACCESS is valid undefined behavior.
*/
#define AT_FDCWD -100 /* Special value used to indicate
openat should use the current
working directory. */
#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */
#define AT_EACCESS 0x200 /* Test access permitted for
effective IDs, not real IDs. */
#define AT_REMOVEDIR 0x200 /* Remove directory instead of
unlinking file. */
#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */
#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */
#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */
#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */
#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */
#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */
#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */
#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */
#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to
compare object identity and may not
be usable to open_by_handle_at(2) */
#if defined(__KERNEL__)
#define AT_GETATTR_NOSEC 0x80000000
#endif
#endif /* _UAPI_LINUX_FCNTL_H */