mirror_ubuntu-kernels/include/linux/dax.h
Dan Williams 0aed55af88 x86, uaccess: introduce copy_from_iter_flushcache for pmem / cache-bypass operations
The pmem driver has a need to transfer data with a persistent memory
destination and be able to rely on the fact that the destination writes are not
cached. It is sufficient for the writes to be flushed to a cpu-store-buffer
(non-temporal / "movnt" in x86 terms), as we expect userspace to call fsync()
to ensure data-writes have reached a power-fail-safe zone in the platform. The
fsync() triggers a REQ_FUA or REQ_FLUSH to the pmem driver which will turn
around and fence previous writes with an "sfence".

Implement a __copy_from_user_inatomic_flushcache, memcpy_page_flushcache, and
memcpy_flushcache, that guarantee that the destination buffer is not dirty in
the cpu cache on completion. The new copy_from_iter_flushcache and sub-routines
will be used to replace the "pmem api" (include/linux/pmem.h +
arch/x86/include/asm/pmem.h). The availability of copy_from_iter_flushcache()
and memcpy_flushcache() are gated by the CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
config symbol, and fallback to copy_from_iter_nocache() and plain memcpy()
otherwise.

This is meant to satisfy the concern from Linus that if a driver wants to do
something beyond the normal nocache semantics it should be something private to
that driver [1], and Al's concern that anything uaccess related belongs with
the rest of the uaccess code [2].

The first consumer of this interface is a new 'copy_from_iter' dax operation so
that pmem can inject cache maintenance operations without imposing this
overhead on other dax-capable drivers.

[1]: https://lists.01.org/pipermail/linux-nvdimm/2017-January/008364.html
[2]: https://lists.01.org/pipermail/linux-nvdimm/2017-April/009942.html

Cc: <x86@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2017-06-09 09:09:56 -07:00

162 lines
4.6 KiB
C

#ifndef _LINUX_DAX_H
#define _LINUX_DAX_H
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/radix-tree.h>
#include <asm/pgtable.h>
struct iomap_ops;
struct dax_device;
struct dax_operations {
/*
* direct_access: translate a device-relative
* logical-page-offset into an absolute physical pfn. Return the
* number of pages available for DAX at that pfn.
*/
long (*direct_access)(struct dax_device *, pgoff_t, long,
void **, pfn_t *);
/* copy_from_iter: dax-driver override for default copy_from_iter */
size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
struct iov_iter *);
};
#if IS_ENABLED(CONFIG_DAX)
struct dax_device *dax_get_by_host(const char *host);
void put_dax(struct dax_device *dax_dev);
#else
static inline struct dax_device *dax_get_by_host(const char *host)
{
return NULL;
}
static inline void put_dax(struct dax_device *dax_dev)
{
}
#endif
int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
#if IS_ENABLED(CONFIG_FS_DAX)
int __bdev_dax_supported(struct super_block *sb, int blocksize);
static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
{
return __bdev_dax_supported(sb, blocksize);
}
static inline struct dax_device *fs_dax_get_by_host(const char *host)
{
return dax_get_by_host(host);
}
static inline void fs_put_dax(struct dax_device *dax_dev)
{
put_dax(dax_dev);
}
#else
static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
{
return -EOPNOTSUPP;
}
static inline struct dax_device *fs_dax_get_by_host(const char *host)
{
return NULL;
}
static inline void fs_put_dax(struct dax_device *dax_dev)
{
}
#endif
int dax_read_lock(void);
void dax_read_unlock(int id);
struct dax_device *alloc_dax(void *private, const char *host,
const struct dax_operations *ops);
bool dax_alive(struct dax_device *dax_dev);
void kill_dax(struct dax_device *dax_dev);
void *dax_get_private(struct dax_device *dax_dev);
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
void **kaddr, pfn_t *pfn);
/*
* We use lowest available bit in exceptional entry for locking, one bit for
* the entry size (PMD) and two more to tell us if the entry is a huge zero
* page (HZP) or an empty entry that is just used for locking. In total four
* special bits.
*
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
* EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
* block allocation.
*/
#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
static inline unsigned long dax_radix_sector(void *entry)
{
return (unsigned long)entry >> RADIX_DAX_SHIFT;
}
static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
{
return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
((unsigned long)sector << RADIX_DAX_SHIFT) |
RADIX_DAX_ENTRY_LOCK);
}
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops);
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
const struct iomap_ops *ops);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
pgoff_t index, void *entry, bool wake_all);
#ifdef CONFIG_FS_DAX
int __dax_zero_page_range(struct block_device *bdev,
struct dax_device *dax_dev, sector_t sector,
unsigned int offset, unsigned int length);
#else
static inline int __dax_zero_page_range(struct block_device *bdev,
struct dax_device *dax_dev, sector_t sector,
unsigned int offset, unsigned int length)
{
return -ENXIO;
}
#endif
#ifdef CONFIG_FS_DAX_PMD
static inline unsigned int dax_radix_order(void *entry)
{
if ((unsigned long)entry & RADIX_DAX_PMD)
return PMD_SHIFT - PAGE_SHIFT;
return 0;
}
#else
static inline unsigned int dax_radix_order(void *entry)
{
return 0;
}
#endif
int dax_pfn_mkwrite(struct vm_fault *vmf);
static inline bool vma_is_dax(struct vm_area_struct *vma)
{
return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
}
static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);
}
struct writeback_control;
int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc);
#endif