mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson
synced 2025-08-31 14:13:39 +00:00

Implement a new lightweight guard page feature, that is, regions of userland virtual memory that, when accessed, cause a fatal signal to arise. Currently users must establish PROT_NONE ranges to achieve this. However this is very costly memory-wise - we need a VMA for each and every one of these regions AND they become unmergeable with surrounding VMAs. In addition repeated mmap() calls require repeated kernel context switches and contention of the mmap lock to install these ranges, potentially also having to unmap memory if installed over existing ranges. The lightweight guard approach eliminates the VMA cost altogether - rather than establishing a PROT_NONE VMA, it operates at the level of page table entries - establishing PTE markers such that accesses to them cause a fault followed by a SIGSEGV signal being raised. This is achieved through the PTE marker mechanism, which we have already extended to provide PTE_MARKER_GUARD, which we installed via the generic page walking logic which we have extended for this purpose. These guard ranges are established with MADV_GUARD_INSTALL. If the range in which they are installed contains any existing mappings, they will be zapped, i.e. free the range and unmap memory (thus mimicking the behaviour of MADV_DONTNEED in this respect). Any existing guard entries will be left untouched. There is therefore no nesting of guarded pages. Guarded ranges are NOT cleared by MADV_DONTNEED nor MADV_FREE (in both instances the memory range may be reused at which point a user would expect guards to still be in place), but they are cleared via MADV_GUARD_REMOVE, process teardown or unmapping of memory ranges. The guard property can be removed from ranges via MADV_GUARD_REMOVE. The ranges over which this is applied, should they contain non-guard entries, will be untouched, with only guard entries being cleared. 
We permit this operation on anonymous memory only, and only VMAs which are non-special, non-huge and not mlock()'d (if we permitted this we'd have to drop locked pages which would be rather counterintuitive). Racing page faults can cause repeated attempts to install guard pages that are interrupted, result in a zap, and this process can end up being repeated. If this happens more than would be expected in normal operation, we rescind locks and retry the whole thing, which avoids lock contention in this scenario. Link: https://lkml.kernel.org/r/6aafb5821bf209f277dfae0787abb2ef87a37542.1730123433.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Suggested-by: Vlastimil Babka <vbabka@suse.cz> Suggested-by: Jann Horn <jannh@google.com> Suggested-by: David Hildenbrand <david@redhat.com> Suggested-by: Vlastimil Babka <vbabka@suse.cz> Suggested-by: Jann Horn <jannh@google.com> Suggested-by: David Hildenbrand <david@redhat.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Arnd Bergmann <arnd@kernel.org> Cc: Christian Brauner <brauner@kernel.org> Cc: Christoph Hellwig <hch@infradead.org> Cc: Chris Zankel <chris@zankel.net> Cc: Helge Deller <deller@gmx.de> Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com> Cc: Jeff Xu <jeffxu@chromium.org> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam R. Howlett <Liam.Howlett@Oracle.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Matt Turner <mattst88@gmail.com> Cc: Max Filippov <jcmvbkbc@gmail.com> Cc: Muchun Song <muchun.song@linux.dev> Cc: Paul E. McKenney <paulmck@kernel.org> Cc: Richard Henderson <richard.henderson@linaro.org> Cc: Shuah Khan <shuah@kernel.org> Cc: Shuah Khan <skhan@linuxfoundation.org> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
120 lines
4.7 KiB
C
120 lines
4.7 KiB
C
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 1995, 1999, 2002 by Ralf Baechle
 */
#ifndef _ASM_MMAN_H
#define _ASM_MMAN_H

/*
 * Protections are chosen from these bits, OR'd together.  The
 * implementation does not necessarily support PROT_EXEC or PROT_WRITE
 * without PROT_READ.  The only guarantees are that no writing will be
 * allowed without PROT_WRITE and no access will be allowed for PROT_NONE.
 */
#define PROT_NONE	0x00		/* page can not be accessed */
#define PROT_READ	0x01		/* page can be read */
#define PROT_WRITE	0x02		/* page can be written */
#define PROT_EXEC	0x04		/* page can be executed */
/*	0x08 reserved for PROT_EXEC_NOFLUSH */
#define PROT_SEM	0x10		/* page may be used for atomic ops */
#define PROT_GROWSDOWN	0x01000000	/* mprotect flag: extend change to start of growsdown vma */
#define PROT_GROWSUP	0x02000000	/* mprotect flag: extend change to end of growsup vma */

/*
 * Flags for mmap
 */
/* 0x01 - 0x03 are defined in linux/mman.h */
#define MAP_TYPE	0x00f		/* Mask for type of mapping */
#define MAP_FIXED	0x010		/* Interpret addr exactly */

/* not used by linux, but here to make sure we don't clash with ABI defines */
#define MAP_RENAME	0x020		/* Assign page to file */
#define MAP_AUTOGROW	0x040		/* File may grow by writing */
#define MAP_LOCAL	0x080		/* Copy on fork/sproc */
#define MAP_AUTORSRV	0x100		/* Logical swap reserved on demand */

/* These are linux-specific */
#define MAP_NORESERVE	0x0400		/* don't check for reservations */
#define MAP_ANONYMOUS	0x0800		/* don't use a file */
#define MAP_GROWSDOWN	0x1000		/* stack-like segment */
#define MAP_DENYWRITE	0x2000		/* ETXTBSY */
#define MAP_EXECUTABLE	0x4000		/* mark it as an executable */
#define MAP_LOCKED	0x8000		/* pages are locked */
#define MAP_POPULATE	0x10000		/* populate (prefault) pagetables */
#define MAP_NONBLOCK	0x20000		/* do not block on IO */
#define MAP_STACK	0x40000		/* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB	0x80000		/* create a huge page mapping */
#define MAP_FIXED_NOREPLACE 0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */

/*
 * Flags for msync
 */
#define MS_ASYNC	0x0001		/* sync memory asynchronously */
#define MS_INVALIDATE	0x0002		/* invalidate mappings & caches */
#define MS_SYNC		0x0004		/* synchronous memory sync */

/*
 * Flags for mlockall
 */
#define MCL_CURRENT	1		/* lock all current mappings */
#define MCL_FUTURE	2		/* lock all future mappings */
#define MCL_ONFAULT	4		/* lock all pages that are faulted in */

/*
 * Flags for mlock
 */
#define MLOCK_ONFAULT	0x01		/* Lock pages in range after they are
					 * faulted in, do not prefault */

#define MADV_NORMAL	0		/* no further special treatment */
#define MADV_RANDOM	1		/* expect random page references */
#define MADV_SEQUENTIAL 2		/* expect sequential page references */
#define MADV_WILLNEED	3		/* will need these pages */
#define MADV_DONTNEED	4		/* don't need these pages */

/* common parameters: try to keep these consistent across architectures */
#define MADV_FREE	8		/* free pages only if memory pressure */
#define MADV_REMOVE	9		/* remove these pages & resources */
#define MADV_DONTFORK	10		/* don't inherit across fork */
#define MADV_DOFORK	11		/* do inherit across fork */

#define MADV_MERGEABLE	 12		/* KSM may merge identical pages */
#define MADV_UNMERGEABLE 13		/* KSM may not merge identical pages */
#define MADV_HWPOISON	 100		/* poison a page for testing */

#define MADV_HUGEPAGE	14		/* Worth backing with hugepages */
#define MADV_NOHUGEPAGE 15		/* Not worth backing with hugepages */

#define MADV_DONTDUMP	16		/* Explicitly exclude from core dump,
					   overrides the coredump filter bits */
#define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */

#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */

#define MADV_COLD	20		/* deactivate these pages */
#define MADV_PAGEOUT	21		/* reclaim these pages */

#define MADV_POPULATE_READ	22	/* populate (prefault) page tables readable */
#define MADV_POPULATE_WRITE	23	/* populate (prefault) page tables writable */

#define MADV_DONTNEED_LOCKED	24	/* like DONTNEED, but drop locked pages too */

#define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */

#define MADV_GUARD_INSTALL 102		/* fatal signal on access to range */
#define MADV_GUARD_REMOVE 103		/* unguard range */

/* compatibility flags */
#define MAP_FILE	0

#define PKEY_DISABLE_ACCESS	0x1
#define PKEY_DISABLE_WRITE	0x2
#define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
				 PKEY_DISABLE_WRITE)

#endif /* _ASM_MMAN_H */