mirror of
https://git.proxmox.com/git/mirror_ubuntu-kernels.git
synced 2025-12-03 12:52:48 +00:00
MPOL_LOCAL policy has been set up as a real policy, but it is still handled like a faked MPOL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places that have to distinguish the real 'prefer' policy from the 'local' policy, which is quite confusing. In current code, there are 4 cases in which MPOL_LOCAL is used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contain the 'preferred' node, it will perform as 'local' policy. So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. 
[feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
75 lines
2.5 KiB
C
75 lines
2.5 KiB
C
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
|
/*
|
|
* NUMA memory policies for Linux.
|
|
* Copyright 2003,2004 Andi Kleen SuSE Labs
|
|
*/
|
|
#ifndef _UAPI_LINUX_MEMPOLICY_H
|
|
#define _UAPI_LINUX_MEMPOLICY_H
|
|
|
|
#include <linux/errno.h>
|
|
|
|
|
|
/*
 * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
 * passed by the user to either set_mempolicy() or mbind() in a single 'int'
 * argument.
 * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
 */

/*
 * Policies
 *
 * Memory policy modes. The numeric value of each mode is written out
 * explicitly so that inserting or reordering members cannot silently
 * renumber an existing mode. MPOL_MAX is deliberately left implicit so it
 * always evaluates to "last mode + 1".
 */
enum {
	MPOL_DEFAULT	= 0,
	MPOL_PREFERRED	= 1,
	MPOL_BIND	= 2,
	MPOL_INTERLEAVE	= 3,
	MPOL_LOCAL	= 4,
	MPOL_MAX,	/* always last member of enum */
};

/*
 * Flags for set_mempolicy
 *
 * Optional mode flags. Each one is a single bit; the bits defined here are
 * 15, 14 and 13.
 */
#define MPOL_F_STATIC_NODES	(1 << 15)
#define MPOL_F_RELATIVE_NODES	(1 << 14)
#define MPOL_F_NUMA_BALANCING	(1 << 13) /* Optimize with NUMA balancing if possible */

/*
 * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
 * either set_mempolicy() or mbind().
 */
#define MPOL_MODE_FLAGS							\
	(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING)

/*
 * Flags for get_mempolicy
 *
 * One bit per flag, starting at bit 0.
 */
#define MPOL_F_NODE		(1<<0)	/* return next IL mode instead of node mask */
#define MPOL_F_ADDR		(1<<1)	/* look up vma using address */
#define MPOL_F_MEMS_ALLOWED	(1<<2)	/* return allowed memories */

/* Flags for mbind */
#define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
#define MPOL_MF_MOVE	(1<<1)	/* Move pages owned by this process to conform
				   to policy */
#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to policy */
#define MPOL_MF_LAZY	(1<<3)	/* Modifies '_MOVE': lazy migrate on fault */
#define MPOL_MF_INTERNAL (1<<4)	/* Internal flags start here */

/*
 * Mask of the mbind flags listed as valid. Note that it covers only
 * MPOL_MF_STRICT, MPOL_MF_MOVE and MPOL_MF_MOVE_ALL: MPOL_MF_LAZY and the
 * internal range starting at MPOL_MF_INTERNAL are not part of it.
 */
#define MPOL_MF_VALID	(MPOL_MF_STRICT   |	\
			 MPOL_MF_MOVE     |	\
			 MPOL_MF_MOVE_ALL)

/*
 * Internal flags that share the struct mempolicy flags word with
 * "mode flags".  These flags are allocated from bit 0 up, as they
 * are never OR'ed into the mode in mempolicy API arguments.
 *
 * Bits 1 and 2 are not assigned in this list.
 */
#define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
#define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON	(1 << 4) /* Migrate On protnone Reference On Node */

/*
 * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
 * ABI.  New bits are OK, but existing bits can never change.
 */
#define RECLAIM_ZONE	(1<<0)	/* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE	(1<<1)	/* Writeout pages during reclaim */
#define RECLAIM_UNMAP	(1<<2)	/* Unmap pages during reclaim */

#endif /* _UAPI_LINUX_MEMPOLICY_H */
|