
Currently zsmalloc, the backend memory allocator for zswap and zram, does not enforce any policy when allocating memory for compressed data. It simply adopts the memory policy of the task entering reclaim, or the default policy (prefer the local node) if no such policy is specified. This can lead to several pathological behaviors on multi-node NUMA systems:

1. Systems with CXL-based memory tiering can encounter the following inversion with zswap/zram: the coldest pages, demoted to the CXL tier, can return to the high tier when they are reclaimed to compressed swap, creating memory pressure on the high tier.

2. Consider a direct reclaimer scanning nodes in order of allocation preference. If it ventures into remote nodes, the memory it compresses there should stay there. Trying to shift those contents over to the reclaiming thread's preferred node further *increases* its local pressure, provoking more spills. The remote node is also the most likely to refault this data again. This undesirable behavior was pointed out by Johannes Weiner in [1].

3. For zswap writeback, zswap entries are organized in node-specific LRUs based on the node placement of the original pages, allowing targeted zswap writeback for specific nodes. However, the compressed data of a zswap entry can land on a different node from the LRU it sits on. Reclaim targeted at one node might therefore not free the memory used by that node's zswap entries, but instead free memory on a different node.

All of these issues are resolved if the compressed data go to the same node as the original page. This patch encourages that behavior by having zswap and zram pass the node of the original page to zsmalloc, and having zsmalloc prefer the specified node when it needs to allocate new (zs)pages for the compressed data (see the caller-side sketch after the commit trailers below).

Note that we are not strictly binding the allocation to the preferred node. We still allow the allocation to fall back to other nodes when the preferred node is full, or when we have zspages with slots available on a different node. This is OK, and still a strict improvement over the status quo:

1. On a system with demotion enabled, we will generally prefer demotion over compressed swapping, and only swap when pages have already gone to the lowest tier. This patch should achieve the desired effect for the most part.

2. If the preferred node is out of memory, letting the compressed data go to other nodes can be better than the alternatives (OOMs, keeping cold memory unreclaimed, disk swapping, etc.).

3. If the allocation goes to a separate node because an existing zspage there has slots available, at least we are not creating extra immediate memory pressure (the space is already allocated).

4. While there can be mixing, we generally reclaim pages in same-node batches, which encourages zspage grouping that is more likely to land on the right node.

5. A strict binding would require partitioning zsmalloc by node, which is more complicated and more prone to regression, since it reduces zsmalloc's storage density. We need to evaluate the tradeoff and benchmark carefully before adopting such an involved solution.
[1]: https://lore.kernel.org/linux-mm/20250331165306.GC2110528@cmpxchg.org/

[senozhatsky@chromium.org: coding-style fixes]
Link: https://lkml.kernel.org/r/mnvexa7kseswglcqbhlot4zg3b3la2ypv2rimdl5mh5glbmhvz@wi6bgqn47hge
Link: https://lkml.kernel.org/r/20250402204416.3435994-1-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Gregory Price <gourry@gourry.net>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: Sergey Senozhatsky <senozhatsky@chromium.org> [zram, zsmalloc]
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yosry Ahmed <yosry.ahmed@linux.dev> [zswap/zsmalloc]
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
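
To make the caller side concrete, below is a minimal, hedged sketch. It is not the actual zswap/zram code: the helper name store_compressed() and the GFP flags are illustrative assumptions, and only the zpool_malloc() and zpool_obj_write() signatures match the zpool.h header shown further down. The point is simply that the caller derives the preferred node from the page being compressed and hands it to the allocator.

/*
 * Hypothetical caller-side sketch (not the actual zswap/zram code):
 * derive the preferred NUMA node from the page being reclaimed and
 * hand it to the allocator so the compressed copy stays node-local.
 */
static int store_compressed(struct zpool *pool, struct page *page,
			    void *cdata, size_t clen, unsigned long *handle)
{
	int err;
	int nid = page_to_nid(page);	/* node of the original page */

	/*
	 * nid is a preference, not a hard binding: zsmalloc may still
	 * fall back to another node if the preferred one is full or an
	 * existing zspage elsewhere has free slots.
	 */
	err = zpool_malloc(pool, clen, GFP_NOWAIT | __GFP_NOWARN, handle, nid);
	if (err)
		return err;

	/* Copy the compressed bytes into the newly allocated object. */
	zpool_obj_write(pool, *handle, cdata, clen);
	return 0;
}

In the actual series, zswap and zram obtain the node in essentially this way, and zsmalloc turns it into a preferred-node page allocation when new zspages are needed.
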
include/linux/zpool.h · 87 lines · 2.2 KiB · C
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * zpool memory storage api
 *
 * Copyright (C) 2014 Dan Streetman
 *
 * This is a common frontend for the zswap compressed memory storage
 * implementations.
 */

#ifndef _ZPOOL_H_
#define _ZPOOL_H_

struct zpool;

bool zpool_has_pool(char *type);

struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp);

const char *zpool_get_type(struct zpool *pool);

void zpool_destroy_pool(struct zpool *pool);

int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
		 unsigned long *handle, const int nid);

void zpool_free(struct zpool *pool, unsigned long handle);

void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
			   void *local_copy);

void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
			void *handle_mem);

void zpool_obj_write(struct zpool *zpool, unsigned long handle,
		     void *handle_mem, size_t mem_len);

u64 zpool_get_total_pages(struct zpool *pool);

/**
 * struct zpool_driver - driver implementation for zpool
 * @type: name of the driver.
 * @owner: the module implementing the driver.
 * @refcount: reference count of active pools using this driver.
 * @list: entry in the list of zpool drivers.
 * @create: create a new pool.
 * @destroy: destroy a pool.
 * @malloc: allocate mem from a pool, preferring the given NUMA node.
 * @free: free mem from a pool.
 * @obj_read_begin: get a pointer to a stored object for reading.
 * @obj_read_end: release the pointer obtained with @obj_read_begin.
 * @obj_write: copy data into a stored object.
 * @total_pages: get the total number of pages used by a pool.
 *
 * This is created by a zpool implementation and registered
 * with zpool.
 */
struct zpool_driver {
	char *type;
	struct module *owner;
	atomic_t refcount;
	struct list_head list;

	void *(*create)(const char *name, gfp_t gfp);
	void (*destroy)(void *pool);

	int (*malloc)(void *pool, size_t size, gfp_t gfp,
		      unsigned long *handle, const int nid);
	void (*free)(void *pool, unsigned long handle);

	void *(*obj_read_begin)(void *pool, unsigned long handle,
				void *local_copy);
	void (*obj_read_end)(void *pool, unsigned long handle,
			     void *handle_mem);
	void (*obj_write)(void *pool, unsigned long handle,
			  void *handle_mem, size_t mem_len);

	u64 (*total_pages)(void *pool);
};

void zpool_register_driver(struct zpool_driver *driver);

int zpool_unregister_driver(struct zpool_driver *driver);

bool zpool_can_sleep_mapped(struct zpool *pool);

#endif
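
For completeness, here is a hedged, self-contained sketch of how a consumer might walk an object through the API declared above. It is hypothetical code, not taken from the kernel tree; the function name zpool_store_example(), the pool name, and the GFP flags are assumptions, and error handling is abbreviated.

/*
 * Hypothetical consumer of the zpool API declared above; not code from
 * the kernel tree. Assumes <linux/zpool.h>, <linux/slab.h>, <linux/gfp.h>,
 * and <linux/errno.h>.
 */
static int zpool_store_example(char *backend, void *cdata, size_t clen, int nid)
{
	struct zpool *pool;
	unsigned long handle;
	void *scratch, *mem;
	int err;

	if (!zpool_has_pool(backend))		/* e.g. backend = "zsmalloc" */
		return -ENODEV;

	pool = zpool_create_pool(backend, "example-pool", GFP_KERNEL);
	if (!pool)
		return -ENOMEM;

	/* Allocate space for the compressed object, preferring node @nid. */
	err = zpool_malloc(pool, clen, GFP_NOWAIT | __GFP_NOWARN, &handle, nid);
	if (err)
		goto out_destroy;

	/* Copy the compressed data into the allocated object. */
	zpool_obj_write(pool, handle, cdata, clen);

	/*
	 * Read it back: the scratch buffer is handed to the backend, which
	 * may use it as a bounce buffer for non-contiguous objects.
	 */
	scratch = kmalloc(clen, GFP_KERNEL);
	if (scratch) {
		mem = zpool_obj_read_begin(pool, handle, scratch);
		/* ... decompress or inspect clen bytes at @mem ... */
		zpool_obj_read_end(pool, handle, mem);
		kfree(scratch);
	}

	zpool_free(pool, handle);
out_destroy:
	zpool_destroy_pool(pool);
	return err;
}

A real consumer such as zswap keeps one long-lived pool and per-CPU scratch buffers rather than creating and destroying a pool per object; the sketch compresses the whole lifecycle into a single function only to show the call order.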