mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson
synced 2025-08-30 21:52:21 +00:00

Patch series "mm: reliable huge page allocator".

This series makes changes to the allocator and reclaim/compaction code to try harder to avoid fragmentation. As a result, this makes huge page allocations cheaper, more reliable and more sustainable.

It's a subset of the huge page allocator RFC initially proposed here:

  https://lore.kernel.org/lkml/20230418191313.268131-1-hannes@cmpxchg.org/

The following results are from a kernel build test, with additional concurrent bursts of THP allocations on a memory-constrained system. Comparing before and after the changes over 15 runs:

                                                before                   after
Hugealloc Time mean                52739.45 (  +0.00%)     28904.00 ( -45.19%)
Hugealloc Time stddev              56541.26 (  +0.00%)     33464.37 ( -40.81%)
Kbuild Real time                     197.47 (  +0.00%)       196.59 (  -0.44%)
Kbuild User time                    1240.49 (  +0.00%)      1231.67 (  -0.71%)
Kbuild System time                    70.08 (  +0.00%)        59.10 ( -15.45%)
THP fault alloc                    46727.07 (  +0.00%)     63223.67 ( +35.30%)
THP fault fallback                 21910.60 (  +0.00%)      5412.47 ( -75.29%)
Direct compact fail                  195.80 (  +0.00%)        59.07 ( -69.48%)
Direct compact success                 7.93 (  +0.00%)         2.80 ( -57.46%)
Direct compact success rate %          3.51 (  +0.00%)         3.99 ( +10.49%)
Compact daemon scanned migrate   3369601.27 (  +0.00%)   2267500.33 ( -32.71%)
Compact daemon scanned free      5075474.47 (  +0.00%)   2339773.00 ( -53.90%)
Compact direct scanned migrate    161787.27 (  +0.00%)     47659.93 ( -70.54%)
Compact direct scanned free       163467.53 (  +0.00%)     40729.67 ( -75.08%)
Compact total migrate scanned    3531388.53 (  +0.00%)   2315160.27 ( -34.44%)
Compact total free scanned       5238942.00 (  +0.00%)   2380502.67 ( -54.56%)
Alloc stall                         2371.07 (  +0.00%)       638.87 ( -73.02%)
Pages kswapd scanned             2160926.73 (  +0.00%)   4002186.33 ( +85.21%)
Pages kswapd reclaimed            533191.07 (  +0.00%)    718577.80 ( +34.77%)
Pages direct scanned              400450.33 (  +0.00%)    355172.73 ( -11.31%)
Pages direct reclaimed             94441.73 (  +0.00%)     31162.80 ( -67.00%)
Pages total scanned              2561377.07 (  +0.00%)   4357359.07 ( +70.12%)
Pages total reclaimed             627632.80 (  +0.00%)    749740.60 ( +19.46%)
Swap out                           47959.53 (  +0.00%)    110084.33 (+129.53%)
Swap in                             7276.00 (  +0.00%)     24457.00 (+236.10%)
File refaults                     138043.00 (  +0.00%)    188226.93 ( +36.35%)

THP latencies are cut in half, and failure rates are cut by 75%. These metrics also hold up over time, while the vanilla kernel sees a steady downward trend in success rates with each subsequent run, owed to the cumulative effects of fragmentation.

A more detailed discussion of results is in the patch changelogs.

The patches first introduce a vm.defrag_mode sysctl, which enforces the existing ALLOC_NOFRAGMENT alloc flag until after reclaim and compaction have run. They then change kswapd and kcompactd to target pageblocks, which boosts success in the ALLOC_NOFRAGMENT hotpaths.

Patches #1 and #2 are somewhat unrelated cleanups, but touch the same code and so are included here to avoid conflicts from re-ordering.

This patch (of 5):

compaction_suitable() hardcodes the min watermark, with a boost to the low watermark for costly orders. However, compaction_ready() requires order-0 at the high watermark. It currently checks the marks twice.

Make the watermark a parameter to compaction_suitable() and have the callers pass in what they require:

- compaction_zonelist_suitable() is used by the direct reclaim path, so use the min watermark.

- compact_suit_allocation_order() has a watermark in context derived from cc->alloc_flags.

  The only quirk is that kcompactd doesn't initialize cc->alloc_flags explicitly. There is a direct check in kcompactd_do_work() that passes ALLOC_WMARK_MIN, but there is another check downstack in compact_zone() that ends up passing the unset alloc_flags. Since they default to 0, and that coincides with ALLOC_WMARK_MIN, it is correct. But it's subtle. Set cc->alloc_flags explicitly.

- should_continue_reclaim() is direct reclaim, use the min watermark.

- Finally, consolidate the two checks in compaction_ready() to a single compaction_suitable() call passing the high watermark.
There is a tiny change in behavior: before, compaction_suitable() would check order-0 against min or low, depending on costly order. Then there'd be another high watermark check.

Now, the high watermark is passed to compaction_suitable(), and the costly order-boost (low - min) is added on top. This means compaction_ready() sets a marginally higher target for free pages.

In a kernelbuild + THP pressure test, though, this didn't show any measurable negative effects on memory pressure or reclaim rates. As the comment above the check says, reclaim is usually stopped short on should_continue_reclaim(), and this just defines the worst-case reclaim cutoff in case compaction is not making any headway.

[hughd@google.com: stop oops on out-of-range highest_zoneidx]
  Link: https://lkml.kernel.org/r/005ace8b-07fa-01d4-b54b-394a3e029c07@google.com
Link: https://lkml.kernel.org/r/20250313210647.1314586-1-hannes@cmpxchg.org
Link: https://lkml.kernel.org/r/20250313210647.1314586-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
154 lines
4.6 KiB
C
154 lines
4.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_COMPACTION_H
|
|
#define _LINUX_COMPACTION_H
|
|
|
|
/*
 * Determines how hard direct compaction should try to succeed.
 * Lower value means higher priority, analogously to reclaim priority.
 *
 * The MIN_*, DEF_* and INIT_* names are aliases of the three
 * COMPACT_PRIO_* levels and do not introduce additional values.
 */
enum compact_priority {
	COMPACT_PRIO_SYNC_FULL,
	MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL,
	COMPACT_PRIO_SYNC_LIGHT,
	MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
	DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
	COMPACT_PRIO_ASYNC,
	INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
};
|
|
|
|
/* Return values for compact_zone() and try_to_compact_pages() */
/* When adding new states, please adjust include/trace/events/compaction.h */
enum compact_result {
	/* For more detailed tracepoint output - internal to compaction */
	COMPACT_NOT_SUITABLE_ZONE,
	/*
	 * compaction didn't start as it was not possible or direct reclaim
	 * was more suitable
	 */
	COMPACT_SKIPPED,
	/* compaction didn't start as it was deferred due to past failures */
	COMPACT_DEFERRED,

	/* For more detailed tracepoint output - internal to compaction */
	COMPACT_NO_SUITABLE_PAGE,
	/* compaction should continue to another pageblock */
	COMPACT_CONTINUE,

	/*
	 * The full zone was scanned, but compacting suitable pages was not
	 * successful.
	 */
	COMPACT_COMPLETE,
	/*
	 * direct compaction has scanned part of the zone but wasn't successful
	 * to compact suitable pages.
	 */
	COMPACT_PARTIAL_SKIPPED,

	/* compaction terminated prematurely due to lock contentions */
	COMPACT_CONTENDED,

	/*
	 * direct compaction terminated after concluding that the allocation
	 * should now succeed
	 */
	COMPACT_SUCCESS,
};
|
|
|
|
struct alloc_context; /* in mm/internal.h */
|
|
|
|
/*
 * Number of free order-0 pages that should be available above given watermark
 * to make sure compaction has reasonable chance of not running out of free
 * pages that it needs to isolate as migration target during its work.
 *
 * Returns 2UL << order, i.e. twice (1 << order). No range check is done on
 * @order; it must be small enough for the shift to fit in unsigned long.
 */
static inline unsigned long compact_gap(unsigned int order)
{
	/*
	 * Although all the isolations for migration are temporary, compaction
	 * free scanner may have up to 1 << order pages on its list and then
	 * try to split an (order - 1) free page. At that point, a gap of
	 * 1 << order might not be enough, so it's safer to require twice that
	 * amount. Note that the number of pages on the list is also
	 * effectively limited by COMPACT_CLUSTER_MAX, as that's the maximum
	 * that the migrate scanner can have isolated on migrate list, and free
	 * scanner is only invoked when the number of isolated free pages is
	 * lower than that. But it's not worth to complicate the formula here
	 * as a bigger gap for higher orders than strictly necessary can also
	 * improve chances of compaction success.
	 */
	return 2UL << order;
}
|
|
|
|
static inline int current_is_kcompactd(void)
|
|
{
|
|
return current->flags & PF_KCOMPACTD;
|
|
}
|
|
|
|
#ifdef CONFIG_COMPACTION
|
|
|
|
extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order);
|
|
extern int fragmentation_index(struct zone *zone, unsigned int order);
|
|
extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
|
|
unsigned int order, unsigned int alloc_flags,
|
|
const struct alloc_context *ac, enum compact_priority prio,
|
|
struct page **page);
|
|
extern void reset_isolation_suitable(pg_data_t *pgdat);
|
|
extern bool compaction_suitable(struct zone *zone, int order,
|
|
unsigned long watermark, int highest_zoneidx);
|
|
|
|
extern void compaction_defer_reset(struct zone *zone, int order,
|
|
bool alloc_success);
|
|
|
|
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
|
|
int alloc_flags);
|
|
|
|
extern void __meminit kcompactd_run(int nid);
|
|
extern void __meminit kcompactd_stop(int nid);
|
|
extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx);
|
|
|
|
#else
|
|
/* Stub for builds without CONFIG_COMPACTION: does nothing. */
static inline void reset_isolation_suitable(pg_data_t *pgdat)
{
}
|
|
|
|
/*
 * Stub for builds without CONFIG_COMPACTION: always returns false and
 * ignores every argument.
 */
static inline bool compaction_suitable(struct zone *zone, int order,
				       unsigned long watermark,
				       int highest_zoneidx)
{
	return false;
}
|
|
|
|
/* Stub for builds without CONFIG_COMPACTION: does nothing, @nid is ignored. */
static inline void kcompactd_run(int nid)
{
}
|
|
/* Stub for builds without CONFIG_COMPACTION: does nothing, @nid is ignored. */
static inline void kcompactd_stop(int nid)
{
}
|
|
|
|
/* Stub for builds without CONFIG_COMPACTION: does nothing with its arguments. */
static inline void wakeup_kcompactd(pg_data_t *pgdat,
				    int order, int highest_zoneidx)
{
}
|
|
|
|
#endif /* CONFIG_COMPACTION */
|
|
|
|
struct node;
|
|
#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
|
|
/* Declared when CONFIG_COMPACTION, CONFIG_SYSFS and CONFIG_NUMA are all set. */
extern int compaction_register_node(struct node *node);
extern void compaction_unregister_node(struct node *node);
|
|
|
|
#else
|
|
|
|
/*
 * Stub used unless CONFIG_COMPACTION, CONFIG_SYSFS and CONFIG_NUMA are all
 * enabled: does nothing with @node and returns 0.
 */
static inline int compaction_register_node(struct node *node)
{
	return 0;
}
|
|
|
|
/*
 * Stub used unless CONFIG_COMPACTION, CONFIG_SYSFS and CONFIG_NUMA are all
 * enabled: does nothing with @node.
 */
static inline void compaction_unregister_node(struct node *node)
{
}
|
|
#endif /* CONFIG_COMPACTION && CONFIG_SYSFS && CONFIG_NUMA */
|
|
|
|
#endif /* _LINUX_COMPACTION_H */
|