mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson
synced 2025-08-28 09:22:08 +00:00

When booting a new kernel with kexec_file, the kernel picks a target location that the kernel should live at, then allocates random pages, checks whether any of those pages magically happen to coincide with a target address range and, if so, uses them for that range. For every page allocated this way, it then creates a page list that the relocation code - code that executes while all CPUs are off and we are just about to jump into the new kernel - copies to their final memory location. We cannot put them there before, because chances are pretty good that at least some page in the target range is already in use by the currently running Linux environment. Copying happens from a single CPU at RAM rate, which takes around 4-50 ms per 100 MiB. All of this is inefficient and error-prone. To successfully kexec, we need to quiesce all devices of the outgoing kernel so they don't scribble over the new kernel's memory. We have seen cases where that does not happen properly (*cough* GIC *cough*) and hence the new kernel was corrupted. This started a month-long journey to root-cause failing kexecs to eventually see memory corruption, because the new kernel was corrupted severely enough that it could not emit output to tell us about the fact that it was corrupted. By allocating memory for the next kernel from a memory range that is guaranteed scribbling-free, we can boot the next kernel up to a point where it is at least able to detect corruption and maybe even stop it before it becomes severe. This increases the chance for successful kexecs. Since kexec got introduced, Linux has gained the CMA framework, which can perform physically contiguous memory allocations, while keeping that memory available for movable memory when it is not needed for contiguous allocations. The default CMA allocator is for DMA allocations. This patch adds logic to the kexec file loader to attempt to place the target payload at a location allocated from CMA. 
If successful, it uses that memory range directly instead of creating copy instructions during the hot phase. To ensure that there is a safety net in case anything goes wrong with the CMA allocation, it also adds a flag for user space to force disable CMA allocations. Using CMA allocations has two advantages: 1) Faster by 4-50 ms per 100 MiB. There is no more need to copy in the hot phase. 2) More robust. Even if by accident some page is still in use for DMA, the new kernel image will be safe from that access because it resides in a memory region that is considered allocated in the old kernel and has a chance to reinitialize that component. Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com Signed-off-by: Alexander Graf <graf@amazon.com> Acked-by: Baoquan He <bhe@redhat.com> Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Zhongkun He <hezhongkun.hzk@bytedance.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
70 lines
2.1 KiB
C
70 lines
2.1 KiB
C
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
|
#ifndef _UAPILINUX_KEXEC_H
|
|
#define _UAPILINUX_KEXEC_H
|
|
|
|
/* kexec system call - It loads the new kernel to boot into.
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do that yourself.
 */
|
|
|
|
#include <linux/types.h>
|
|
|
|
/* kexec flags for different usage scenarios */
#define KEXEC_ON_CRASH			0x00000001
#define KEXEC_PRESERVE_CONTEXT		0x00000002
#define KEXEC_UPDATE_ELFCOREHDR		0x00000004
#define KEXEC_CRASH_HOTPLUG_SUPPORT	0x00000008
/*
 * Mask for the upper 16 bits of the flags word. Every KEXEC_ARCH_* value in
 * this header is of the form (n << 16) and therefore falls inside this mask,
 * while the scenario flags above stay outside it.
 */
#define KEXEC_ARCH_MASK			0xffff0000
|
|
|
|
/*
 * Kexec file load interface flags.
 * KEXEC_FILE_UNLOAD : Unload already loaded kexec/kdump image.
 * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image.
 * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd
 *                           fd field.
 * KEXEC_FILE_DEBUG : NOTE(review): not described in this header; presumably
 *                    turns on debug output while loading - confirm against
 *                    the kexec_file_load implementation.
 * KEXEC_FILE_NO_CMA : Force-disable placing the target payload in memory
 *                     allocated from CMA; the loader then uses its regular
 *                     page allocation and copy path.
 */
#define KEXEC_FILE_UNLOAD	0x00000001
#define KEXEC_FILE_ON_CRASH	0x00000002
#define KEXEC_FILE_NO_INITRAMFS	0x00000004
#define KEXEC_FILE_DEBUG	0x00000008
#define KEXEC_FILE_NO_CMA	0x00000010
|
|
|
|
/* These values match the ELF architecture values.
 * Unless there is a good reason not to, that should continue to be the case.
 * Each value is shifted left by 16 so that it lands in the upper half of
 * the flags word.
 */
#define KEXEC_ARCH_DEFAULT	( 0 << 16)
#define KEXEC_ARCH_386		( 3 << 16)
#define KEXEC_ARCH_68K		( 4 << 16)
#define KEXEC_ARCH_PARISC	(15 << 16)
#define KEXEC_ARCH_X86_64	(62 << 16)
#define KEXEC_ARCH_PPC		(20 << 16)
#define KEXEC_ARCH_PPC64	(21 << 16)
#define KEXEC_ARCH_IA_64	(50 << 16)
#define KEXEC_ARCH_ARM		(40 << 16)
#define KEXEC_ARCH_S390		(22 << 16)
#define KEXEC_ARCH_SH		(42 << 16)
#define KEXEC_ARCH_MIPS_LE	(10 << 16)
#define KEXEC_ARCH_MIPS		( 8 << 16)
#define KEXEC_ARCH_AARCH64	(183 << 16)
#define KEXEC_ARCH_RISCV	(243 << 16)
#define KEXEC_ARCH_LOONGARCH	(258 << 16)
|
|
|
|
/* The artificial cap on the number of segments passed to kexec_load. */
#define KEXEC_SEGMENT_MAX 16
|
|
|
|
#ifndef __KERNEL__
/*
 * This structure is used to hold the arguments that are used when
 * loading kernel binaries.
 *
 * Only visible to user space (the definition is compiled out when
 * __KERNEL__ is defined).
 *
 * Field meanings below follow the kexec_load(2) manual page; they are not
 * spelled out in this header, so confirm against that page before relying
 * on them:
 *   buf/bufsz - location and size of the segment's data in the caller's
 *               address space.
 *   mem/memsz - target address range the segment is copied to.
 */
struct kexec_segment {
	const void *buf;
	__kernel_size_t bufsz;
	const void *mem;
	__kernel_size_t memsz;
};

#endif /* __KERNEL__ */
|
|
|
|
#endif /* _UAPILINUX_KEXEC_H */
|