Patch series "kexec: introduce Kexec HandOver (KHO)", v8.

Kexec today considers itself purely a boot loader: when we enter the new kernel, any state the previous kernel left behind is irrelevant and the new kernel reinitializes the system.

However, there are use cases where this mode of operation is not what we actually want. In virtualization hosts, for example, we want to use kexec to update the host kernel while virtual machine memory stays untouched. When we add device assignment to the mix, we also need to ensure that IOMMU and VFIO state is untouched. If we add PCIe peer-to-peer DMA, we need to do the same for the PCI subsystem. If we want to kexec while an SEV-SNP enabled virtual machine is running, we need to preserve the VM context pages and physical memory. See the "pkernfs: Persisting guest memory and kernel/device state safely across kexec" presentation from Linux Plumbers Conference 2023 for details: https://lpc.events/event/17/contributions/1485/

To start us on the journey to support all the use cases above, this patch series implements basic infrastructure for handing over kernel state across kexec (Kexec HandOver, aka KHO). As a really simple example target, we use memblock's reserve_mem. With this patch set applied, memory that was reserved using "reserve_mem" command line options remains intact after kexec, and it is guaranteed to reside at the same physical address.

== Alternatives ==

There are alternative approaches to (parts of) the problems above:

* Memory Pools [1] - preallocated persistent memory region + allocator
* PRMEM [2] - resizable persistent memory regions with fixed metadata pointer on the kernel command line + allocator
* Pkernfs [3] - preallocated file system for in-kernel data with fixed address location on the kernel command line
* PKRAM [4] - handover of user space pages using a fixed metadata page specified via command line

All of the approaches above fundamentally have the same problem: they require the administrator to explicitly carve out a physical memory location, because they have no mechanism outside of the kernel command line to pass data (including memory reservations) between kexec'ing kernels.

KHO provides that base foundation. We will determine later whether we still need any of the approaches above for fast bulk memory handover of, for example, IOMMU page tables. But IMHO they would all be users of KHO, with KHO providing the foundational primitive to pass metadata and bulk memory reservations, as well as easy versioning for data.

== Overview ==

We introduce a metadata file that the kernels pass between each other. How they pass it is architecture specific. The file's format is a Flattened Device Tree (FDT), which has a generator and parser already included in Linux.

KHO is enabled on the kernel command line by `kho=on`. When the root user finalizes KHO through /sys/kernel/debug/kho/out/finalize, the kernel invokes callbacks to every KHO user to register preserved memory regions, which contain drivers' states. When the actual kexec happens, the FDT is part of the image set that we boot into. In addition, we keep "scratch regions" available for kexec: physically contiguous memory regions that are guaranteed not to contain any memory that KHO would preserve. The new kernel bootstraps itself using the scratch regions and marks all handed-over memory as in use. When drivers that support KHO initialize, they introspect the FDT, restore preserved memory regions, and retrieve their states stored in the preserved memory. A sketch of such a KHO user follows below.
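To make the finalize callback flow concrete, here is a minimal sketch of what a KHO user could look like, assuming the notifier-based interface this series introduces. The helpers register_kho_notifier(), kho_preserve_phys() and kho_add_subtree(), the KEXEC_KHO_FINALIZE/KEXEC_KHO_ABORT events, and every my_* symbol are assumptions drawn from this patch set for illustration, not a stable API:

/*
 * Illustrative only: a minimal KHO user following the notifier-based
 * flow described above.  All my_* symbols are hypothetical driver
 * state; the kho_* helpers and events are assumptions based on this
 * patch series.
 */
#include <linux/init.h>
#include <linux/kexec_handover.h>
#include <linux/notifier.h>

static phys_addr_t my_state_phys;	/* hypothetical: driver state base */
static size_t my_state_size;		/* hypothetical: driver state size */
static void *my_fdt;			/* hypothetical: prebuilt sub-FDT  */

static int my_kho_notifier(struct notifier_block *nb,
			   unsigned long action, void *data)
{
	struct kho_serialization *ser = data;
	int err;

	switch (action) {
	case KEXEC_KHO_FINALIZE:
		/* Mark the state pages as preserved across kexec ... */
		err = kho_preserve_phys(my_state_phys, my_state_size);
		if (err)
			return notifier_from_errno(err);
		/* ... and publish a sub-FDT describing where they live. */
		return notifier_from_errno(
			kho_add_subtree(ser, "my-driver", my_fdt));
	case KEXEC_KHO_ABORT:
		/* Undo whatever FINALIZE did; nothing persistent here. */
		return NOTIFY_DONE;
	}
	return NOTIFY_BAD;
}

static struct notifier_block my_kho_nb = {
	.notifier_call = my_kho_notifier,
};

static int __init my_driver_init(void)
{
	return register_kho_notifier(&my_kho_nb);
}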
== Limitations ==

Currently KHO is only implemented for file-based kexec. The kernel interfaces in the patch set are already in place to support user space kexec as well, but it is not yet implemented in kexec-tools.

== How to Use ==

To use the code, boot the kernel with the "kho=on" command line parameter. KHO will automatically create scratch regions. If you want to set the scratch size explicitly, use the "kho_scratch=" command line parameter. For instance, "kho_scratch=16M,512M,256M" will reserve a 16 MiB low-memory scratch area, a 512 MiB global scratch region, and a 256 MiB per-NUMA-node scratch region on boot.

Make sure to have a reserved memory range requested with the "reserve_mem" command line option, for example, "reserve_mem=64m:4k:n1".

Then, before you invoke file-based "kexec -l", finalize the KHO FDT:

# echo 1 > /sys/kernel/debug/kho/out/finalize

You can preview the generated FDT using `dtc`:

# dtc /sys/kernel/debug/kho/out/fdt
# dtc /sys/kernel/debug/kho/out/sub_fdts/memblock

`dtc` is available on Ubuntu via `sudo apt-get install device-tree-compiler`.

Now kexec into the new kernel:

# kexec -l Image --initrd=initrd -s
# kexec -e

(The order of KHO finalization and "kexec -l" does not matter.)

The new kernel will boot up and contain the previous kernel's reserve_mem contents at the same physical address as the first kernel.

You can also review the FDT passed from the old kernel:

# dtc /sys/kernel/debug/kho/in/fdt
# dtc /sys/kernel/debug/kho/in/sub_fdts/memblock

This patch (of 17):

Add a memblock flag to denote areas that were reserved for kernel use, either directly with memblock_reserve_kern() or via memblock allocations (a sketch of the distinction follows the trailers below).

Link: https://lore.kernel.org/lkml/20250424083258.2228122-1-changyuanl@google.com/
Link: https://lore.kernel.org/lkml/aAeaJ2iqkrv_ffhT@kernel.org/
Link: https://lore.kernel.org/lkml/35c58191-f774-40cf-8d66-d1e2aaf11a62@intel.com/
Link: https://lore.kernel.org/lkml/20250424093302.3894961-1-arnd@kernel.org/
Link: https://lkml.kernel.org/r/20250509074635.3187114-1-changyuanl@google.com
Link: https://lkml.kernel.org/r/20250509074635.3187114-2-changyuanl@google.com
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Co-developed-by: Changyuan Lyu <changyuanl@google.com>
Signed-off-by: Changyuan Lyu <changyuanl@google.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anthony Yznaga <anthony.yznaga@oracle.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ashish Kalra <ashish.kalra@amd.com>
Cc: Ben Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Gowans <jgowans@amazon.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Rob Herring <robh@kernel.org>
Cc: Saravana Kannan <saravanak@google.com>
Cc: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Lendacky <thomas.lendacky@amd.com>
Cc: Will Deacon <will@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
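As promised above, a minimal sketch of the distinction this first patch introduces, assuming the memblock_reserve_kern() helper and the MEMBLOCK_RSRV_KERN flag named by this series; example_reservations() and its arguments are hypothetical:

#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/sizes.h>

/* Hypothetical boot-time caller, for illustration only. */
void __init example_reservations(phys_addr_t fw_base, phys_addr_t kern_base)
{
	/* Opaque firmware region: reserved, but not kernel state. */
	memblock_reserve(fw_base, SZ_1M);

	/*
	 * Kernel-owned state: reserved and additionally flagged
	 * (MEMBLOCK_RSRV_KERN), so it can later be distinguished from
	 * firmware reservations.
	 */
	memblock_reserve_kern(kern_base, SZ_1M);

	/* memblock_alloc() allocations are implicitly kernel-owned. */
}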
// SPDX-License-Identifier: GPL-2.0-or-later
#include "alloc_helpers_api.h"

/*
 * A simple test that tries to allocate a memory region above a specified,
 * aligned address:
 *
 *             +
 *  |          +-----------+         |
 *  |          |    rgn    |         |
 *  +----------+-----------+---------+
 *             ^
 *             |
 *             Aligned min_addr
 *
 * Expect to allocate a cleared region at the minimal memory address.
 */
static int alloc_from_simple_generic_check(void)
{
	struct memblock_region *rgn = &memblock.reserved.regions[0];
	void *allocated_ptr = NULL;
	phys_addr_t size = SZ_16;
	phys_addr_t min_addr;

	PREFIX_PUSH();
	setup_memblock();

	min_addr = memblock_end_of_DRAM() - SMP_CACHE_BYTES;

	allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr);

	ASSERT_NE(allocated_ptr, NULL);
	ASSERT_MEM_EQ(allocated_ptr, 0, size);

	ASSERT_EQ(rgn->size, size);
	ASSERT_EQ(rgn->base, min_addr);

	ASSERT_EQ(memblock.reserved.cnt, 1);
	ASSERT_EQ(memblock.reserved.total_size, size);

	test_pass_pop();

	return 0;
}

/*
 * A test that tries to allocate a memory region above a certain address.
 * The minimal address here is not aligned:
 *
 *         +      +
 *  |      +      +---------+       |
 *  |      |      |   rgn   |       |
 *  +------+------+---------+------------+
 *         ^      ^------.
 *         |             |
 *       min_addr        Aligned address
 *                       boundary
 *
 * Expect to allocate a cleared region at the closest aligned memory address.
 */
static int alloc_from_misaligned_generic_check(void)
{
	struct memblock_region *rgn = &memblock.reserved.regions[0];
	void *allocated_ptr = NULL;
	phys_addr_t size = SZ_32;
	phys_addr_t min_addr;

	PREFIX_PUSH();
	setup_memblock();

	/* A misaligned address */
	min_addr = memblock_end_of_DRAM() - (SMP_CACHE_BYTES * 2 - 1);

	allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr);

	ASSERT_NE(allocated_ptr, NULL);
	ASSERT_MEM_EQ(allocated_ptr, 0, size);

	ASSERT_EQ(rgn->size, size);
	ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - SMP_CACHE_BYTES);

	ASSERT_EQ(memblock.reserved.cnt, 1);
	ASSERT_EQ(memblock.reserved.total_size, size);

	test_pass_pop();

	return 0;
}

/*
 * A test that tries to allocate a memory region above an address that is too
 * close to the end of the memory:
 *
 *              +        +
 *  |           +--------+---+      |
 *  |           |   rgn  +   |      |
 *  +-----------+--------+---+------+
 *              ^        ^
 *              |        |
 *              |        min_addr
 *              |
 *              Aligned address
 *              boundary
 *
 * Expect to prioritize granting memory over satisfying the minimal address
 * requirement.
 */
static int alloc_from_top_down_high_addr_check(void)
{
	struct memblock_region *rgn = &memblock.reserved.regions[0];
	void *allocated_ptr = NULL;
	phys_addr_t size = SZ_32;
	phys_addr_t min_addr;

	PREFIX_PUSH();
	setup_memblock();

	/* The address is too close to the end of the memory */
	min_addr = memblock_end_of_DRAM() - SZ_16;

	allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr);

	ASSERT_NE(allocated_ptr, NULL);
	ASSERT_EQ(rgn->size, size);
	ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - SMP_CACHE_BYTES);

	ASSERT_EQ(memblock.reserved.cnt, 1);
	ASSERT_EQ(memblock.reserved.total_size, size);

	test_pass_pop();

	return 0;
}

/*
 * A test that tries to allocate a memory region when there is no space
 * available above the minimal address:
 *
 *                   +
 *  |        +---------+-------------|
 *  |        |   rgn   |             |
 *  +--------+---------+-------------+
 *           ^
 *           |
 *           min_addr
 *
 * Expect to prioritize granting memory over satisfying the minimal address
 * requirement and to allocate next to the previously reserved region. The
 * regions get merged into one.
 */
static int alloc_from_top_down_no_space_above_check(void)
{
	struct memblock_region *rgn = &memblock.reserved.regions[0];
	void *allocated_ptr = NULL;
	phys_addr_t r1_size = SZ_64;
	phys_addr_t r2_size = SZ_2;
	phys_addr_t total_size = r1_size + r2_size;
	phys_addr_t min_addr;

	PREFIX_PUSH();
	setup_memblock();

	min_addr = memblock_end_of_DRAM() - SMP_CACHE_BYTES * 2;

	/* No space above this address */
	memblock_reserve_kern(min_addr, r2_size);

	allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr);

	ASSERT_NE(allocated_ptr, NULL);
	ASSERT_EQ(rgn->base, min_addr - r1_size);
	ASSERT_EQ(rgn->size, total_size);

	ASSERT_EQ(memblock.reserved.cnt, 1);
	ASSERT_EQ(memblock.reserved.total_size, total_size);

	test_pass_pop();

	return 0;
}

/*
 * A test that tries to allocate a memory region with a minimal address below
 * the start address of the available memory. As the allocation is top-down,
 * first reserve a region that will force allocation near the start.
 * Expect successful allocation and merge of both regions.
 */
static int alloc_from_top_down_min_addr_cap_check(void)
{
	struct memblock_region *rgn = &memblock.reserved.regions[0];
	void *allocated_ptr = NULL;
	phys_addr_t r1_size = SZ_64;
	phys_addr_t min_addr;
	phys_addr_t start_addr;

	PREFIX_PUSH();
	setup_memblock();

	start_addr = (phys_addr_t)memblock_start_of_DRAM();
	min_addr = start_addr - SMP_CACHE_BYTES * 3;

	memblock_reserve_kern(start_addr + r1_size, MEM_SIZE - r1_size);

	allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr);

	ASSERT_NE(allocated_ptr, NULL);
	ASSERT_EQ(rgn->base, start_addr);
	ASSERT_EQ(rgn->size, MEM_SIZE);

	ASSERT_EQ(memblock.reserved.cnt, 1);
	ASSERT_EQ(memblock.reserved.total_size, MEM_SIZE);

	test_pass_pop();

	return 0;
}

/*
 * A test that tries to allocate a memory region above an address that is too
 * close to the end of the memory:
 *
 *                             +
 *  |-----------+              +     |
 *  |    rgn    |              |     |
 *  +-----------+--------------+-----+
 *  ^                          ^
 *  |                          |
 *  Aligned address            min_addr
 *  boundary
 *
 * Expect to prioritize granting memory over satisfying the minimal address
 * requirement. Allocation happens at the beginning of the available memory.
 */
static int alloc_from_bottom_up_high_addr_check(void)
{
	struct memblock_region *rgn = &memblock.reserved.regions[0];
	void *allocated_ptr = NULL;
	phys_addr_t size = SZ_32;
	phys_addr_t min_addr;

	PREFIX_PUSH();
	setup_memblock();

	/* The address is too close to the end of the memory */
	min_addr = memblock_end_of_DRAM() - SZ_8;

	allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr);

	ASSERT_NE(allocated_ptr, NULL);
	ASSERT_EQ(rgn->size, size);
	ASSERT_EQ(rgn->base, memblock_start_of_DRAM());

	ASSERT_EQ(memblock.reserved.cnt, 1);
	ASSERT_EQ(memblock.reserved.total_size, size);

	test_pass_pop();

	return 0;
}

/*
 * A test that tries to allocate a memory region when there is no space
 * available above the minimal address:
 *
 *                   +
 *  |-----------+    +-------------------|
 *  |    rgn    |    |                   |
 *  +-----------+----+-------------------+
 *                   ^
 *                   |
 *                   min_addr
 *
 * Expect to prioritize granting memory over satisfying the minimal address
 * requirement and to allocate at the beginning of the available memory.
 */
static int alloc_from_bottom_up_no_space_above_check(void)
{
	struct memblock_region *rgn = &memblock.reserved.regions[0];
	void *allocated_ptr = NULL;
	phys_addr_t r1_size = SZ_64;
	phys_addr_t min_addr;
	phys_addr_t r2_size;

	PREFIX_PUSH();
	setup_memblock();

	min_addr = memblock_start_of_DRAM() + SZ_128;
	r2_size = memblock_end_of_DRAM() - min_addr;

	/* No space above this address */
	memblock_reserve(min_addr - SMP_CACHE_BYTES, r2_size);

	allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr);

	ASSERT_NE(allocated_ptr, NULL);
	ASSERT_EQ(rgn->base, memblock_start_of_DRAM());
	ASSERT_EQ(rgn->size, r1_size);

	ASSERT_EQ(memblock.reserved.cnt, 2);
	ASSERT_EQ(memblock.reserved.total_size, r1_size + r2_size);

	test_pass_pop();

	return 0;
}

/*
 * A test that tries to allocate a memory region with a minimal address below
 * the start address of the available memory. Expect to allocate a region
 * at the beginning of the available memory.
 */
static int alloc_from_bottom_up_min_addr_cap_check(void)
{
	struct memblock_region *rgn = &memblock.reserved.regions[0];
	void *allocated_ptr = NULL;
	phys_addr_t r1_size = SZ_64;
	phys_addr_t min_addr;
	phys_addr_t start_addr;

	PREFIX_PUSH();
	setup_memblock();

	start_addr = (phys_addr_t)memblock_start_of_DRAM();
	min_addr = start_addr - SMP_CACHE_BYTES * 3;

	allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr);

	ASSERT_NE(allocated_ptr, NULL);
	ASSERT_EQ(rgn->base, start_addr);
	ASSERT_EQ(rgn->size, r1_size);

	ASSERT_EQ(memblock.reserved.cnt, 1);
	ASSERT_EQ(memblock.reserved.total_size, r1_size);

	test_pass_pop();

	return 0;
}

/* Test case wrappers */
static int alloc_from_simple_check(void)
{
	test_print("\tRunning %s...\n", __func__);
	run_top_down(alloc_from_simple_generic_check);
	run_bottom_up(alloc_from_simple_generic_check);

	return 0;
}

static int alloc_from_misaligned_check(void)
{
	test_print("\tRunning %s...\n", __func__);
	run_top_down(alloc_from_misaligned_generic_check);
	run_bottom_up(alloc_from_misaligned_generic_check);

	return 0;
}

static int alloc_from_high_addr_check(void)
{
	test_print("\tRunning %s...\n", __func__);
	memblock_set_bottom_up(false);
	alloc_from_top_down_high_addr_check();
	memblock_set_bottom_up(true);
	alloc_from_bottom_up_high_addr_check();

	return 0;
}

static int alloc_from_no_space_above_check(void)
{
	test_print("\tRunning %s...\n", __func__);
	memblock_set_bottom_up(false);
	alloc_from_top_down_no_space_above_check();
	memblock_set_bottom_up(true);
	alloc_from_bottom_up_no_space_above_check();

	return 0;
}

static int alloc_from_min_addr_cap_check(void)
{
	test_print("\tRunning %s...\n", __func__);
	memblock_set_bottom_up(false);
	alloc_from_top_down_min_addr_cap_check();
	memblock_set_bottom_up(true);
	alloc_from_bottom_up_min_addr_cap_check();

	return 0;
}

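/*
 * Editor's sketch, not part of the original file: run_top_down() and
 * run_bottom_up() above are assumed to be harness macros that flip the
 * allocation direction around a generic check, roughly as follows.
 * The prefix_push()/prefix_pop() labels are an assumption about the
 * harness, not verified definitions (kept inside this comment so they
 * cannot clash with the real ones at compile time):
 *
 *	#define run_top_down(func) do {		\
 *		memblock_set_bottom_up(false);	\
 *		prefix_push("top-down");	\
 *		func();				\
 *		prefix_pop();			\
 *	} while (0)
 *
 *	#define run_bottom_up(func) do {	\
 *		memblock_set_bottom_up(true);	\
 *		prefix_push("bottom-up");	\
 *		func();				\
 *		prefix_pop();			\
 *	} while (0)
 */
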
int memblock_alloc_helpers_checks(void)
{
	const char *func_testing = "memblock_alloc_from";

	prefix_reset();
	prefix_push(func_testing);
	test_print("Running %s tests...\n", func_testing);

	reset_memblock_attributes();
	dummy_physical_memory_init();

	alloc_from_simple_check();
	alloc_from_misaligned_check();
	alloc_from_high_addr_check();
	alloc_from_no_space_above_check();
	alloc_from_min_addr_cap_check();

	dummy_physical_memory_cleanup();

	prefix_pop();

	return 0;
}
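For context, the entry point above is typically invoked from the memblock test runner. A minimal sketch of such a runner's main(), assuming alloc_helpers_api.h declares the entry point; the runner itself is illustrative, not the harness's actual main():

#include "alloc_helpers_api.h"	/* assumed to declare the entry point */

int main(void)
{
	/* Run the memblock_alloc_from() suite defined in this file. */
	memblock_alloc_helpers_checks();

	return 0;
}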