mirror of
https://git.proxmox.com/git/mirror_ubuntu-kernels.git
synced 2025-12-06 14:04:25 +00:00
iounmap() on x86 occasionally fails to unmap because the provided valid
ioremap address is not below high_memory. It turned out that this
happens due to KASLR.
KASLR uses the full address space between PAGE_OFFSET and vaddr_end to
randomize the starting points of the direct map, vmalloc and vmemmap
regions. It thereby limits the size of the direct map by using the
installed memory size plus an extra configurable margin for hot-plug
memory. This limitation is done to gain more randomization space
because otherwise only the holes between the direct map, vmalloc,
vmemmap and vaddr_end would be usable for randomizing.
The limited direct map size is not exposed to the rest of the kernel, so
the memory hot-plug and resource management related code paths still
operate under the assumption that the available address space can be
determined with MAX_PHYSMEM_BITS.
request_free_mem_region() allocates from (1 << MAX_PHYSMEM_BITS) - 1
downwards. That means the first allocation happens past the end of the
direct map and if unlucky this address is in the vmalloc space, which
causes high_memory to become greater than VMALLOC_START and consequently
causes iounmap() to fail for valid ioremap addresses.
MAX_PHYSMEM_BITS cannot be changed for that because the randomization
does not align with address bit boundaries and there are other places
which actually require to know the maximum number of address bits. All
remaining usage sites of MAX_PHYSMEM_BITS have been analyzed and found
to be correct.
Cure this by exposing the end of the direct map via PHYSMEM_END and use
that for the memory hot-plug and resource management related places
instead of relying on MAX_PHYSMEM_BITS. In the KASLR case PHYSMEM_END
maps to a variable which is initialized by the KASLR initialization and
otherwise it is based on MAX_PHYSMEM_BITS as before.
To prevent future hickups add a check into add_pages() to catch callers
trying to add memory above PHYSMEM_END.
Fixes: 0483e1fa6e ("x86/mm: Implement ASLR for kernel memory regions")
Reported-by: Max Ramanouski <max8rr8@gmail.com>
Reported-by: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-By: Max Ramanouski <max8rr8@gmail.com>
Tested-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/87ed6soy3z.ffs@tglx
202 lines
6.1 KiB
C
202 lines
6.1 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* This file implements KASLR memory randomization for x86_64. It randomizes
|
|
* the virtual address space of kernel memory regions (physical memory
|
|
* mapping, vmalloc & vmemmap) for x86_64. This security feature mitigates
|
|
* exploits relying on predictable kernel addresses.
|
|
*
|
|
* Entropy is generated using the KASLR early boot functions now shared in
|
|
* the lib directory (originally written by Kees Cook). Randomization is
|
|
* done on PGD & P4D/PUD page table levels to increase possible addresses.
|
|
* The physical memory mapping code was adapted to support P4D/PUD level
|
|
* virtual addresses. This implementation on the best configuration provides
|
|
* 30,000 possible virtual addresses in average for each memory region.
|
|
* An additional low memory page is used to ensure each CPU can start with
|
|
* a PGD aligned virtual address (for realmode).
|
|
*
|
|
* The order of each memory region is not changed. The feature looks at
|
|
* the available space for the regions based on different configuration
|
|
* options and randomizes the base and space between each. The size of the
|
|
* physical memory mapping is the available physical memory.
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/init.h>
|
|
#include <linux/random.h>
|
|
#include <linux/memblock.h>
|
|
#include <linux/pgtable.h>
|
|
|
|
#include <asm/setup.h>
|
|
#include <asm/kaslr.h>
|
|
|
|
#include "mm_internal.h"
|
|
|
|
#define TB_SHIFT 40
|
|
|
|
/*
|
|
* The end address could depend on more configuration options to make the
|
|
* highest amount of space for randomization available, but that's too hard
|
|
* to keep straight and caused issues already.
|
|
*/
|
|
static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
|
|
|
|
/*
|
|
* Memory regions randomized by KASLR (except modules that use a separate logic
|
|
* earlier during boot). The list is ordered based on virtual addresses. This
|
|
* order is kept after randomization.
|
|
*/
|
|
static __initdata struct kaslr_memory_region {
|
|
unsigned long *base;
|
|
unsigned long *end;
|
|
unsigned long size_tb;
|
|
} kaslr_regions[] = {
|
|
{
|
|
.base = &page_offset_base,
|
|
.end = &physmem_end,
|
|
},
|
|
{
|
|
.base = &vmalloc_base,
|
|
},
|
|
{
|
|
.base = &vmemmap_base,
|
|
},
|
|
};
|
|
|
|
/* The end of the possible address space for physical memory */
|
|
unsigned long physmem_end __ro_after_init;
|
|
|
|
/* Get size in bytes used by the memory region */
|
|
static inline unsigned long get_padding(struct kaslr_memory_region *region)
|
|
{
|
|
return (region->size_tb << TB_SHIFT);
|
|
}
|
|
|
|
/* Initialize base and padding for each memory region randomized with KASLR */
|
|
void __init kernel_randomize_memory(void)
|
|
{
|
|
size_t i;
|
|
unsigned long vaddr_start, vaddr;
|
|
unsigned long rand, memory_tb;
|
|
struct rnd_state rand_state;
|
|
unsigned long remain_entropy;
|
|
unsigned long vmemmap_size;
|
|
|
|
vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
|
|
vaddr = vaddr_start;
|
|
|
|
/*
|
|
* These BUILD_BUG_ON checks ensure the memory layout is consistent
|
|
* with the vaddr_start/vaddr_end variables. These checks are very
|
|
* limited....
|
|
*/
|
|
BUILD_BUG_ON(vaddr_start >= vaddr_end);
|
|
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
|
|
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
|
|
|
|
/* Preset the end of the possible address space for physical memory */
|
|
physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
|
|
if (!kaslr_memory_enabled())
|
|
return;
|
|
|
|
kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT);
|
|
kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
|
|
|
|
/*
|
|
* Update Physical memory mapping to available and
|
|
* add padding if needed (especially for memory hotplug support).
|
|
*/
|
|
BUG_ON(kaslr_regions[0].base != &page_offset_base);
|
|
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
|
|
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
|
|
|
|
/* Adapt physical memory region size based on available memory */
|
|
if (memory_tb < kaslr_regions[0].size_tb)
|
|
kaslr_regions[0].size_tb = memory_tb;
|
|
|
|
/*
|
|
* Calculate the vmemmap region size in TBs, aligned to a TB
|
|
* boundary.
|
|
*/
|
|
vmemmap_size = (kaslr_regions[0].size_tb << (TB_SHIFT - PAGE_SHIFT)) *
|
|
sizeof(struct page);
|
|
kaslr_regions[2].size_tb = DIV_ROUND_UP(vmemmap_size, 1UL << TB_SHIFT);
|
|
|
|
/* Calculate entropy available between regions */
|
|
remain_entropy = vaddr_end - vaddr_start;
|
|
for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++)
|
|
remain_entropy -= get_padding(&kaslr_regions[i]);
|
|
|
|
prandom_seed_state(&rand_state, kaslr_get_random_long("Memory"));
|
|
|
|
for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) {
|
|
unsigned long entropy;
|
|
|
|
/*
|
|
* Select a random virtual address using the extra entropy
|
|
* available.
|
|
*/
|
|
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
|
|
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
|
|
entropy = (rand % (entropy + 1)) & PUD_MASK;
|
|
vaddr += entropy;
|
|
*kaslr_regions[i].base = vaddr;
|
|
|
|
/* Calculate the end of the region */
|
|
vaddr += get_padding(&kaslr_regions[i]);
|
|
/*
|
|
* KASLR trims the maximum possible size of the
|
|
* direct-map. Update the physmem_end boundary.
|
|
* No rounding required as the region starts
|
|
* PUD aligned and size is in units of TB.
|
|
*/
|
|
if (kaslr_regions[i].end)
|
|
*kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
|
|
|
|
/* Add a minimum padding based on randomization alignment. */
|
|
vaddr = round_up(vaddr + 1, PUD_SIZE);
|
|
remain_entropy -= entropy;
|
|
}
|
|
}
|
|
|
|
void __meminit init_trampoline_kaslr(void)
|
|
{
|
|
pud_t *pud_page_tramp, *pud, *pud_tramp;
|
|
p4d_t *p4d_page_tramp, *p4d, *p4d_tramp;
|
|
unsigned long paddr, vaddr;
|
|
pgd_t *pgd;
|
|
|
|
pud_page_tramp = alloc_low_page();
|
|
|
|
/*
|
|
* There are two mappings for the low 1MB area, the direct mapping
|
|
* and the 1:1 mapping for the real mode trampoline:
|
|
*
|
|
* Direct mapping: virt_addr = phys_addr + PAGE_OFFSET
|
|
* 1:1 mapping: virt_addr = phys_addr
|
|
*/
|
|
paddr = 0;
|
|
vaddr = (unsigned long)__va(paddr);
|
|
pgd = pgd_offset_k(vaddr);
|
|
|
|
p4d = p4d_offset(pgd, vaddr);
|
|
pud = pud_offset(p4d, vaddr);
|
|
|
|
pud_tramp = pud_page_tramp + pud_index(paddr);
|
|
*pud_tramp = *pud;
|
|
|
|
if (pgtable_l5_enabled()) {
|
|
p4d_page_tramp = alloc_low_page();
|
|
|
|
p4d_tramp = p4d_page_tramp + p4d_index(paddr);
|
|
|
|
set_p4d(p4d_tramp,
|
|
__p4d(_KERNPG_TABLE | __pa(pud_page_tramp)));
|
|
|
|
trampoline_pgd_entry =
|
|
__pgd(_KERNPG_TABLE | __pa(p4d_page_tramp));
|
|
} else {
|
|
trampoline_pgd_entry =
|
|
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp));
|
|
}
|
|
}
|