mirror of
https://git.proxmox.com/git/mirror_ubuntu-kernels.git
synced 2026-01-06 03:53:44 +00:00
The current CR3 handling for kernel page table isolation in the paranoid return paths which are relevant for #NMI, #MCE, #VC, #DB and #DF is unconditionally writing CR3 with the value retrieved on exception entry. In the vast majority of cases when returning to the kernel this is a pointless exercise because CR3 was not modified on exception entry. The only situation where this is necessary is when the exception interrupts a entry from user before switching to kernel CR3 or interrupts an exit to user after switching back to user CR3. As CR3 writes can be expensive on some systems this becomes measurable overhead with high frequency #NMIs such as perf. Avoid this overhead by checking the CR3 value, which was saved on entry, and write it back to CR3 only when it us a user CR3. -----BEGIN PGP SIGNATURE----- iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmXvTXYTHHRnbHhAbGlu dXRyb25peC5kZQAKCRCmGPVMDXSYoYMED/40YXFa0si5/9LRh/LSYglxVe/RaXCn 3oU19oWFRxdHCCLYHeQdlQGrpugM773X+4EC1dE92QpYjFnuLhl5H10h3t2e+3Uw Q2VoWEo95FuJ2v7nqex7p2pglOvNjT2VBBlcFFdhqxiC1FCupXvU17nCcLeBsPkj wbY2Sq4DxPDoWhWMNK2jhCQNVyYYluJERylS5+j0CK8vhQghq1N1WjcB6tQiAYsa 7nXz2ZJeGF0jnvLanyhAVSHDKU7QOMO3zkQpaaMlGQ9izawupe5/Gbi8ouFieCh+ xoLnGo1sgtMOXInnYaJnCiwuc+WiVN3d83aO/s7NZi8ZF60ib72xhzsRip2Cu4aV kBtJaCVLFItQZ81HRSBABj6s9MLphHVm4AaOCvCIxK0ib5KDFaWy3tZpwTU4dvwX rcwKsQrSLlOOD5zqO5dZn+HX6hK2lsNeTPLfcKVqARGn5S9fITzYbUMlkhO/FGaj ZhIgadH8+rXwFDbgS6CGbVYKtM6Ncf/VBGFfE7tEOUQVUmLws3pdLiWo6I2QTGtw fCAeF9uYmvhtiKk0e2jotZdbAg6HP2XTQSZfBxQpRgY6AnYW+XyDezcN0X1eNMJC lmNC72WYxURHZUoOIxiiVzDS9kz7YTUo3pBHFrpQlNqGTqP8r+tAhUyou16yDK/0 2G9Mms/85u89MQ== =UcMe -----END PGP SIGNATURE----- Merge tag 'x86-entry-2024-03-11' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull x86 entry update from Thomas Gleixner: "A single update for the x86 entry code: The current CR3 handling for kernel page table isolation in the paranoid return paths which are relevant for #NMI, #MCE, #VC, #DB and #DF is unconditionally writing CR3 with the 
value retrieved on exception entry. In the vast majority of cases when returning to the kernel this is a pointless exercise because CR3 was not modified on exception entry. The only situation where this is necessary is when the exception interrupts an entry from user before switching to kernel CR3 or interrupts an exit to user after switching back to user CR3. As CR3 writes can be expensive on some systems this becomes measurable overhead with high frequency #NMIs such as perf. Avoid this overhead by checking the CR3 value, which was saved on entry, and write it back to CR3 only when it is a user CR3" * tag 'x86-entry-2024-03-11' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/entry: Avoid redundant CR3 write on paranoid returns
428 lines
12 KiB
C
428 lines
12 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#include <linux/jump_label.h>
|
|
#include <asm/unwind_hints.h>
|
|
#include <asm/cpufeatures.h>
|
|
#include <asm/page_types.h>
|
|
#include <asm/percpu.h>
|
|
#include <asm/asm-offsets.h>
|
|
#include <asm/processor-flags.h>
|
|
#include <asm/ptrace-abi.h>
|
|
#include <asm/msr.h>
|
|
#include <asm/nospec-branch.h>
|
|
|
|
/*

 x86 function calling convention, 64-bit:
 ----------------------------------------
  arguments           |  callee-saved      | extra caller-saved | return
 [callee-clobbered]   |                    | [callee-clobbered] |
 ---------------------------------------------------------------------------
 rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11             | rax, rdx [**]

 ( rsp is obviously invariant across normal function calls. (gcc can 'merge'
   functions when it sees tail-call optimization possibilities) rflags is
   clobbered. Leftover arguments are passed over the stack frame.)

 [*]  In the frame-pointers case rbp is fixed to the stack frame.

 [**] for struct return values wider than 64 bits the return convention is a
      bit more complex: up to 128 bits width we return small structures
      straight in rax, rdx. For structures larger than that (3 words or
      larger) the caller puts a pointer to an on-stack return struct
      [allocated in the caller's stack frame] into the first argument - i.e.
      into rdi. All other arguments shift up by one in this case.
      Fortunately this case is rare in the kernel.

For 32-bit we have the following conventions - kernel is built with
-mregparm=3 and -freg-struct-return:

 x86 function calling convention, 32-bit:
 ----------------------------------------
  arguments         | callee-saved        | extra caller-saved | return
 [callee-clobbered] |                     | [callee-clobbered] |
 -------------------------------------------------------------------------
 eax edx ecx        | ebx edi esi ebp [*] | <none>             | eax, edx [**]

 ( here too esp is obviously invariant across normal function calls. eflags
   is clobbered. Leftover arguments are passed over the stack frame. )

 [*]  In the frame-pointers case ebp is fixed to the stack frame.

 [**] We build with -freg-struct-return, which on 32-bit means similar
      semantics as on 64-bit: edx can be used for a second return value
      (i.e. covering integer and structure sizes up to 64 bits) - after that
      it gets more complex and more expensive: 3-word or larger struct returns
      get done in the caller's frame and the pointer to the return struct goes
      into regparm0, i.e. eax - the other arguments shift up and the
      function's register parameters degenerate to regparm=2 in essence.

*/
|
|
|
|
#ifdef CONFIG_X86_64
|
|
|
|
/*
|
|
* 64-bit system call stack frame layout defines and helpers,
|
|
* for assembly code:
|
|
*/
|
|
|
|
/*
 * Store the general-purpose registers on the stack in pt_regs order:
 * di, si, dx, cx, ax, r8-r11, bx, bp, r12-r15.
 *
 * @rdx, @rcx, @rax:	operands pushed into the pt_regs->dx/cx/ax slots; each
 *			defaults to the register of the same name.
 * @save_ret:		non-zero when a return address sits on top of the
 *			stack on entry. It is carried in %rsi while the
 *			registers are stored and pushed back on top of the
 *			stack at the very end, so %rsi holds the return
 *			address (not its entry value) when the macro is done.
 * @unwind_hint:	non-zero to emit UNWIND_HINT_REGS after the last
 *			register has been pushed.
 */
.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1
	.ifeq \save_ret
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	.else
	pushq	%rsi			/* pt_regs->si */
	movq	8(%rsp), %rsi		/* park the return address in %rsi */
	movq	%rdi, 8(%rsp)		/* pt_regs->di replaces the return address slot */
	.endif
	pushq	\rdx			/* pt_regs->dx */
	pushq	\rcx			/* pt_regs->cx */
	pushq	\rax			/* pt_regs->ax */
	pushq	%r8			/* pt_regs->r8 */
	pushq	%r9			/* pt_regs->r9 */
	pushq	%r10			/* pt_regs->r10 */
	pushq	%r11			/* pt_regs->r11 */
	pushq	%rbx			/* pt_regs->rbx */
	pushq	%rbp			/* pt_regs->rbp */
	pushq	%r12			/* pt_regs->r12 */
	pushq	%r13			/* pt_regs->r13 */
	pushq	%r14			/* pt_regs->r14 */
	pushq	%r15			/* pt_regs->r15 */

	.ifne \unwind_hint
	UNWIND_HINT_REGS
	.endif

	.ifne \save_ret
	pushq	%rsi			/* return address goes back on top of the stack */
	.endif
.endm
|
|
|
|
/*
 * Zero registers so that stale values cannot be picked up by a
 * speculative-execution gadget. The low registers are expected to be
 * overwritten long before such a gadget could make use of them.
 *
 * %rax and %rdi are not touched by this macro.
 *
 * @clear_bp:	non-zero (the default) to zero %rbp as well.
 *
 * Each xorl of a 32-bit register with itself zeroes the full 64-bit
 * register; rflags is clobbered.
 */
.macro CLEAR_REGS clear_bp=1
	xorl	%esi,  %esi		/* nospec si  */
	xorl	%edx,  %edx		/* nospec dx  */
	xorl	%ecx,  %ecx		/* nospec cx  */
	xorl	%ebx,  %ebx		/* nospec rbx */
	.ifne \clear_bp
	xorl	%ebp,  %ebp		/* nospec rbp */
	.endif
	xorl	%r8d,  %r8d		/* nospec r8  */
	xorl	%r9d,  %r9d		/* nospec r9  */
	xorl	%r10d, %r10d		/* nospec r10 */
	xorl	%r11d, %r11d		/* nospec r11 */
	xorl	%r12d, %r12d		/* nospec r12 */
	xorl	%r13d, %r13d		/* nospec r13 */
	xorl	%r14d, %r14d		/* nospec r14 */
	xorl	%r15d, %r15d		/* nospec r15 */

.endm
|
|
|
|
/*
 * Convenience wrapper: PUSH_REGS followed by CLEAR_REGS.
 *
 * All arguments are handed through unchanged: @rdx, @rcx, @rax, @save_ret
 * and @unwind_hint go to PUSH_REGS, @clear_bp goes to CLEAR_REGS.
 */
.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_bp=1 unwind_hint=1
	PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret, unwind_hint=\unwind_hint
	CLEAR_REGS clear_bp=\clear_bp
.endm
|
|
|
|
/*
 * Reload the general-purpose registers from the stack, in the reverse of
 * the order in which PUSH_REGS stored them.
 *
 * @pop_rdi:	non-zero (the default) to pop %rdi too. When zero, the
 *		pt_regs->di slot is left on the stack for the caller.
 */
.macro POP_REGS pop_rdi=1
	popq	%r15			/* pt_regs->r15 */
	popq	%r14			/* pt_regs->r14 */
	popq	%r13			/* pt_regs->r13 */
	popq	%r12			/* pt_regs->r12 */
	popq	%rbp			/* pt_regs->rbp */
	popq	%rbx			/* pt_regs->rbx */
	popq	%r11			/* pt_regs->r11 */
	popq	%r10			/* pt_regs->r10 */
	popq	%r9			/* pt_regs->r9 */
	popq	%r8			/* pt_regs->r8 */
	popq	%rax			/* pt_regs->ax */
	popq	%rcx			/* pt_regs->cx */
	popq	%rdx			/* pt_regs->dx */
	popq	%rsi			/* pt_regs->si */
	.ifne \pop_rdi
	popq	%rdi			/* pt_regs->di */
	.endif
.endm
|
|
|
|
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
|
|
|
/*
 * With PAGE_TABLE_ISOLATION the PGD is 8k in size. Bit 12 selects one of
 * its two halves, so flipping that bit switches between them. The macros
 * in this file set the bit to select the user page tables and clear it
 * to select the kernel ones.
 */
#define PTI_USER_PGTABLE_BIT		PAGE_SHIFT
#define PTI_USER_PGTABLE_MASK		(1 << PTI_USER_PGTABLE_BIT)
#define PTI_USER_PCID_BIT		X86_CR3_PTI_PCID_USER_BIT
#define PTI_USER_PCID_MASK		(1 << PTI_USER_PCID_BIT)
#define PTI_USER_PGTABLE_AND_PCID_MASK	(PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
|
|
|
|
/*
 * Set bit X86_CR3_PCID_NOFLUSH_BIT in @reg. The bts instruction used for
 * this clobbers the carry flag.
 */
.macro SET_NOFLUSH_BIT reg:req
	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
.endm
|
|
|
|
/*
 * Turn the CR3 value held in @reg into the matching kernel CR3 value.
 * Only @reg (and rflags) is modified; CR3 itself is not written here.
 */
.macro ADJUST_KERNEL_CR3 reg:req
	/* With X86_FEATURE_PCID, additionally set the NOFLUSH bit: */
	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
	/* Drop the user PCID bit and the user page table bit => kernel pagetables: */
	andq	$(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
.endm
|
|
|
|
/*
 * Load the kernel page tables: read CR3, convert the value with
 * ADJUST_KERNEL_CR3 and write it back. The whole sequence is skipped
 * unless X86_FEATURE_PTI is set.
 *
 * @scratch_reg:	clobbered; holds the CR3 value that was written.
 */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
	ALTERNATIVE "jmp .Lkernel_cr3_done_\@", "", X86_FEATURE_PTI
	mov	%cr3, \scratch_reg
	ADJUST_KERNEL_CR3 \scratch_reg
	mov	\scratch_reg, %cr3
.Lkernel_cr3_done_\@:
.endm
|
|
|
|
/* Memory operand addressing this CPU's cpu_tlbstate user_pcid_flush_mask field. */
#define THIS_CPU_user_pcid_flush_mask	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
|
|
|
|
/*
 * Switch CR3 to the user page tables. This helper has no X86_FEATURE_PTI
 * check of its own; the _NOSTACK/_STACK wrappers provide it.
 *
 * @scratch_reg:	clobbered; ends up holding the value written to CR3.
 * @scratch_reg2:	clobbered on the PCID path (keeps a copy of the
 *			original CR3 value); must differ from @scratch_reg.
 *
 * Without X86_FEATURE_PCID only the user page table bit is set. With
 * PCID, the low 11 bits of CR3 (the ASID) index the per-CPU
 * user_pcid_flush_mask: if the bit for this ASID is set it is cleared and
 * CR3 is written without the NOFLUSH bit, otherwise NOFLUSH is set.
 */
.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req
	mov	%cr3, \scratch_reg

	ALTERNATIVE "jmp .Luser_pgd_\@", "", X86_FEATURE_PCID

	/* Does this ASID have a flush pending? */
	movq	\scratch_reg, \scratch_reg2		/* keep the full CR3 value */
	andq	$(0x7FF), \scratch_reg			/* isolate the ASID */
	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jnc	.Lkeep_tlb_\@

	/* Yes: consume the pending flush by clearing its bit */
	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	movq	\scratch_reg2, \scratch_reg
	jmp	.Luser_asid_\@

.Lkeep_tlb_\@:
	/* No: ask the CPU not to flush on this CR3 write */
	movq	\scratch_reg2, \scratch_reg
	SET_NOFLUSH_BIT \scratch_reg

.Luser_asid_\@:
	/* Select the user variant of the ASID */
	orq	$(PTI_USER_PCID_MASK), \scratch_reg

.Luser_pgd_\@:
	/* Select the user half of the PGD */
	orq	$(PTI_USER_PGTABLE_MASK), \scratch_reg
	mov	\scratch_reg, %cr3
.endm
|
|
|
|
/*
 * Switch to the user CR3 without touching the stack; the caller supplies
 * both scratch registers. Does nothing unless X86_FEATURE_PTI is set.
 *
 * @scratch_reg, @scratch_reg2:	passed on to SWITCH_TO_USER_CR3, which
 *				clobbers them.
 */
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
	ALTERNATIVE "jmp .Luser_cr3_done_\@", "", X86_FEATURE_PTI
	SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg, scratch_reg2=\scratch_reg2
.Luser_cr3_done_\@:
.endm
|
|
|
|
/*
 * Switch to the user CR3 using a single caller-supplied scratch register.
 * %rax serves as the second scratch register and is preserved by saving
 * it on the stack around the switch. Does nothing unless X86_FEATURE_PTI
 * is set.
 *
 * @scratch_reg:	clobbered. Must not be %rax, since %rax is handed
 *			to SWITCH_TO_USER_CR3 as its second scratch register.
 */
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
	ALTERNATIVE "jmp .Luser_cr3_done_\@", "", X86_FEATURE_PTI
	pushq	%rax
	SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg, scratch_reg2=%rax
	popq	%rax
.Luser_cr3_done_\@:
.endm
|
|
|
|
/*
 * Record the current CR3 value in @save_reg and make sure the kernel page
 * tables are active afterwards.
 *
 * @scratch_reg:	clobbered.
 * @save_reg:		receives the CR3 value found on entry. It is left
 *			unmodified when X86_FEATURE_PTI is not set, because
 *			the whole sequence is skipped in that case.
 */
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
	ALTERNATIVE "jmp .Lcr3_saved_\@", "", X86_FEATURE_PTI
	movq	%cr3, \scratch_reg
	movq	\scratch_reg, \save_reg
	/*
	 * A set user pagetable bit means the user page tables are the
	 * active ones and a switch is required. A clear bit means CR3
	 * already points at the kernel page tables: nothing left to do.
	 */
	bt	$PTI_USER_PGTABLE_BIT, \scratch_reg
	jnc	.Lcr3_saved_\@

	ADJUST_KERNEL_CR3 \scratch_reg
	movq	\scratch_reg, %cr3

.Lcr3_saved_\@:
.endm
|
|
|
|
/*
 * Restore CR3 from a kernel context; the value restored may be a user CR3.
 *
 * @scratch_reg:	clobbered on the PCID path.
 * @save_reg:		CR3 value recorded at exception entry. It may come
 *			back with the NOFLUSH bit set.
 *
 * Does nothing unless X86_FEATURE_PTI is set.
 */
.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req
	ALTERNATIVE "jmp .Lcr3_restored_\@", "", X86_FEATURE_PTI

	/*
	 * CR3 is not modified while the exception is handled. So if the
	 * paranoid entry found the kernel page tables in CR3, they are
	 * still there and no write is needed.
	 */
	bt	$PTI_USER_PGTABLE_BIT, \save_reg
	jnc	.Lcr3_restored_\@

	ALTERNATIVE "jmp .Lwrite_saved_cr3_\@", "", X86_FEATURE_PCID

	/*
	 * Test-and-clear the pending-flush bit of the user ASID that is
	 * about to be loaded. If it was set, write CR3 as saved.
	 */
	movq	\save_reg, \scratch_reg
	andq	$(0x7FF), \scratch_reg
	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jc	.Lwrite_saved_cr3_\@

	/* No flush was pending for this ASID */
	SET_NOFLUSH_BIT \save_reg

.Lwrite_saved_cr3_\@:
	movq	\save_reg, %cr3
.Lcr3_restored_\@:
.endm
|
|
|
|
#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
|
|
|
|
/*
 * Without page table isolation there is no CR3 switching to do: every
 * macro below keeps its argument list but expands to nothing.
 */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm

.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
.endm

.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
.endm

.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
.endm

.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req
.endm
|
|
|
|
#endif
|
|
|
|
/*
 * IBRS kernel mitigation for Spectre_v2.
 *
 * Requires the full context to be established (PUSH_REGS, CR3 and GS).
 * The registers it uses (AX, CX, DX) are clobbered. Must be invoked
 * before the first RET instruction (NOTE! UNTRAIN_RET includes a RET
 * instruction).
 *
 * @save_reg:	optional. When given, the current MSR_IA32_SPEC_CTRL value
 *		is read and stored in it so that it can be restored later;
 *		the paranoid paths use this. In that case the MSR is only
 *		written if SPEC_CTRL_IBRS is not already set; otherwise an
 *		lfence is executed instead.
 *
 * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set.
 *
 * Expands to nothing without CONFIG_CPU_IBRS_ENTRY and is skipped at
 * runtime unless X86_FEATURE_KERNEL_IBRS is set.
 */
.macro IBRS_ENTER save_reg
#ifdef CONFIG_CPU_IBRS_ENTRY
	ALTERNATIVE "jmp .Librs_entered_\@", "", X86_FEATURE_KERNEL_IBRS
	movl	$MSR_IA32_SPEC_CTRL, %ecx

.ifnb \save_reg
	rdmsr					/* edx:eax = current MSR value */
	shl	$32, %rdx
	or	%rdx, %rax			/* rax = combined 64-bit value */
	mov	%rax, \save_reg
	test	$SPEC_CTRL_IBRS, %eax
	jz	.Lwrite_spec_ctrl_\@		/* IBRS not set yet: write the MSR */
	lfence
	jmp	.Librs_entered_\@
.Lwrite_spec_ctrl_\@:
.endif

	/* Split the per-CPU 64-bit value into edx:eax for wrmsr */
	movq	PER_CPU_VAR(x86_spec_ctrl_current), %rdx
	movl	%edx, %eax
	shr	$32, %rdx
	wrmsr
.Librs_entered_\@:
#endif
.endm
|
|
|
|
/*
 * Counterpart of IBRS_ENTER. Requires KERNEL GS and CR3, and clobbers the
 * AX, CX and DX registers. Must be invoked after the last RET.
 *
 * @save_reg:	optional. When given, its content is written to
 *		MSR_IA32_SPEC_CTRL. When omitted, the per-CPU
 *		x86_spec_ctrl_current value with SPEC_CTRL_IBRS cleared
 *		is written instead.
 *
 * Expands to nothing without CONFIG_CPU_IBRS_ENTRY and is skipped at
 * runtime unless X86_FEATURE_KERNEL_IBRS is set.
 */
.macro IBRS_EXIT save_reg
#ifdef CONFIG_CPU_IBRS_ENTRY
	ALTERNATIVE "jmp .Librs_exited_\@", "", X86_FEATURE_KERNEL_IBRS
	movl	$MSR_IA32_SPEC_CTRL, %ecx

.ifnb \save_reg
	mov	\save_reg, %rdx
.else
	movq	PER_CPU_VAR(x86_spec_ctrl_current), %rdx
	andl	$(~SPEC_CTRL_IBRS), %edx
.endif

	/* Split the 64-bit value in %rdx into edx:eax for wrmsr */
	movl	%edx, %eax
	shr	$32, %rdx
	wrmsr
.Librs_exited_\@:
#endif
.endm
|
|
|
|
/*
|
|
* Mitigate Spectre v1 for conditional swapgs code paths.
|
|
*
|
|
* FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to
|
|
* prevent a speculative swapgs when coming from kernel space.
|
|
*
|
|
* FENCE_SWAPGS_KERNEL_ENTRY is used in the kernel entry non-swapgs code path,
|
|
* to prevent the swapgs from getting speculatively skipped when coming from
|
|
* user space.
|
|
*/
|
|
/*
 * For the user entry swapgs code path: emits an lfence when
 * X86_FEATURE_FENCE_SWAPGS_USER is set, and nothing otherwise.
 */
.macro FENCE_SWAPGS_USER_ENTRY
	ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_USER
.endm
|
|
/*
 * For the kernel entry non-swapgs code path: emits an lfence when
 * X86_FEATURE_FENCE_SWAPGS_KERNEL is set, and nothing otherwise.
 */
.macro FENCE_SWAPGS_KERNEL_ENTRY
	ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL
.endm
|
|
|
|
/*
 * Call stackleak_erase with the general-purpose registers saved before
 * the call (PUSH_AND_CLEAR_REGS) and reloaded after it (POP_REGS).
 * Expands to nothing without CONFIG_GCC_PLUGIN_STACKLEAK.
 */
.macro STACKLEAK_ERASE_NOCLOBBER
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
	PUSH_AND_CLEAR_REGS
	call	stackleak_erase
	POP_REGS
#endif
.endm
|
|
|
|
/*
 * Save the current GSBASE and install this CPU's per-CPU base instead.
 *
 * @scratch_reg:	clobbered; holds the value produced by
 *			GET_PERCPU_BASE, which is what gets written to GSBASE.
 * @save_reg:		receives the GSBASE value read on entry.
 */
.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
	rdgsbase	\save_reg
	GET_PERCPU_BASE	\scratch_reg
	wrgsbase	\scratch_reg
.endm
|
|
|
|
#else /* CONFIG_X86_64 */
|
|
/* Not a 64-bit build: make UNWIND_HINT_IRET_REGS expand to nothing. */
#undef UNWIND_HINT_IRET_REGS
#define UNWIND_HINT_IRET_REGS
|
|
#endif /* !CONFIG_X86_64 */
|
|
|
|
/*
 * Plain call to stackleak_erase; unlike STACKLEAK_ERASE_NOCLOBBER no
 * registers are saved around it. Expands to nothing without
 * CONFIG_GCC_PLUGIN_STACKLEAK.
 */
.macro STACKLEAK_ERASE
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
	call	stackleak_erase
#endif
.endm
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/*
 * The CPU/node number is kept in the limit (size) field of a special
 * segment descriptor entry in the GDT. Load that limit into @reg: the
 * selector __CPUNODE_SEG is placed in @reg first and lsl then replaces it
 * with the segment limit.
 */
.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
	movq	$__CPUNODE_SEG, \reg
	lsl	\reg, \reg
.endm
|
|
|
|
/*
 * Put this processor's per-CPU GSBASE value into @reg.
 *
 * Per-CPU data is normally reached through %gs, but this macro is used
 * while %gs itself is being set up, so %gs cannot be relied upon here.
 * Instead the CPU number is taken from the CPU/node segment limit and
 * used as an index into __per_cpu_offset.
 *
 * RDPID must not be used for this: KVM loads the guest's TSC_AUX on
 * vm-entry and may not restore the host's value until the CPU returns to
 * userspace, so an NMI arriving while KVM's run loop is executing would
 * make the kernel consume the guest's TSC_AUX.
 */
.macro GET_PERCPU_BASE reg:req
	LOAD_CPU_AND_NODE_SEG_LIMIT \reg
	andq	$VDSO_CPUNODE_MASK, \reg		/* keep the bits selected by VDSO_CPUNODE_MASK */
	movq	__per_cpu_offset(, \reg, 8), \reg	/* 8-byte table entry for that index */
.endm
|
|
|
|
#else
|
|
|
|
/*
 * !CONFIG_SMP variant: load @reg from pcpu_unit_offsets with a single
 * RIP-relative move.
 */
.macro GET_PERCPU_BASE reg:req
	movq	pcpu_unit_offsets(%rip), \reg
.endm
|
|
|
|
#endif /* CONFIG_SMP */
|