mirror of https://git.proxmox.com/git/pve-kernel
backport fix for kvm performance regression with Intel Emerald Rapids
Adapted to a context change in "arch/x86/kvm/cpuid.h" caused by the vcpu_supports_xsave_pkru() function added by the Proxmox VE downstream patch "kvm: xsave set: mask-out PKRU bit in xfeatures if vCPU has no support". Otherwise a clean cherry-pick from linux-next, no functional changes.

Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
commit 24bfa44438 (parent f3ec9c1f62)

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 10 Dec 2024 17:32:58 -0800
Subject: [PATCH] KVM: x86: Cache CPUID.0xD XSTATE offsets+sizes during module
 init

Snapshot the output of CPUID.0xD.[1..n] during kvm.ko initialization to
avoid the overhead of CPUID during runtime. The offset, size, and metadata
for CPUID.0xD.[1..n] sub-leaves do not depend on XCR0 or XSS values, i.e.
are constant for a given CPU, and thus can be cached during module load.

On Intel's Emerald Rapids, CPUID is *wildly* expensive, to the point where
recomputing XSAVE offsets and sizes results in a 4x increase in latency of
nested VM-Enter and VM-Exit (nested transitions can trigger
xstate_required_size() multiple times per transition), relative to using
cached values. The issue is easily visible by running `perf top` while
triggering nested transitions: kvm_update_cpuid_runtime() shows up at a
whopping 50%.

As measured via RDTSC from L2 (using KVM-Unit-Tests' CPUID VM-Exit test
and a slightly modified L1 KVM to handle CPUID in the fastpath), a nested
roundtrip to emulate CPUID on Skylake (SKX), Icelake (ICX), and Emerald
Rapids (EMR) takes:

  SKX 11650
  ICX 22350
  EMR 28850

Using cached values, the latency drops to:

  SKX 6850
  ICX 9000
  EMR 7900

The underlying issue is that CPUID itself is slow on ICX, and comically
slow on EMR. The problem is exacerbated on CPUs which support XSAVES
and/or XSAVEC, as KVM invokes xstate_required_size() twice on each
runtime CPUID update, and because there are more supported XSAVE features
(CPUID for supported XSAVE feature sub-leaves is significantly slower).

SKX:
 CPUID.0xD.2  = 348 cycles
 CPUID.0xD.3  = 400 cycles
 CPUID.0xD.4  = 276 cycles
 CPUID.0xD.5  = 236 cycles
 <other sub-leaves are similar>

EMR:
 CPUID.0xD.2  = 1138 cycles
 CPUID.0xD.3  = 1362 cycles
 CPUID.0xD.4  = 1068 cycles
 CPUID.0xD.5  = 910 cycles
 CPUID.0xD.6  = 914 cycles
 CPUID.0xD.7  = 1350 cycles
 CPUID.0xD.8  = 734 cycles
 CPUID.0xD.9  = 766 cycles
 CPUID.0xD.10 = 732 cycles
 CPUID.0xD.11 = 718 cycles
 CPUID.0xD.12 = 734 cycles
 CPUID.0xD.13 = 1700 cycles
 CPUID.0xD.14 = 1126 cycles
 CPUID.0xD.15 = 898 cycles
 CPUID.0xD.16 = 716 cycles
 CPUID.0xD.17 = 748 cycles
 CPUID.0xD.18 = 776 cycles

Note, updating runtime CPUID information multiple times per nested
transition is itself a flaw, especially since CPUID is a mandatory
intercept on both Intel and AMD. E.g. KVM doesn't need to ensure emulated
CPUID state is up-to-date while running L2. That flaw will be fixed in a
future patch, as deferring runtime CPUID updates is more subtle than it
appears at first glance, the benefits aren't super critical to have once
the XSAVE issue is resolved, and caching CPUID output is desirable even if
KVM's updates are deferred.

Cc: Jim Mattson <jmattson@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-ID: <20241211013302.1347853-2-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 1201f226c863b7da739f7420ddba818cedf372fc)
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
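
As a rough illustration of the pattern the hunks below introduce (snapshot
CPUID.0xD.[2..n] once, then compute XSAVE area sizes from the cache), a
minimal user-space sketch follows. It is not KVM code: it relies on GCC's
<cpuid.h> helper, and FIRST_EXTENDED_XFEATURE and N_XFEATURES are
placeholders chosen for the sketch (sub-leaves 2..18, as in the tables
above), not kernel definitions.

    /* Self-contained user-space sketch of the caching scheme; not part of the patch. */
    #include <cpuid.h>		/* __get_cpuid_count() */
    #include <stdio.h>

    #define FIRST_EXTENDED_XFEATURE	2	/* CPUID.0xD.2 = AVX/YMM state */
    #define N_XFEATURES			19	/* placeholder: sub-leaves 2..18 */

    static struct { unsigned int eax, ebx, ecx; } xstate_sizes[N_XFEATURES];

    /* Analogue of kvm_init_xstate_sizes(): query CPUID.0xD.[2..n] exactly once. */
    static void init_xstate_sizes(void)
    {
    	unsigned int ign, i;

    	for (i = FIRST_EXTENDED_XFEATURE; i < N_XFEATURES; i++)
    		__get_cpuid_count(0xD, i, &xstate_sizes[i].eax,
    				  &xstate_sizes[i].ebx, &xstate_sizes[i].ecx, &ign);
    }

    /* Analogue of xstate_required_size(): XSAVE area size for @xstate_bv. */
    static unsigned int xsave_size(unsigned long long xstate_bv, int compacted)
    {
    	unsigned int ret = 512 + 64;	/* legacy area + XSAVE header */
    	unsigned int i, offset;

    	for (i = FIRST_EXTENDED_XFEATURE; i < N_XFEATURES; i++) {
    		if (!(xstate_bv & (1ULL << i)))
    			continue;
    		/* ECX[1]: component is 64-byte aligned in compacted format */
    		if (compacted)
    			offset = (xstate_sizes[i].ecx & 0x2) ? ((ret + 63) & ~63u) : ret;
    		else
    			offset = xstate_sizes[i].ebx;
    		if (offset + xstate_sizes[i].eax > ret)
    			ret = offset + xstate_sizes[i].eax;
    	}
    	return ret;
    }

    int main(void)
    {
    	init_xstate_sizes();
    	printf("standard XSAVE size for YMM: %u bytes\n", xsave_size(1ULL << 2, 0));
    	printf("compacted XSAVE size for YMM: %u bytes\n", xsave_size(1ULL << 2, 1));
    	return 0;
    }

The split is the same as in the patch: the CPUID.0xD outputs do not depend
on XCR0/XSS, so they can be read once up front and the per-call size
computation then only touches cached memory.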
 arch/x86/kvm/cpuid.c | 31 ++++++++++++++++++++++++++-----
 arch/x86/kvm/cpuid.h |  1 +
 arch/x86/kvm/x86.c   |  2 ++
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b6892645c4cde238923062e3f71df21d68cc201d..dc73965aa73b21d26b4cf039336da3ca38e89bc6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -36,6 +36,26 @@
 u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_cpu_caps);
 
+struct cpuid_xstate_sizes {
+	u32 eax;
+	u32 ebx;
+	u32 ecx;
+};
+
+static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init;
+
+void __init kvm_init_xstate_sizes(void)
+{
+	u32 ign;
+	int i;
+
+	for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) {
+		struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
+
+		cpuid_count(0xD, i, &xs->eax, &xs->ebx, &xs->ecx, &ign);
+	}
+}
+
 u32 xstate_required_size(u64 xstate_bv, bool compacted)
 {
 	int feature_bit = 0;
@@ -44,14 +64,15 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
 	xstate_bv &= XFEATURE_MASK_EXTEND;
 	while (xstate_bv) {
 		if (xstate_bv & 0x1) {
-			u32 eax, ebx, ecx, edx, offset;
-			cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
+			struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit];
+			u32 offset;
+
 			/* ECX[1]: 64B alignment in compacted form */
 			if (compacted)
-				offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret;
+				offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret;
 			else
-				offset = ebx;
-			ret = max(ret, offset + eax);
+				offset = xs->ebx;
+			ret = max(ret, offset + xs->eax);
 		}
 
 		xstate_bv >>= 1;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 0e46c4555311d02b6ec9eba7f33b292c3562b0ee..6287ec33feb7b143a10a7d8ee38d74381a567d24 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -34,6 +34,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
 
 bool vcpu_supports_xsave_pkru(struct kvm_vcpu *vcpu);
 
+void __init kvm_init_xstate_sizes(void);
 u32 xstate_required_size(u64 xstate_bv, bool compacted);
 
 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c7c3d04198c6c7f88f97e122bfe78b857f26cb26..4d10fc1a9b4114d1e2edf133717f307043560263 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -14061,6 +14061,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault);
 
 static int __init kvm_x86_init(void)
 {
+	kvm_init_xstate_sizes();
+
 	kvm_mmu_x86_module_init();
 	mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
 	return 0;
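
The per-sub-leaf cycle counts quoted in the commit message were taken from
within a guest; a rough host-side analogue, assuming GCC inline asm and the
__rdtsc() intrinsic from <x86intrin.h>, is sketched below. It only
approximates the cost of CPUID.0xD.[2..18] on the measuring host and will
not reproduce the in-guest numbers above.

    /* Host-side CPUID.0xD timing sketch; an illustration, not from the patch. */
    #include <stdint.h>
    #include <stdio.h>
    #include <x86intrin.h>	/* __rdtsc() */

    #define ITERS 100000ULL

    /* One CPUID.0xD.<sub>; volatile asm so the timing loop is not optimized away. */
    static inline void cpuid_0xd(uint32_t sub, uint32_t *eax, uint32_t *ebx,
    			     uint32_t *ecx, uint32_t *edx)
    {
    	__asm__ __volatile__("cpuid"
    			     : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
    			     : "a"(0xD), "c"(sub));
    }

    int main(void)
    {
    	uint32_t eax, ebx, ecx, edx;
    	unsigned int sub;

    	for (sub = 2; sub <= 18; sub++) {
    		uint64_t start = __rdtsc();
    		uint64_t i;

    		for (i = 0; i < ITERS; i++)
    			cpuid_0xd(sub, &eax, &ebx, &ecx, &edx);

    		printf("CPUID.0xD.%-2u = %4llu cycles\n", sub,
    		       (unsigned long long)((__rdtsc() - start) / ITERS));
    	}
    	return 0;
    }

Build with something like `gcc -O2 -o cpuid_0xd_timing cpuid_0xd_timing.c`
(the file name is arbitrary) and run pinned to one CPU, e.g. via
`taskset -c 0`, to reduce measurement noise.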