Merge branch 'kvm-tdx-finish-initial' into HEAD

This patch ties the remaining loose ends and finally enables TDX guests to run
inside KVM. It implements handling of EPT violation/misconfig and of several
TDVMCALL leaves that are handled in the kernel (CPUID, HLT, RDMSR/WRMSR,
GetTdVmCallInfo); it also adds a bunch of wrappers in vmx/main.c to ignore
operations not supported by TDX guests (*).

Finally, it introduces documentation for the new APIs that have been added
along the way.

(*) access to CPU state, VMX preemption timer, accesses to TSC offset or
multiplier, LMCE enable/disable, hypercall patching.
commit 7bcf7246c4
@@ -1407,6 +1407,9 @@ the memory region are automatically reflected into the guest. For example, an
 mmap() that affects the region will be made visible immediately. Another
 example is madvise(MADV_DROP).
 
+For TDX guest, deleting/moving memory region loses guest memory contents.
+Read only region isn't supported. Only as-id 0 is supported.
+
 Note: On arm64, a write generated by the page-table walker (to update
 the Access and Dirty flags, for example) never results in a
 KVM_EXIT_MMIO exit when the slot has the KVM_MEM_READONLY flag. This
@@ -4764,7 +4767,7 @@ H_GET_CPU_CHARACTERISTICS hypercall.
 
 :Capability: basic
 :Architectures: x86
-:Type: vm
+:Type: vm ioctl, vcpu ioctl
 :Parameters: an opaque platform specific structure (in/out)
 :Returns: 0 on success; -1 on error
 
@@ -4772,9 +4775,11 @@ If the platform supports creating encrypted VMs then this ioctl can be used
 for issuing platform-specific memory encryption commands to manage those
 encrypted VMs.
 
-Currently, this ioctl is used for issuing Secure Encrypted Virtualization
-(SEV) commands on AMD Processors. The SEV commands are defined in
-Documentation/virt/kvm/x86/amd-memory-encryption.rst.
+Currently, this ioctl is used for issuing both Secure Encrypted Virtualization
+(SEV) commands on AMD Processors and Trusted Domain Extensions (TDX) commands
+on Intel Processors. The detailed commands are defined in
+Documentation/virt/kvm/x86/amd-memory-encryption.rst and
+Documentation/virt/kvm/x86/intel-tdx.rst.
 
 4.111 KVM_MEMORY_ENCRYPT_REG_REGION
 -----------------------------------
@@ -8160,6 +8165,28 @@ KVM_X86_QUIRK_STUFF_FEATURE_MSRS By default, at vCPU creation, KVM sets the
                                     and 0x489), as KVM does now allow them to
                                     be set by userspace (KVM sets them based on
                                     guest CPUID, for safety purposes).
+
+KVM_X86_QUIRK_IGNORE_GUEST_PAT      By default, on Intel platforms, KVM ignores
+                                    guest PAT and forces the effective memory
+                                    type to WB in EPT. The quirk is not available
+                                    on Intel platforms which are incapable of
+                                    safely honoring guest PAT (i.e., without CPU
+                                    self-snoop, KVM always ignores guest PAT and
+                                    forces effective memory type to WB). It is
+                                    also ignored on AMD platforms or, on Intel,
+                                    when a VM has non-coherent DMA devices
+                                    assigned; KVM always honors guest PAT in
+                                    such case. The quirk is needed to avoid
+                                    slowdowns on certain Intel Xeon platforms
+                                    (e.g. ICX, SPR) where self-snoop feature is
+                                    supported but UC is slow enough to cause
+                                    issues with some older guests that use
+                                    UC instead of WC to map the video RAM.
+                                    Userspace can disable the quirk to honor
+                                    guest PAT if it knows that there is no such
+                                    guest software, for example if it does not
+                                    expose a bochs graphics device (which is
+                                    known to have had a buggy driver).
 =================================== ============================================
 
 7.32 KVM_CAP_MAX_VCPU_ID
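
As an illustrative aside (not part of this patch), userspace opts out of a
default-on quirk such as KVM_X86_QUIRK_IGNORE_GUEST_PAT through
KVM_ENABLE_CAP(KVM_CAP_DISABLE_QUIRKS2) on the VM fd; the helper below is a
hedged sketch whose name and error handling are invented::

  /* Sketch: ask KVM to honor guest PAT by disabling the quirk. */
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static int disable_ignore_guest_pat(int vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap = KVM_CAP_DISABLE_QUIRKS2,
                  /* args[0] is the bitmask of quirks to disable. */
                  .args[0] = KVM_X86_QUIRK_IGNORE_GUEST_PAT,
          };

          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }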
@@ -11,6 +11,7 @@ KVM for x86 systems
    cpuid
    errata
    hypercalls
+   intel-tdx
    mmu
    msr
    nested-vmx

Documentation/virt/kvm/x86/intel-tdx.rst (new file, 255 lines)
@@ -0,0 +1,255 @@
.. SPDX-License-Identifier: GPL-2.0

===================================
Intel Trust Domain Extensions (TDX)
===================================

Overview
========
Intel's Trust Domain Extensions (TDX) protect confidential guest VMs from the
host and from physical attacks. A CPU-attested software module called 'the TDX
module' runs inside a new CPU isolated range to provide the functionalities to
manage and run protected VMs, a.k.a. TDX guests or TDs.

Please refer to [1] for the whitepaper, specifications and other resources.

This documentation describes TDX-specific KVM ABIs. The TDX module needs to be
initialized before it can be used by KVM to run any TDX guests. The host
core-kernel provides support for initializing the TDX module, which is
described in Documentation/arch/x86/tdx.rst.

API description
===============

KVM_MEMORY_ENCRYPT_OP
---------------------
:Type: vm ioctl, vcpu ioctl

For TDX operations, KVM_MEMORY_ENCRYPT_OP is re-purposed as a generic
ioctl with TDX-specific sub-ioctl() commands.

::

  /* Trust Domain Extensions sub-ioctl() commands. */
  enum kvm_tdx_cmd_id {
          KVM_TDX_CAPABILITIES = 0,
          KVM_TDX_INIT_VM,
          KVM_TDX_INIT_VCPU,
          KVM_TDX_INIT_MEM_REGION,
          KVM_TDX_FINALIZE_VM,
          KVM_TDX_GET_CPUID,

          KVM_TDX_CMD_NR_MAX,
  };

  struct kvm_tdx_cmd {
          /* enum kvm_tdx_cmd_id */
          __u32 id;
          /* flags for sub-command. If sub-command doesn't use this, set zero. */
          __u32 flags;
          /*
           * data for each sub-command. An immediate or a pointer to the actual
           * data in process virtual address. If sub-command doesn't use it,
           * set zero.
           */
          __u64 data;
          /*
           * Auxiliary error code. The sub-command may return TDX SEAMCALL
           * status code in addition to -Exxx.
           */
          __u64 hw_error;
  };
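
As a rough sketch of how userspace drives these sub-commands (the wrapper is
illustrative and not taken from the kernel tree; its name and error reporting
are invented)::

  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Issue one TDX sub-command on a VM or vCPU file descriptor. */
  static int tdx_ioctl(int fd, __u32 id, __u32 flags, void *data)
  {
          struct kvm_tdx_cmd cmd = {
                  .id = id,
                  .flags = flags,
                  .data = (__u64)(unsigned long)data,
          };
          int r = ioctl(fd, KVM_MEMORY_ENCRYPT_OP, &cmd);

          if (r)
                  fprintf(stderr, "TDX cmd %u: ret %d, hw_error 0x%llx\n",
                          id, r, (unsigned long long)cmd.hw_error);
          return r;
  }

The same wrapper serves both the vm and vcpu sub-commands; only the target
file descriptor differs.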

KVM_TDX_CAPABILITIES
--------------------
:Type: vm ioctl
:Returns: 0 on success, <0 on error

Return the TDX capabilities that the current KVM supports with the specific TDX
module loaded in the system. It reports what features/capabilities are allowed
to be configured for the TDX guest.

- id: KVM_TDX_CAPABILITIES
- flags: must be 0
- data: pointer to struct kvm_tdx_capabilities
- hw_error: must be 0

::

  struct kvm_tdx_capabilities {
          __u64 supported_attrs;
          __u64 supported_xfam;
          __u64 reserved[254];

          /* Configurable CPUID bits for userspace */
          struct kvm_cpuid2 cpuid;
  };
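
A hedged sketch of querying the capabilities, reusing the illustrative
tdx_ioctl() helper above (the entry count is an arbitrary upper bound)::

  #include <stdlib.h>

  static struct kvm_tdx_capabilities *tdx_query_caps(int vm_fd)
  {
          int nent = 256;      /* assumption: enough room for the TDX module */
          struct kvm_tdx_capabilities *caps;

          caps = calloc(1, sizeof(*caps) +
                           nent * sizeof(struct kvm_cpuid_entry2));
          if (!caps)
                  return NULL;

          caps->cpuid.nent = nent;
          if (tdx_ioctl(vm_fd, KVM_TDX_CAPABILITIES, 0, caps)) {
                  free(caps);
                  return NULL;
          }
          /* supported_attrs, supported_xfam and the CPUID list are now valid. */
          return caps;
  }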


KVM_TDX_INIT_VM
---------------
:Type: vm ioctl
:Returns: 0 on success, <0 on error

Perform TDX-specific VM initialization. This needs to be called after
KVM_CREATE_VM and before creating any VCPUs.

- id: KVM_TDX_INIT_VM
- flags: must be 0
- data: pointer to struct kvm_tdx_init_vm
- hw_error: must be 0

::

  struct kvm_tdx_init_vm {
          __u64 attributes;
          __u64 xfam;
          __u64 mrconfigid[6];     /* sha384 digest */
          __u64 mrowner[6];        /* sha384 digest */
          __u64 mrownerconfig[6];  /* sha384 digest */

          /* The total space for TD_PARAMS before the CPUIDs is 256 bytes */
          __u64 reserved[12];

          /*
           * Call KVM_TDX_INIT_VM before vcpu creation, thus before
           * KVM_SET_CPUID2.
           * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the
           * TDX module directly virtualizes those CPUIDs without VMM. The
           * userspace VMM, e.g. QEMU, should make KVM_SET_CPUID2 consistent
           * with these values. If it doesn't, KVM may have a wrong idea of the
           * guest's CPUIDs, and KVM may wrongly emulate CPUIDs or MSRs that
           * the TDX module doesn't virtualize.
           */
          struct kvm_cpuid2 cpuid;
  };
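
A minimal sketch of the call, assuming the caller has already chosen an XFAM
value and a CPUID configuration (for example derived from
KVM_TDX_CAPABILITIES); the allocation sizing and the zero attributes are
illustrative::

  #include <errno.h>
  #include <string.h>

  static int tdx_init_vm(int vm_fd, __u64 xfam,
                         const struct kvm_cpuid_entry2 *entries, __u32 nent)
  {
          struct kvm_tdx_init_vm *init_vm;
          int r;

          init_vm = calloc(1, sizeof(*init_vm) +
                              nent * sizeof(struct kvm_cpuid_entry2));
          if (!init_vm)
                  return -ENOMEM;

          init_vm->attributes = 0;      /* illustrative: no special attributes */
          init_vm->xfam = xfam;         /* e.g. taken from supported_xfam */
          init_vm->cpuid.nent = nent;
          memcpy(init_vm->cpuid.entries, entries, nent * sizeof(*entries));

          r = tdx_ioctl(vm_fd, KVM_TDX_INIT_VM, 0, init_vm);
          free(init_vm);
          return r;
  }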


KVM_TDX_INIT_VCPU
-----------------
:Type: vcpu ioctl
:Returns: 0 on success, <0 on error

Perform TDX-specific VCPU initialization.

- id: KVM_TDX_INIT_VCPU
- flags: must be 0
- data: initial value of the guest TD VCPU RCX
- hw_error: must be 0
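
Because this is a vcpu ioctl and data carries an immediate rather than a
pointer, the call is a one-liner; the sketch below uses the raw command
structure, assumes vcpu_fd is an open vCPU file descriptor, and picks an
arbitrary initial RCX value::

  struct kvm_tdx_cmd cmd = {
          .id = KVM_TDX_INIT_VCPU,
          .data = 0,            /* initial value of the guest TD vCPU RCX */
  };

  ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);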

KVM_TDX_INIT_MEM_REGION
-----------------------
:Type: vcpu ioctl
:Returns: 0 on success, <0 on error

Initialize @nr_pages of TDX guest private memory starting from @gpa with
userspace-provided data from @source_addr.

Note, before calling this sub-command, the memory attribute of the range
[gpa, gpa + nr_pages] needs to be private. Userspace can use
KVM_SET_MEMORY_ATTRIBUTES to set the attribute.

If the KVM_TDX_MEASURE_MEMORY_REGION flag is specified, it also extends the
measurement.

- id: KVM_TDX_INIT_MEM_REGION
- flags: currently only KVM_TDX_MEASURE_MEMORY_REGION is defined
- data: pointer to struct kvm_tdx_init_mem_region
- hw_error: must be 0

::

  #define KVM_TDX_MEASURE_MEMORY_REGION   (1UL << 0)

  struct kvm_tdx_init_mem_region {
          __u64 source_addr;
          __u64 gpa;
          __u64 nr_pages;
  };
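
A hedged sketch of loading measured initial contents, assuming the
[gpa, gpa + nr_pages] range was already marked private via
KVM_SET_MEMORY_ATTRIBUTES and that the earlier illustrative tdx_ioctl()
helper is available::

  /* Copy nr_pages pages from src into the TD at gpa, extending the measurement. */
  static int tdx_load_initial_image(int vcpu_fd, void *src, __u64 gpa,
                                    __u64 nr_pages)
  {
          struct kvm_tdx_init_mem_region region = {
                  .source_addr = (__u64)(unsigned long)src,
                  .gpa = gpa,
                  .nr_pages = nr_pages,
          };

          return tdx_ioctl(vcpu_fd, KVM_TDX_INIT_MEM_REGION,
                           KVM_TDX_MEASURE_MEMORY_REGION, &region);
  }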


KVM_TDX_FINALIZE_VM
-------------------
:Type: vm ioctl
:Returns: 0 on success, <0 on error

Complete measurement of the initial TD contents and mark it ready to run.

- id: KVM_TDX_FINALIZE_VM
- flags: must be 0
- data: must be 0
- hw_error: must be 0


KVM_TDX_GET_CPUID
-----------------
:Type: vcpu ioctl
:Returns: 0 on success, <0 on error

Get the CPUID values that the TDX module virtualizes for the TD guest.
When it returns -E2BIG, userspace should allocate a larger buffer and
retry. The minimum buffer size is updated in the nent field of
struct kvm_cpuid2.

- id: KVM_TDX_GET_CPUID
- flags: must be 0
- data: pointer to struct kvm_cpuid2 (in/out)
- hw_error: must be 0 (out)

::

  struct kvm_cpuid2 {
          __u32 nent;
          __u32 padding;
          struct kvm_cpuid_entry2 entries[0];
  };

  struct kvm_cpuid_entry2 {
          __u32 function;
          __u32 index;
          __u32 flags;
          __u32 eax;
          __u32 ebx;
          __u32 ecx;
          __u32 edx;
          __u32 padding[3];
  };
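
The -E2BIG convention above lends itself to a grow-and-retry loop; the sketch
below is illustrative, not kernel code, and uses the raw ioctl so that errno
is not clobbered by logging::

  #include <errno.h>

  static struct kvm_cpuid2 *tdx_get_cpuid(int vcpu_fd)
  {
          __u32 nent = 64;              /* starting guess, grown on -E2BIG */
          struct kvm_cpuid2 *cpuid = NULL;
          struct kvm_tdx_cmd cmd = { .id = KVM_TDX_GET_CPUID };

          for (;;) {
                  free(cpuid);
                  cpuid = calloc(1, sizeof(*cpuid) +
                                    nent * sizeof(struct kvm_cpuid_entry2));
                  if (!cpuid)
                          return NULL;

                  cpuid->nent = nent;
                  cmd.data = (__u64)(unsigned long)cpuid;
                  if (!ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
                          return cpuid;
                  if (errno != E2BIG) {
                          free(cpuid);
                          return NULL;
                  }
                  nent = cpuid->nent;   /* kernel wrote back the required size */
          }
  }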

KVM TDX creation flow
=====================
In addition to the standard KVM flow, new TDX ioctls need to be called. The
control flow is as follows (a consolidated sketch appears after the list):

#. Check system-wide capability

   * KVM_CAP_VM_TYPES: Check if the VM type is supported and if KVM_X86_TDX_VM
     is supported.

#. Create VM

   * KVM_CREATE_VM
   * KVM_TDX_CAPABILITIES: Query TDX capabilities for creating TDX guests.
   * KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPUS): Query the maximum number of VCPUs
     the TD can support at VM level (TDX has its own limitation on this).
   * KVM_SET_TSC_KHZ: Configure the TD's TSC frequency if a TSC frequency
     different from the host's is desired. This is optional.
   * KVM_TDX_INIT_VM: Pass TDX-specific VM parameters.

#. Create VCPU

   * KVM_CREATE_VCPU
   * KVM_TDX_INIT_VCPU: Pass TDX-specific VCPU parameters.
   * KVM_SET_CPUID2: Configure the TD's CPUIDs.
   * KVM_SET_MSRS: Configure the TD's MSRs.

#. Initialize initial guest memory

   * Prepare the content of the initial guest memory.
   * KVM_TDX_INIT_MEM_REGION: Add the initial guest memory.
   * KVM_TDX_FINALIZE_VM: Finalize the measurement of the TDX guest.

#. Run VCPU
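
Putting the steps together, a condensed and hedged sketch of the sequence,
built on the illustrative helpers from the earlier examples (guest_memfd and
memslot setup, CPUID/MSR configuration and the run loop are elided; error
handling is omitted, and the image parameters are assumptions)::

  #include <fcntl.h>

  static int tdx_create_and_run_td(void *image, __u64 image_gpa,
                                   __u64 image_pages)
  {
          int kvm_fd = open("/dev/kvm", O_RDWR);
          int vm_types = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES);

          if (!(vm_types & (1 << KVM_X86_TDX_VM)))
                  return -1;                            /* no TDX VM type */

          int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_TDX_VM);
          struct kvm_tdx_capabilities *caps = tdx_query_caps(vm_fd);

          /* Real VMMs filter the configurable CPUID bits; passed through here. */
          tdx_init_vm(vm_fd, caps->supported_xfam,
                      caps->cpuid.entries, caps->cpuid.nent);

          int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
          tdx_ioctl(vcpu_fd, KVM_TDX_INIT_VCPU, 0, NULL);  /* data = initial RCX */
          /* ... KVM_SET_CPUID2 / KVM_SET_MSRS, consistent with KVM_TDX_INIT_VM ... */

          tdx_load_initial_image(vcpu_fd, image, image_gpa, image_pages);
          tdx_ioctl(vm_fd, KVM_TDX_FINALIZE_VM, 0, NULL);

          return ioctl(vcpu_fd, KVM_RUN, 0);
  }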

References
==========

.. [1] https://www.intel.com/content/www/us/en/developer/tools/trust-domain-extensions/documentation.html
|
@ -2420,7 +2420,12 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
|
||||
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
|
||||
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
|
||||
KVM_X86_QUIRK_SLOT_ZAP_ALL | \
|
||||
KVM_X86_QUIRK_STUFF_FEATURE_MSRS)
|
||||
KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \
|
||||
KVM_X86_QUIRK_IGNORE_GUEST_PAT)
|
||||
|
||||
#define KVM_X86_CONDITIONAL_QUIRKS \
|
||||
(KVM_X86_QUIRK_CD_NW_CLEARED | \
|
||||
KVM_X86_QUIRK_IGNORE_GUEST_PAT)
|
||||
|
||||
/*
|
||||
* KVM previously used a u32 field in kvm_run to indicate the hypercall was
|
||||
|
@ -67,6 +67,7 @@
|
||||
#define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT)
|
||||
|
||||
/* TDX hypercall Leaf IDs */
|
||||
#define TDVMCALL_GET_TD_VM_CALL_INFO 0x10000
|
||||
#define TDVMCALL_MAP_GPA 0x10001
|
||||
#define TDVMCALL_GET_QUOTE 0x10002
|
||||
#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
|
||||
|
@ -585,12 +585,14 @@ enum vm_entry_failure_code {
|
||||
#define EPT_VIOLATION_ACC_WRITE_BIT 1
|
||||
#define EPT_VIOLATION_ACC_INSTR_BIT 2
|
||||
#define EPT_VIOLATION_RWX_SHIFT 3
|
||||
#define EPT_VIOLATION_EXEC_R3_LIN_BIT 6
|
||||
#define EPT_VIOLATION_GVA_IS_VALID_BIT 7
|
||||
#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
|
||||
#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT)
|
||||
#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT)
|
||||
#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT)
|
||||
#define EPT_VIOLATION_RWX_MASK (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT)
|
||||
#define EPT_VIOLATION_EXEC_FOR_RING3_LIN (1 << EPT_VIOLATION_EXEC_R3_LIN_BIT)
|
||||
#define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT)
|
||||
#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
|
||||
|
||||
|
@ -441,6 +441,7 @@ struct kvm_sync_regs {
|
||||
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
|
||||
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
|
||||
#define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8)
|
||||
#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9)
|
||||
|
||||
#define KVM_STATE_NESTED_FORMAT_VMX 0
|
||||
#define KVM_STATE_NESTED_FORMAT_SVM 1
|
||||
|
@ -232,7 +232,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
||||
return -(u32)fault & errcode;
|
||||
}
|
||||
|
||||
bool kvm_mmu_may_ignore_guest_pat(void);
|
||||
bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm);
|
||||
|
||||
int kvm_mmu_post_init_vm(struct kvm *kvm);
|
||||
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
|
||||
|
@ -4663,19 +4663,6 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
|
||||
}
|
||||
#endif
|
||||
|
||||
bool kvm_mmu_may_ignore_guest_pat(void)
|
||||
{
|
||||
/*
|
||||
* When EPT is enabled (shadow_memtype_mask is non-zero), and the VM
|
||||
* has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
|
||||
* honor the memtype from the guest's PAT so that guest accesses to
|
||||
* memory that is DMA'd aren't cached against the guest's wishes. As a
|
||||
* result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
|
||||
* KVM _always_ ignores guest PAT (when EPT is enabled).
|
||||
*/
|
||||
return shadow_memtype_mask;
|
||||
}
|
||||
|
||||
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
|
@ -37,7 +37,6 @@ u64 __read_mostly shadow_mmio_value;
|
||||
u64 __read_mostly shadow_mmio_mask;
|
||||
u64 __read_mostly shadow_mmio_access_mask;
|
||||
u64 __read_mostly shadow_present_mask;
|
||||
u64 __read_mostly shadow_memtype_mask;
|
||||
u64 __read_mostly shadow_me_value;
|
||||
u64 __read_mostly shadow_me_mask;
|
||||
u64 __read_mostly shadow_acc_track_mask;
|
||||
@ -203,9 +202,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
|
||||
if (level > PG_LEVEL_4K)
|
||||
spte |= PT_PAGE_SIZE_MASK;
|
||||
|
||||
if (shadow_memtype_mask)
|
||||
spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
|
||||
kvm_is_mmio_pfn(pfn));
|
||||
spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn));
|
||||
if (host_writable)
|
||||
spte |= shadow_host_writable_mask;
|
||||
else
|
||||
@ -460,13 +457,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
|
||||
/* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
|
||||
shadow_present_mask =
|
||||
(has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
|
||||
/*
|
||||
* EPT overrides the host MTRRs, and so KVM must program the desired
|
||||
* memtype directly into the SPTEs. Note, this mask is just the mask
|
||||
* of all bits that factor into the memtype, the actual memtype must be
|
||||
* dynamically calculated, e.g. to ensure host MMIO is mapped UC.
|
||||
*/
|
||||
shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT;
|
||||
|
||||
shadow_acc_track_mask = VMX_EPT_RWX_MASK;
|
||||
shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
|
||||
shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
|
||||
@ -518,12 +509,6 @@ void kvm_mmu_reset_all_pte_masks(void)
|
||||
shadow_x_mask = 0;
|
||||
shadow_present_mask = PT_PRESENT_MASK;
|
||||
|
||||
/*
|
||||
* For shadow paging and NPT, KVM uses PAT entry '0' to encode WB
|
||||
* memtype in the SPTEs, i.e. relies on host MTRRs to provide the
|
||||
* correct memtype (WB is the "weakest" memtype).
|
||||
*/
|
||||
shadow_memtype_mask = 0;
|
||||
shadow_acc_track_mask = 0;
|
||||
shadow_me_mask = 0;
|
||||
shadow_me_value = 0;
|
||||
|
@ -187,7 +187,6 @@ extern u64 __read_mostly shadow_mmio_value;
|
||||
extern u64 __read_mostly shadow_mmio_mask;
|
||||
extern u64 __read_mostly shadow_mmio_access_mask;
|
||||
extern u64 __read_mostly shadow_present_mask;
|
||||
extern u64 __read_mostly shadow_memtype_mask;
|
||||
extern u64 __read_mostly shadow_me_value;
|
||||
extern u64 __read_mostly shadow_me_mask;
|
||||
|
||||
|
@ -5472,6 +5472,7 @@ static __init int svm_hardware_setup(void)
|
||||
*/
|
||||
allow_smaller_maxphyaddr = !npt_enabled;
|
||||
|
||||
kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;
|
||||
return 0;
|
||||
|
||||
err:
|
||||
|
@ -193,6 +193,56 @@ static int vt_handle_exit(struct kvm_vcpu *vcpu,
|
||||
return vmx_handle_exit(vcpu, fastpath);
|
||||
}
|
||||
|
||||
static int vt_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
|
||||
{
|
||||
if (unlikely(is_td_vcpu(vcpu)))
|
||||
return tdx_set_msr(vcpu, msr_info);
|
||||
|
||||
return vmx_set_msr(vcpu, msr_info);
|
||||
}
|
||||
|
||||
/*
|
||||
* The kvm parameter can be NULL (module initialization, or invocation before
|
||||
* VM creation). Be sure to check the kvm parameter before using it.
|
||||
*/
|
||||
static bool vt_has_emulated_msr(struct kvm *kvm, u32 index)
|
||||
{
|
||||
if (kvm && is_td(kvm))
|
||||
return tdx_has_emulated_msr(index);
|
||||
|
||||
return vmx_has_emulated_msr(kvm, index);
|
||||
}
|
||||
|
||||
static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
|
||||
{
|
||||
if (unlikely(is_td_vcpu(vcpu)))
|
||||
return tdx_get_msr(vcpu, msr_info);
|
||||
|
||||
return vmx_get_msr(vcpu, msr_info);
|
||||
}
|
||||
|
||||
static void vt_msr_filter_changed(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/*
|
||||
* TDX doesn't allow VMM to configure interception of MSR accesses.
|
||||
* TDX guest requests MSR accesses by calling TDVMCALL. The MSR
|
||||
* filters will be applied when handling the TDVMCALL for RDMSR/WRMSR
|
||||
* if the userspace has set any.
|
||||
*/
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_msr_filter_changed(vcpu);
|
||||
}
|
||||
|
||||
static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return tdx_complete_emulated_msr(vcpu, err);
|
||||
|
||||
return kvm_complete_insn_gp(vcpu, err);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_KVM_SMM
|
||||
static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
|
||||
{
|
||||
@ -228,6 +278,22 @@ static void vt_enable_smi_window(struct kvm_vcpu *vcpu)
|
||||
}
|
||||
#endif
|
||||
|
||||
static int vt_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
|
||||
void *insn, int insn_len)
|
||||
{
|
||||
/*
|
||||
* For TDX, this can only be triggered for MMIO emulation. Let the
|
||||
* guest retry after installing the SPTE with suppress #VE bit cleared,
|
||||
* so that the guest will receive #VE when retry. The guest is expected
|
||||
* to call TDG.VP.VMCALL<MMIO> to request VMM to do MMIO emulation on
|
||||
* #VE.
|
||||
*/
|
||||
if (is_td_vcpu(vcpu))
|
||||
return X86EMUL_RETRY_INSTR;
|
||||
|
||||
return vmx_check_emulate_instruction(vcpu, emul_type, insn, insn_len);
|
||||
}
|
||||
|
||||
static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/*
|
||||
@ -285,6 +351,214 @@ static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
|
||||
vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector);
|
||||
}
|
||||
|
||||
static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_vcpu_after_set_cpuid(vcpu);
|
||||
}
|
||||
|
||||
static void vt_update_exception_bitmap(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_update_exception_bitmap(vcpu);
|
||||
}
|
||||
|
||||
static u64 vt_get_segment_base(struct kvm_vcpu *vcpu, int seg)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return 0;
|
||||
|
||||
return vmx_get_segment_base(vcpu, seg);
|
||||
}
|
||||
|
||||
static void vt_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var,
|
||||
int seg)
|
||||
{
|
||||
if (is_td_vcpu(vcpu)) {
|
||||
memset(var, 0, sizeof(*var));
|
||||
return;
|
||||
}
|
||||
|
||||
vmx_get_segment(vcpu, var, seg);
|
||||
}
|
||||
|
||||
static void vt_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var,
|
||||
int seg)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_set_segment(vcpu, var, seg);
|
||||
}
|
||||
|
||||
static int vt_get_cpl(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return 0;
|
||||
|
||||
return vmx_get_cpl(vcpu);
|
||||
}
|
||||
|
||||
static int vt_get_cpl_no_cache(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return 0;
|
||||
|
||||
return vmx_get_cpl_no_cache(vcpu);
|
||||
}
|
||||
|
||||
static void vt_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
|
||||
{
|
||||
if (is_td_vcpu(vcpu)) {
|
||||
*db = 0;
|
||||
*l = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
vmx_get_cs_db_l_bits(vcpu, db, l);
|
||||
}
|
||||
|
||||
static bool vt_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return true;
|
||||
|
||||
return vmx_is_valid_cr0(vcpu, cr0);
|
||||
}
|
||||
|
||||
static void vt_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_set_cr0(vcpu, cr0);
|
||||
}
|
||||
|
||||
static bool vt_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return true;
|
||||
|
||||
return vmx_is_valid_cr4(vcpu, cr4);
|
||||
}
|
||||
|
||||
static void vt_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_set_cr4(vcpu, cr4);
|
||||
}
|
||||
|
||||
static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return 0;
|
||||
|
||||
return vmx_set_efer(vcpu, efer);
|
||||
}
|
||||
|
||||
static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
|
||||
{
|
||||
if (is_td_vcpu(vcpu)) {
|
||||
memset(dt, 0, sizeof(*dt));
|
||||
return;
|
||||
}
|
||||
|
||||
vmx_get_idt(vcpu, dt);
|
||||
}
|
||||
|
||||
static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_set_idt(vcpu, dt);
|
||||
}
|
||||
|
||||
static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
|
||||
{
|
||||
if (is_td_vcpu(vcpu)) {
|
||||
memset(dt, 0, sizeof(*dt));
|
||||
return;
|
||||
}
|
||||
|
||||
vmx_get_gdt(vcpu, dt);
|
||||
}
|
||||
|
||||
static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_set_gdt(vcpu, dt);
|
||||
}
|
||||
|
||||
static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_set_dr6(vcpu, val);
|
||||
}
|
||||
|
||||
static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_set_dr7(vcpu, val);
|
||||
}
|
||||
|
||||
static void vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/*
|
||||
* MOV-DR exiting is always cleared for TD guest, even in debug mode.
|
||||
* Thus KVM_DEBUGREG_WONT_EXIT can never be set and it should never
|
||||
* reach here for TD vcpu.
|
||||
*/
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_sync_dirty_debug_regs(vcpu);
|
||||
}
|
||||
|
||||
static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
|
||||
{
|
||||
if (WARN_ON_ONCE(is_td_vcpu(vcpu)))
|
||||
return;
|
||||
|
||||
vmx_cache_reg(vcpu, reg);
|
||||
}
|
||||
|
||||
static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return 0;
|
||||
|
||||
return vmx_get_rflags(vcpu);
|
||||
}
|
||||
|
||||
static void vt_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_set_rflags(vcpu, rflags);
|
||||
}
|
||||
|
||||
static bool vt_get_if_flag(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return false;
|
||||
|
||||
return vmx_get_if_flag(vcpu);
|
||||
}
|
||||
|
||||
static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu)) {
|
||||
@ -399,6 +673,19 @@ static u32 vt_get_interrupt_shadow(struct kvm_vcpu *vcpu)
|
||||
return vmx_get_interrupt_shadow(vcpu);
|
||||
}
|
||||
|
||||
static void vt_patch_hypercall(struct kvm_vcpu *vcpu,
|
||||
unsigned char *hypercall)
|
||||
{
|
||||
/*
|
||||
* Because guest memory is protected, guest can't be patched. TD kernel
|
||||
* is modified to use TDG.VP.VMCALL for hypercall.
|
||||
*/
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_patch_hypercall(vcpu, hypercall);
|
||||
}
|
||||
|
||||
static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
@ -407,6 +694,14 @@ static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
|
||||
vmx_inject_irq(vcpu, reinjected);
|
||||
}
|
||||
|
||||
static void vt_inject_exception(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_inject_exception(vcpu);
|
||||
}
|
||||
|
||||
static void vt_cancel_injection(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
@ -418,7 +713,7 @@ static void vt_cancel_injection(struct kvm_vcpu *vcpu)
|
||||
static int vt_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return true;
|
||||
return tdx_interrupt_allowed(vcpu);
|
||||
|
||||
return vmx_interrupt_allowed(vcpu, for_injection);
|
||||
}
|
||||
@ -454,6 +749,14 @@ static void vt_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
|
||||
vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code);
|
||||
}
|
||||
|
||||
static void vt_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_update_cr8_intercept(vcpu, tpr, irr);
|
||||
}
|
||||
|
||||
static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
@ -472,6 +775,95 @@ static void vt_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
|
||||
vmx_refresh_apicv_exec_ctrl(vcpu);
|
||||
}
|
||||
|
||||
static void vt_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_load_eoi_exitmap(vcpu, eoi_exit_bitmap);
|
||||
}
|
||||
|
||||
static int vt_set_tss_addr(struct kvm *kvm, unsigned int addr)
|
||||
{
|
||||
if (is_td(kvm))
|
||||
return 0;
|
||||
|
||||
return vmx_set_tss_addr(kvm, addr);
|
||||
}
|
||||
|
||||
static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
|
||||
{
|
||||
if (is_td(kvm))
|
||||
return 0;
|
||||
|
||||
return vmx_set_identity_map_addr(kvm, ident_addr);
|
||||
}
|
||||
|
||||
static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/* TDX doesn't support L2 guest at the moment. */
|
||||
if (is_td_vcpu(vcpu))
|
||||
return 0;
|
||||
|
||||
return vmx_get_l2_tsc_offset(vcpu);
|
||||
}
|
||||
|
||||
static u64 vt_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/* TDX doesn't support L2 guest at the moment. */
|
||||
if (is_td_vcpu(vcpu))
|
||||
return 0;
|
||||
|
||||
return vmx_get_l2_tsc_multiplier(vcpu);
|
||||
}
|
||||
|
||||
static void vt_write_tsc_offset(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/* In TDX, tsc offset can't be changed. */
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_write_tsc_offset(vcpu);
|
||||
}
|
||||
|
||||
static void vt_write_tsc_multiplier(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/* In TDX, tsc multiplier can't be changed. */
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_write_tsc_multiplier(vcpu);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
|
||||
bool *expired)
|
||||
{
|
||||
/* VMX-preemption timer isn't available for TDX. */
|
||||
if (is_td_vcpu(vcpu))
|
||||
return -EINVAL;
|
||||
|
||||
return vmx_set_hv_timer(vcpu, guest_deadline_tsc, expired);
|
||||
}
|
||||
|
||||
static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/* VMX-preemption timer can't be set. See vt_set_hv_timer(). */
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_cancel_hv_timer(vcpu);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void vt_setup_mce(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_td_vcpu(vcpu))
|
||||
return;
|
||||
|
||||
vmx_setup_mce(vcpu);
|
||||
}
|
||||
|
||||
static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
|
||||
{
|
||||
if (!is_td(kvm))
|
||||
@ -516,7 +908,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
|
||||
.disable_virtualization_cpu = vt_disable_virtualization_cpu,
|
||||
.emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
|
||||
|
||||
.has_emulated_msr = vmx_has_emulated_msr,
|
||||
.has_emulated_msr = vt_has_emulated_msr,
|
||||
|
||||
.vm_size = sizeof(struct kvm_vmx),
|
||||
|
||||
@ -533,32 +925,33 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
|
||||
.vcpu_load = vt_vcpu_load,
|
||||
.vcpu_put = vt_vcpu_put,
|
||||
|
||||
.update_exception_bitmap = vmx_update_exception_bitmap,
|
||||
.update_exception_bitmap = vt_update_exception_bitmap,
|
||||
.get_feature_msr = vmx_get_feature_msr,
|
||||
.get_msr = vmx_get_msr,
|
||||
.set_msr = vmx_set_msr,
|
||||
.get_segment_base = vmx_get_segment_base,
|
||||
.get_segment = vmx_get_segment,
|
||||
.set_segment = vmx_set_segment,
|
||||
.get_cpl = vmx_get_cpl,
|
||||
.get_cpl_no_cache = vmx_get_cpl_no_cache,
|
||||
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
|
||||
.is_valid_cr0 = vmx_is_valid_cr0,
|
||||
.set_cr0 = vmx_set_cr0,
|
||||
.is_valid_cr4 = vmx_is_valid_cr4,
|
||||
.set_cr4 = vmx_set_cr4,
|
||||
.set_efer = vmx_set_efer,
|
||||
.get_idt = vmx_get_idt,
|
||||
.set_idt = vmx_set_idt,
|
||||
.get_gdt = vmx_get_gdt,
|
||||
.set_gdt = vmx_set_gdt,
|
||||
.set_dr6 = vmx_set_dr6,
|
||||
.set_dr7 = vmx_set_dr7,
|
||||
.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
|
||||
.cache_reg = vmx_cache_reg,
|
||||
.get_rflags = vmx_get_rflags,
|
||||
.set_rflags = vmx_set_rflags,
|
||||
.get_if_flag = vmx_get_if_flag,
|
||||
.get_msr = vt_get_msr,
|
||||
.set_msr = vt_set_msr,
|
||||
|
||||
.get_segment_base = vt_get_segment_base,
|
||||
.get_segment = vt_get_segment,
|
||||
.set_segment = vt_set_segment,
|
||||
.get_cpl = vt_get_cpl,
|
||||
.get_cpl_no_cache = vt_get_cpl_no_cache,
|
||||
.get_cs_db_l_bits = vt_get_cs_db_l_bits,
|
||||
.is_valid_cr0 = vt_is_valid_cr0,
|
||||
.set_cr0 = vt_set_cr0,
|
||||
.is_valid_cr4 = vt_is_valid_cr4,
|
||||
.set_cr4 = vt_set_cr4,
|
||||
.set_efer = vt_set_efer,
|
||||
.get_idt = vt_get_idt,
|
||||
.set_idt = vt_set_idt,
|
||||
.get_gdt = vt_get_gdt,
|
||||
.set_gdt = vt_set_gdt,
|
||||
.set_dr6 = vt_set_dr6,
|
||||
.set_dr7 = vt_set_dr7,
|
||||
.sync_dirty_debug_regs = vt_sync_dirty_debug_regs,
|
||||
.cache_reg = vt_cache_reg,
|
||||
.get_rflags = vt_get_rflags,
|
||||
.set_rflags = vt_set_rflags,
|
||||
.get_if_flag = vt_get_if_flag,
|
||||
|
||||
.flush_tlb_all = vt_flush_tlb_all,
|
||||
.flush_tlb_current = vt_flush_tlb_current,
|
||||
@ -572,10 +965,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
|
||||
.update_emulated_instruction = vmx_update_emulated_instruction,
|
||||
.set_interrupt_shadow = vt_set_interrupt_shadow,
|
||||
.get_interrupt_shadow = vt_get_interrupt_shadow,
|
||||
.patch_hypercall = vmx_patch_hypercall,
|
||||
.patch_hypercall = vt_patch_hypercall,
|
||||
.inject_irq = vt_inject_irq,
|
||||
.inject_nmi = vt_inject_nmi,
|
||||
.inject_exception = vmx_inject_exception,
|
||||
.inject_exception = vt_inject_exception,
|
||||
.cancel_injection = vt_cancel_injection,
|
||||
.interrupt_allowed = vt_interrupt_allowed,
|
||||
.nmi_allowed = vt_nmi_allowed,
|
||||
@ -583,13 +976,13 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
|
||||
.set_nmi_mask = vt_set_nmi_mask,
|
||||
.enable_nmi_window = vt_enable_nmi_window,
|
||||
.enable_irq_window = vt_enable_irq_window,
|
||||
.update_cr8_intercept = vmx_update_cr8_intercept,
|
||||
.update_cr8_intercept = vt_update_cr8_intercept,
|
||||
|
||||
.x2apic_icr_is_split = false,
|
||||
.set_virtual_apic_mode = vt_set_virtual_apic_mode,
|
||||
.set_apic_access_page_addr = vt_set_apic_access_page_addr,
|
||||
.refresh_apicv_exec_ctrl = vt_refresh_apicv_exec_ctrl,
|
||||
.load_eoi_exitmap = vmx_load_eoi_exitmap,
|
||||
.load_eoi_exitmap = vt_load_eoi_exitmap,
|
||||
.apicv_pre_state_restore = vt_apicv_pre_state_restore,
|
||||
.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
|
||||
.hwapic_isr_update = vt_hwapic_isr_update,
|
||||
@ -597,21 +990,21 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
|
||||
.deliver_interrupt = vt_deliver_interrupt,
|
||||
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
|
||||
|
||||
.set_tss_addr = vmx_set_tss_addr,
|
||||
.set_identity_map_addr = vmx_set_identity_map_addr,
|
||||
.set_tss_addr = vt_set_tss_addr,
|
||||
.set_identity_map_addr = vt_set_identity_map_addr,
|
||||
.get_mt_mask = vmx_get_mt_mask,
|
||||
|
||||
.get_exit_info = vt_get_exit_info,
|
||||
.get_entry_info = vt_get_entry_info,
|
||||
|
||||
.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
|
||||
.vcpu_after_set_cpuid = vt_vcpu_after_set_cpuid,
|
||||
|
||||
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
|
||||
|
||||
.get_l2_tsc_offset = vmx_get_l2_tsc_offset,
|
||||
.get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
|
||||
.write_tsc_offset = vmx_write_tsc_offset,
|
||||
.write_tsc_multiplier = vmx_write_tsc_multiplier,
|
||||
.get_l2_tsc_offset = vt_get_l2_tsc_offset,
|
||||
.get_l2_tsc_multiplier = vt_get_l2_tsc_multiplier,
|
||||
.write_tsc_offset = vt_write_tsc_offset,
|
||||
.write_tsc_multiplier = vt_write_tsc_multiplier,
|
||||
|
||||
.load_mmu_pgd = vt_load_mmu_pgd,
|
||||
|
||||
@ -626,11 +1019,11 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
|
||||
.pi_start_assignment = vmx_pi_start_assignment,
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
.set_hv_timer = vmx_set_hv_timer,
|
||||
.cancel_hv_timer = vmx_cancel_hv_timer,
|
||||
.set_hv_timer = vt_set_hv_timer,
|
||||
.cancel_hv_timer = vt_cancel_hv_timer,
|
||||
#endif
|
||||
|
||||
.setup_mce = vmx_setup_mce,
|
||||
.setup_mce = vt_setup_mce,
|
||||
|
||||
#ifdef CONFIG_KVM_SMM
|
||||
.smi_allowed = vt_smi_allowed,
|
||||
@ -639,12 +1032,12 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
|
||||
.enable_smi_window = vt_enable_smi_window,
|
||||
#endif
|
||||
|
||||
.check_emulate_instruction = vmx_check_emulate_instruction,
|
||||
.check_emulate_instruction = vt_check_emulate_instruction,
|
||||
.apic_init_signal_blocked = vt_apic_init_signal_blocked,
|
||||
.migrate_timers = vmx_migrate_timers,
|
||||
|
||||
.msr_filter_changed = vmx_msr_filter_changed,
|
||||
.complete_emulated_msr = kvm_complete_insn_gp,
|
||||
.msr_filter_changed = vt_msr_filter_changed,
|
||||
.complete_emulated_msr = vt_complete_emulated_msr,
|
||||
|
||||
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
|
||||
|
||||
@ -698,6 +1091,7 @@ static int __init vt_init(void)
|
||||
sizeof(struct vcpu_tdx));
|
||||
vcpu_align = max_t(unsigned, vcpu_align,
|
||||
__alignof__(struct vcpu_tdx));
|
||||
kvm_caps.supported_vm_types |= BIT(KVM_X86_TDX_VM);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -203,7 +203,8 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
|
||||
return;
|
||||
|
||||
if (kvm_vcpu_is_blocking(vcpu) &&
|
||||
(is_td_vcpu(vcpu) || !vmx_interrupt_blocked(vcpu)))
|
||||
((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
|
||||
(!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
|
||||
pi_enable_wakeup_handler(vcpu);
|
||||
|
||||
/*
|
||||
|
@ -295,6 +295,26 @@ static void tdx_clear_page(struct page *page)
|
||||
__mb();
|
||||
}
|
||||
|
||||
static void tdx_no_vcpus_enter_start(struct kvm *kvm)
|
||||
{
|
||||
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
|
||||
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
|
||||
WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
|
||||
|
||||
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
|
||||
}
|
||||
|
||||
static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
|
||||
{
|
||||
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
|
||||
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
|
||||
WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
|
||||
}
|
||||
|
||||
/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
|
||||
static int __tdx_reclaim_page(struct page *page)
|
||||
{
|
||||
@ -605,6 +625,7 @@ int tdx_vm_init(struct kvm *kvm)
|
||||
|
||||
kvm->arch.has_protected_state = true;
|
||||
kvm->arch.has_private_mem = true;
|
||||
kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
|
||||
|
||||
/*
|
||||
* Because guest TD is protected, VMM can't parse the instruction in TD.
|
||||
@ -706,9 +727,39 @@ void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
local_irq_enable();
|
||||
}
|
||||
|
||||
bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/*
|
||||
* KVM can't get the interrupt status of TDX guest and it assumes
|
||||
* interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
|
||||
* which passes the interrupt blocked flag.
|
||||
*/
|
||||
return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
|
||||
!to_tdx(vcpu)->vp_enter_args.r12;
|
||||
}
|
||||
|
||||
bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return pi_has_pending_interrupt(vcpu);
|
||||
u64 vcpu_state_details;
|
||||
|
||||
if (pi_has_pending_interrupt(vcpu))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* Only check RVI pending for HALTED case with IRQ enabled.
|
||||
* For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
|
||||
* interrupt was pending before TD exit, then it _must_ be blocked,
|
||||
* otherwise the interrupt would have been serviced at the instruction
|
||||
* boundary.
|
||||
*/
|
||||
if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
|
||||
to_tdx(vcpu)->vp_enter_args.r12)
|
||||
return false;
|
||||
|
||||
vcpu_state_details =
|
||||
td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
|
||||
|
||||
return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -824,7 +875,11 @@ int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
|
||||
static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
switch (tdvmcall_leaf(vcpu)) {
|
||||
case EXIT_REASON_CPUID:
|
||||
case EXIT_REASON_HLT:
|
||||
case EXIT_REASON_IO_INSTRUCTION:
|
||||
case EXIT_REASON_MSR_READ:
|
||||
case EXIT_REASON_MSR_WRITE:
|
||||
return tdvmcall_leaf(vcpu);
|
||||
case EXIT_REASON_EPT_VIOLATION:
|
||||
return EXIT_REASON_EPT_MISCONFIG;
|
||||
@ -859,6 +914,12 @@ static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
|
||||
return EXIT_REASON_VMCALL;
|
||||
|
||||
return tdcall_to_vmx_exit_reason(vcpu);
|
||||
case EXIT_REASON_EPT_MISCONFIG:
|
||||
/*
|
||||
* Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
|
||||
* non-instrumentable code with interrupts disabled.
|
||||
*/
|
||||
return -1u;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -974,6 +1035,14 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
|
||||
*/
|
||||
WARN_ON_ONCE(force_immediate_exit);
|
||||
|
||||
/*
|
||||
* Wait until retry of SEPT-zap-related SEAMCALL completes before
|
||||
* allowing vCPU entry to avoid contention with tdh_vp_enter() and
|
||||
* TDCALLs.
|
||||
*/
|
||||
if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
|
||||
return EXIT_FASTPATH_EXIT_HANDLED;
|
||||
|
||||
trace_kvm_entry(vcpu, force_immediate_exit);
|
||||
|
||||
if (pi_test_on(&vt->pi_desc)) {
|
||||
@ -994,6 +1063,9 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
|
||||
|
||||
vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
|
||||
|
||||
if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
|
||||
return EXIT_FASTPATH_NONE;
|
||||
|
||||
if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
|
||||
return EXIT_FASTPATH_NONE;
|
||||
|
||||
@ -1091,9 +1163,7 @@ static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
|
||||
/*
|
||||
* Stop processing the remaining part if there is a pending interrupt,
|
||||
* which could be qualified to deliver. Skip checking pending RVI for
|
||||
* TDVMCALL_MAP_GPA.
|
||||
* TODO: Add a comment to link the reason when the target function is
|
||||
* implemented.
|
||||
* TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
|
||||
*/
|
||||
if (kvm_vcpu_has_events(vcpu)) {
|
||||
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
|
||||
@ -1201,6 +1271,25 @@ static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
u32 eax, ebx, ecx, edx;
|
||||
struct vcpu_tdx *tdx = to_tdx(vcpu);
|
||||
|
||||
/* EAX and ECX for cpuid is stored in R12 and R13. */
|
||||
eax = tdx->vp_enter_args.r12;
|
||||
ecx = tdx->vp_enter_args.r13;
|
||||
|
||||
kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
|
||||
|
||||
tdx->vp_enter_args.r12 = eax;
|
||||
tdx->vp_enter_args.r13 = ebx;
|
||||
tdx->vp_enter_args.r14 = ecx;
|
||||
tdx->vp_enter_args.r15 = edx;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
vcpu->arch.pio.count = 0;
|
||||
@ -1360,6 +1449,20 @@ static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_tdx *tdx = to_tdx(vcpu);
|
||||
|
||||
if (tdx->vp_enter_args.r12)
|
||||
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
|
||||
else {
|
||||
tdx->vp_enter_args.r11 = 0;
|
||||
tdx->vp_enter_args.r13 = 0;
|
||||
tdx->vp_enter_args.r14 = 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
switch (tdvmcall_leaf(vcpu)) {
|
||||
@ -1367,6 +1470,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
|
||||
return tdx_map_gpa(vcpu);
|
||||
case TDVMCALL_REPORT_FATAL_ERROR:
|
||||
return tdx_report_fatal_error(vcpu);
|
||||
case TDVMCALL_GET_TD_VM_CALL_INFO:
|
||||
return tdx_get_td_vm_call_info(vcpu);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -1484,15 +1589,24 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
|
||||
if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
|
||||
return -EINVAL;
|
||||
|
||||
do {
|
||||
/*
|
||||
* When zapping private page, write lock is held. So no race condition
|
||||
* with other vcpu sept operation.
|
||||
* Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
|
||||
*/
|
||||
err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
|
||||
&level_state);
|
||||
|
||||
if (unlikely(tdx_operand_busy(err))) {
|
||||
/*
|
||||
* When zapping private page, write lock is held. So no race
|
||||
* condition with other vcpu sept operation. Race only with
|
||||
* TDH.VP.ENTER.
|
||||
* The second retry is expected to succeed after kicking off all
|
||||
* other vCPUs and prevent them from invoking TDH.VP.ENTER.
|
||||
*/
|
||||
tdx_no_vcpus_enter_start(kvm);
|
||||
err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
|
||||
&level_state);
|
||||
} while (unlikely(tdx_operand_busy(err)));
|
||||
tdx_no_vcpus_enter_stop(kvm);
|
||||
}
|
||||
|
||||
if (KVM_BUG_ON(err, kvm)) {
|
||||
pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
|
||||
@ -1576,9 +1690,13 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
|
||||
WARN_ON_ONCE(level != PG_LEVEL_4K);
|
||||
|
||||
err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
|
||||
if (unlikely(tdx_operand_busy(err)))
|
||||
return -EBUSY;
|
||||
|
||||
if (unlikely(tdx_operand_busy(err))) {
|
||||
/* After no vCPUs enter, the second retry is expected to succeed */
|
||||
tdx_no_vcpus_enter_start(kvm);
|
||||
err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
|
||||
tdx_no_vcpus_enter_stop(kvm);
|
||||
}
|
||||
if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
|
||||
!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
|
||||
atomic64_dec(&kvm_tdx->nr_premapped);
|
||||
@ -1628,9 +1746,13 @@ static void tdx_track(struct kvm *kvm)
|
||||
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
|
||||
do {
|
||||
err = tdh_mem_track(&kvm_tdx->td);
|
||||
if (unlikely(tdx_operand_busy(err))) {
|
||||
/* After no vCPUs enter, the second retry is expected to succeed */
|
||||
tdx_no_vcpus_enter_start(kvm);
|
||||
err = tdh_mem_track(&kvm_tdx->td);
|
||||
} while (unlikely(tdx_operand_busy(err)));
|
||||
tdx_no_vcpus_enter_stop(kvm);
|
||||
}
|
||||
|
||||
if (KVM_BUG_ON(err, kvm))
|
||||
pr_tdx_error(TDH_MEM_TRACK, err);
|
||||
@ -1700,6 +1822,123 @@ void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
|
||||
trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
|
||||
}
|
||||
|
||||
static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
|
||||
u64 eq = vmx_get_exit_qual(vcpu);
|
||||
|
||||
if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
|
||||
return false;
|
||||
|
||||
return !(eq & EPT_VIOLATION_RWX_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
|
||||
}
|
||||
|
||||
static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
unsigned long exit_qual;
|
||||
gpa_t gpa = to_tdx(vcpu)->exit_gpa;
|
||||
bool local_retry = false;
|
||||
int ret;
|
||||
|
||||
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
|
||||
if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
|
||||
pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
|
||||
gpa, vcpu->vcpu_id);
|
||||
kvm_vm_dead(vcpu->kvm);
|
||||
return -EIO;
|
||||
}
|
||||
/*
|
||||
* Always treat SEPT violations as write faults. Ignore the
|
||||
* EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
|
||||
* TD private pages are always RWX in the SEPT tables,
|
||||
* i.e. they're always mapped writable. Just as importantly,
|
||||
* treating SEPT violations as write faults is necessary to
|
||||
* avoid COW allocations, which will cause TDAUGPAGE failures
|
||||
* due to aliasing a single HPA to multiple GPAs.
|
||||
*/
|
||||
exit_qual = EPT_VIOLATION_ACC_WRITE;
|
||||
|
||||
/* Only private GPA triggers zero-step mitigation */
|
||||
local_retry = true;
|
||||
} else {
|
||||
exit_qual = vmx_get_exit_qual(vcpu);
|
||||
/*
|
||||
* EPT violation due to instruction fetch should never be
|
||||
* triggered from shared memory in TDX guest. If such EPT
|
||||
* violation occurs, treat it as broken hardware.
|
||||
*/
|
||||
if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
trace_kvm_page_fault(vcpu, gpa, exit_qual);
|
||||
|
||||
/*
|
||||
* To minimize TDH.VP.ENTER invocations, retry locally for private GPA
|
||||
* mapping in TDX.
|
||||
*
|
||||
* KVM may return RET_PF_RETRY for private GPA due to
|
||||
* - contentions when atomically updating SPTEs of the mirror page table
|
||||
* - in-progress GFN invalidation or memslot removal.
|
||||
* - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
|
||||
* caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
|
||||
* or certain TDCALLs.
|
||||
*
|
||||
* If TDH.VP.ENTER is invoked more times than the threshold set by the
|
||||
* TDX module before KVM resolves the private GPA mapping, the TDX
|
||||
* module will activate zero-step mitigation during TDH.VP.ENTER. This
|
||||
* process acquires an SEPT tree lock in the TDX module, leading to
|
||||
* further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
|
||||
* operations on other vCPUs.
|
||||
*
|
||||
* Breaking out of local retries for kvm_vcpu_has_events() is for
|
||||
* interrupt injection. kvm_vcpu_has_events() should not see pending
|
||||
* events for TDX. Since KVM can't determine if IRQs (or NMIs) are
|
||||
* blocked by TDs, false positives are inevitable i.e., KVM may re-enter
|
||||
* the guest even if the IRQ/NMI can't be delivered.
|
||||
*
|
||||
* Note: even without breaking out of local retries, zero-step
|
||||
* mitigation may still occur due to
|
||||
* - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
|
||||
* - a single RIP causing EPT violations for more GFNs than the
|
||||
* threshold count.
|
||||
* This is safe, as triggering zero-step mitigation only introduces
|
||||
* contentions to page installation SEAMCALLs on other vCPUs, which will
|
||||
* handle retries locally in their EPT violation handlers.
|
||||
*/
|
||||
while (1) {
|
||||
ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
|
||||
|
||||
if (ret != RET_PF_RETRY || !local_retry)
|
||||
break;
|
||||
|
||||
if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
|
||||
break;
|
||||
|
||||
if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
|
||||
ret = -EIO;
|
||||
break;
|
||||
}
|
||||
|
||||
cond_resched();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
|
||||
{
|
||||
if (err) {
|
||||
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
|
||||
tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
|
||||
{
|
||||
struct vcpu_tdx *tdx = to_tdx(vcpu);
|
||||
@ -1709,6 +1948,11 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
|
||||
if (fastpath != EXIT_FASTPATH_NONE)
|
||||
return 1;
|
||||
|
||||
if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
|
||||
KVM_BUG_ON(1, vcpu->kvm);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
|
||||
* TDX_SEAMCALL_VMFAILINVALID.
|
||||
@ -1750,14 +1994,28 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
|
||||
case EXIT_REASON_EXTERNAL_INTERRUPT:
|
||||
++vcpu->stat.irq_exits;
|
||||
return 1;
|
||||
case EXIT_REASON_CPUID:
|
||||
return tdx_emulate_cpuid(vcpu);
|
||||
case EXIT_REASON_HLT:
|
||||
return kvm_emulate_halt_noskip(vcpu);
|
||||
case EXIT_REASON_TDCALL:
|
||||
return handle_tdvmcall(vcpu);
|
||||
case EXIT_REASON_VMCALL:
|
||||
return tdx_emulate_vmcall(vcpu);
|
||||
case EXIT_REASON_IO_INSTRUCTION:
|
||||
return tdx_emulate_io(vcpu);
|
||||
case EXIT_REASON_MSR_READ:
|
||||
kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
|
||||
return kvm_emulate_rdmsr(vcpu);
|
||||
case EXIT_REASON_MSR_WRITE:
|
||||
kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
|
||||
kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
|
||||
kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
|
||||
return kvm_emulate_wrmsr(vcpu);
|
||||
case EXIT_REASON_EPT_MISCONFIG:
|
||||
return tdx_emulate_mmio(vcpu);
|
||||
case EXIT_REASON_EPT_VIOLATION:
|
||||
return tdx_handle_ept_violation(vcpu);
|
||||
case EXIT_REASON_OTHER_SMI:
|
||||
/*
|
||||
* Unlike VMX, SMI in SEAM non-root mode (i.e. when
|
||||
@ -1811,6 +2069,104 @@ void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
|
||||
*error_code = 0;
|
||||
}
|
||||
|
||||
bool tdx_has_emulated_msr(u32 index)
|
||||
{
|
||||
switch (index) {
|
||||
case MSR_IA32_UCODE_REV:
|
||||
case MSR_IA32_ARCH_CAPABILITIES:
|
||||
case MSR_IA32_POWER_CTL:
|
||||
case MSR_IA32_CR_PAT:
|
||||
case MSR_MTRRcap:
|
||||
case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
|
||||
case MSR_MTRRdefType:
|
||||
case MSR_IA32_TSC_DEADLINE:
|
||||
case MSR_IA32_MISC_ENABLE:
|
||||
case MSR_PLATFORM_INFO:
|
||||
case MSR_MISC_FEATURES_ENABLES:
|
||||
case MSR_IA32_APICBASE:
|
||||
case MSR_EFER:
|
||||
case MSR_IA32_FEAT_CTL:
|
||||
case MSR_IA32_MCG_CAP:
|
||||
case MSR_IA32_MCG_STATUS:
|
||||
case MSR_IA32_MCG_CTL:
|
||||
case MSR_IA32_MCG_EXT_CTL:
|
||||
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
|
||||
case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
|
||||
/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
|
||||
case MSR_KVM_POLL_CONTROL:
|
||||
return true;
|
||||
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
|
||||
/*
|
||||
* x2APIC registers that are virtualized by the CPU can't be
|
||||
* emulated, KVM doesn't have access to the virtual APIC page.
|
||||
*/
|
||||
switch (index) {
|
||||
case X2APIC_MSR(APIC_TASKPRI):
|
||||
case X2APIC_MSR(APIC_PROCPRI):
|
||||
case X2APIC_MSR(APIC_EOI):
|
||||
case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
|
||||
case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
|
||||
case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool tdx_is_read_only_msr(u32 index)
|
||||
{
|
||||
return index == MSR_IA32_APICBASE || index == MSR_EFER ||
|
||||
index == MSR_IA32_FEAT_CTL;
|
||||
}
|
||||
|
||||
int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
|
||||
{
|
||||
switch (msr->index) {
|
||||
case MSR_IA32_FEAT_CTL:
|
||||
/*
|
||||
* MCE and MCA are advertised via cpuid. Guest kernel could
|
||||
* check if LMCE is enabled or not.
|
||||
*/
|
||||
msr->data = FEAT_CTL_LOCKED;
|
||||
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
|
||||
msr->data |= FEAT_CTL_LMCE_ENABLED;
|
||||
return 0;
|
||||
case MSR_IA32_MCG_EXT_CTL:
|
||||
if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
|
||||
return 1;
|
||||
msr->data = vcpu->arch.mcg_ext_ctl;
|
||||
return 0;
|
||||
default:
|
||||
if (!tdx_has_emulated_msr(msr->index))
|
||||
return 1;
|
||||
|
||||
return kvm_get_msr_common(vcpu, msr);
|
||||
}
|
||||
}
|
||||
|
||||
int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
|
||||
{
|
||||
switch (msr->index) {
|
||||
case MSR_IA32_MCG_EXT_CTL:
|
||||
if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
|
||||
(msr->data & ~MCG_EXT_CTL_LMCE_EN))
|
||||
return 1;
|
||||
vcpu->arch.mcg_ext_ctl = msr->data;
|
||||
return 0;
|
||||
default:
|
||||
if (tdx_is_read_only_msr(msr->index))
|
||||
return 1;
|
||||
|
||||
if (!tdx_has_emulated_msr(msr->index))
|
||||
return 1;
|
||||
|
||||
return kvm_set_msr_common(vcpu, msr);
|
||||
}
|
||||
}
|
||||
|

static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
{
	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;

@ -3120,6 +3476,11 @@ int __init tdx_bringup(void)

		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
		pr_err("Self-snoop is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
		pr_err("tdx: no TDX private KeyIDs available\n");
		goto success_disable_tdx;

@ -37,6 +37,13 @@ struct kvm_tdx {

	/* For KVM_TDX_INIT_MEM_REGION. */
	atomic64_t nr_premapped;

	/*
	 * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
	 * not contend with tdh_vp_enter() and TDCALLs.
	 * Set/unset is protected with kvm->mmu_lock.
	 */
	bool wait_for_sept_zap;
};

/* TDX module vCPU states */

@ -116,6 +123,7 @@ static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)

}

static __always_inline void tdvps_management_check(u64 field, u8 bits) {}
static __always_inline void tdvps_state_non_arch_check(u64 field, u8 bits) {}

#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass) \
static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx, \

@ -163,11 +171,16 @@ static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx, \

		tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\
}

bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu);
int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err);

TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs);
TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs);
TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);

TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch);

#else
static inline int tdx_bringup(void) { return 0; }

@ -183,6 +196,9 @@ struct vcpu_tdx {

	struct kvm_vcpu vcpu;
};

static inline bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) { return false; }
static inline int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { return 0; }

#endif

#endif

@ -37,6 +37,17 @@ enum tdx_tdcs_execution_control {

	TD_TDCS_EXEC_TSC_MULTIPLIER = 11,
};

enum tdx_vcpu_guest_other_state {
	TD_VCPU_STATE_DETAILS_NON_ARCH = 0x100,
};

#define TDX_VCPU_STATE_DETAILS_INTR_PENDING	BIT_ULL(0)

static inline bool tdx_vcpu_state_details_intr_pending(u64 vcpu_state_details)
{
	return !!(vcpu_state_details & TDX_VCPU_STATE_DETAILS_INTR_PENDING);
}

/* @field is any of enum tdx_tdcs_execution_control */
#define TDCS_EXEC(field)	BUILD_TDX_FIELD(TD_CLASS_EXECUTION_CONTROLS, (field))

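As an illustration of how this helper is meant to be consumed (a sketch, not code quoted from the patch): the non-architectural VCPU_STATE_DETAILS field is read through the accessor stamped out by TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch) above, and the INTR_PENDING bit then tells KVM whether a halted TD vCPU has an interrupt pending.

	/* Sketch only: query whether a halted TDX vCPU has a pending interrupt.
	 * td_state_non_arch_read64() is the helper generated by
	 * TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch). */
	static bool tdx_vcpu_has_pending_intr(struct vcpu_tdx *tdx)
	{
		u64 details = td_state_non_arch_read64(tdx, TD_VCPU_STATE_DETAILS_NON_ARCH);

		return tdx_vcpu_state_details_intr_pending(details);
	}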

@ -70,6 +81,8 @@ struct tdx_cpuid_value {

#define TDX_TD_ATTR_KL			BIT_ULL(31)
#define TDX_TD_ATTR_PERFMON		BIT_ULL(63)

#define TDX_EXT_EXIT_QUAL_TYPE_MASK	GENMASK(3, 0)
#define TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION	6
/*
 * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B.
 */

@ -7595,6 +7595,17 @@ int vmx_vm_init(struct kvm *kvm)

	return 0;
}

static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
{
	/*
	 * Non-coherent DMA devices need the guest to flush CPU properly.
	 * In that case it is not possible to map all guest RAM as WB, so
	 * always trust guest PAT.
	 */
	return !kvm_arch_has_noncoherent_dma(kvm) &&
	       kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
}

u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
	/*

@ -7604,13 +7615,8 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)

	if (is_mmio)
		return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;

	/*
	 * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
	 * device attached. Letting the guest control memory types on Intel
	 * CPUs may result in unexpected behavior, and so KVM's ABI is to trust
	 * the guest to behave only as a last resort.
	 */
	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
	/* Force WB if ignoring guest PAT */
	if (vmx_ignore_guest_pat(vcpu->kvm))
		return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;

	return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
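
Summarizing the decision the hunk above arrives at (an editorial illustration, not code from the patch), the effective EPT memory type now depends on MMIO, non-coherent DMA assignment, and the new quirk:

	/* Illustration only: the memory-type choice made by vmx_get_mt_mask()
	 * after this change, written out as a standalone helper. */
	static u8 effective_ept_memtype(bool is_mmio, bool noncoherent_dma, bool quirk_enabled)
	{
		if (is_mmio)
			return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;

		/* Guest PAT is ignored only when no non-coherent DMA device is
		 * assigned and KVM_X86_QUIRK_IGNORE_GUEST_PAT is left enabled. */
		if (!noncoherent_dma && quirk_enabled)
			return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;

		return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
	}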

@ -8428,6 +8434,8 @@ __init int vmx_hardware_setup(void)

	if (enable_ept)
		kvm_mmu_set_ept_masks(enable_ept_ad_bits,
				      cpu_has_vmx_ept_execute_only());
	else
		vt_x86_ops.get_mt_mask = NULL;

	/*
	 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID

@ -8502,6 +8510,27 @@ __init int vmx_hardware_setup(void)

	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);

	/*
	 * On Intel CPUs that lack self-snoop feature, letting the guest control
	 * memory types may result in unexpected behavior. So always ignore guest
	 * PAT on those CPUs and map VM as writeback, not allowing userspace to
	 * disable the quirk.
	 *
	 * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is
	 * supported, UC is slow enough to cause issues with some older guests (e.g.
	 * an old version of bochs driver uses ioremap() instead of ioremap_wc() to
	 * map the video RAM, causing wayland desktop to fail to get started
	 * correctly). To avoid breaking those older guests that rely on KVM to force
	 * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the
	 * safer (for performance) default behavior.
	 *
	 * On top of this, non-coherent DMA devices need the guest to flush CPU
	 * caches properly. This also requires honoring guest PAT, and is forced
	 * independent of the quirk in vmx_ignore_guest_pat().
	 */
	if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
		kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
	kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
	return r;
}
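
For context (illustration only, not from the patch): userspace opts out of this behavior per VM by clearing the quirk bit added by this series, KVM_X86_QUIRK_IGNORE_GUEST_PAT, through KVM_ENABLE_CAP(KVM_CAP_DISABLE_QUIRKS2), after checking that the kernel reports the bit as supported. The surrounding boilerplate (vm_fd) is assumed.

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Illustration only: disable KVM_X86_QUIRK_IGNORE_GUEST_PAT on a VM so
	 * that KVM honors guest PAT (where the quirk is reported as supported).
	 * KVM_X86_QUIRK_IGNORE_GUEST_PAT is the bit added by this series;
	 * older headers will not define it. */
	static int honor_guest_pat(int vm_fd)
	{
		struct kvm_enable_cap cap;
		int supported;

		/* KVM_CHECK_EXTENSION(KVM_CAP_DISABLE_QUIRKS2) now returns
		 * kvm_caps.supported_quirks rather than a compile-time mask. */
		supported = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DISABLE_QUIRKS2);
		if (supported < 0 || !(supported & KVM_X86_QUIRK_IGNORE_GUEST_PAT))
			return -1;

		memset(&cap, 0, sizeof(cap));
		cap.cap = KVM_CAP_DISABLE_QUIRKS2;
		cap.args[0] = KVM_X86_QUIRK_IGNORE_GUEST_PAT;

		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}

Masking against the reported bits first matters because, as the x86.c hunks further down show, KVM_ENABLE_CAP(KVM_CAP_DISABLE_QUIRKS2) now rejects unsupported quirk bits with -EINVAL.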

@ -144,6 +144,9 @@ void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,

void tdx_inject_nmi(struct kvm_vcpu *vcpu);
void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
		       u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
bool tdx_has_emulated_msr(u32 index);
int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);

int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);

@ -187,6 +190,9 @@ static inline void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mo

static inline void tdx_inject_nmi(struct kvm_vcpu *vcpu) {}
static inline void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1,
				     u64 *info2, u32 *intr_info, u32 *error_code) {}
static inline bool tdx_has_emulated_msr(u32 index) { return false; }
static inline int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; }
static inline int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; }

static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }

@ -90,7 +90,6 @@

#include "trace.h"

#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32

/*
 * Note, kvm_caps fields should *never* have default values, all fields must be

@ -4791,7 +4790,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)

		r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
		break;
	case KVM_CAP_DISABLE_QUIRKS2:
		r = KVM_X86_VALID_QUIRKS;
		r = kvm_caps.supported_quirks;
		break;
	case KVM_CAP_X86_NOTIFY_VMEXIT:
		r = kvm_caps.has_notify_vmexit;

@ -6530,11 +6529,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,

	switch (cap->cap) {
	case KVM_CAP_DISABLE_QUIRKS2:
		r = -EINVAL;
		if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
		if (cap->args[0] & ~kvm_caps.supported_quirks)
			break;
		fallthrough;
	case KVM_CAP_DISABLE_QUIRKS:
		kvm->arch.disabled_quirks = cap->args[0];
		kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks;
		r = 0;
		break;
	case KVM_CAP_SPLIT_IRQCHIP: {

@ -9784,6 +9783,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)

		kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
		kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
	}
	kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS;
	kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS;

	rdmsrl_safe(MSR_EFER, &kvm_host.efer);

@ -9828,6 +9829,10 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)

	if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled)
		kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM);

	/* KVM always ignores guest PAT for shadow paging. */
	if (!tdp_enabled)
		kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;

	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
		kvm_caps.supported_xss = 0;

@ -12734,6 +12739,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)

	/* Decided by the vendor code for other VM types. */
	kvm->arch.pre_fault_allowed =
		type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
	kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks;

	ret = kvm_page_track_init(kvm);
	if (ret)

@ -13561,8 +13567,10 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)

	 * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
	 * (or last) non-coherent device is (un)registered to so that new SPTEs
	 * with the correct "ignore guest PAT" setting are created.
	 *
	 * If KVM always honors guest PAT, however, there is nothing to do.
	 */
	if (kvm_mmu_may_ignore_guest_pat())
	if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT))
		kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
}

@ -10,6 +10,8 @@

#include "kvm_emulate.h"
#include "cpuid.h"

#define KVM_MAX_MCE_BANKS 32

struct kvm_caps {
	/* control of guest tsc rate supported? */
	bool has_tsc_control;

@ -32,6 +34,9 @@ struct kvm_caps {

	u64 supported_xcr0;
	u64 supported_xss;
	u64 supported_perf_cap;

	u64 supported_quirks;
	u64 inapplicable_quirks;
};

struct kvm_host_values {