[RFC PATCH v11 14/29] KVM: x86/mmu: Handle page fault for private memory
Paolo Bonzini
pbonzini at redhat.com
Fri Jul 21 08:09:40 PDT 2023
On 7/19/23 01:44, Sean Christopherson wrote:
> From: Chao Peng <chao.p.peng at linux.intel.com>
>
> A KVM_MEM_PRIVATE memslot can include both fd-based private memory and
> hva-based shared memory. Architecture code (like TDX code) can tell
> whether the on-going fault is private or not. This patch adds a
> 'is_private' field to kvm_page_fault to indicate this and architecture
> code is expected to set it.
>
> To handle page fault for such memslot, the handling logic is different
> depending on whether the fault is private or shared. KVM checks if
> 'is_private' matches the host's view of the page (maintained in
> mem_attr_array).
> - For a successful match, private pfn is obtained with
> restrictedmem_get_page() and shared pfn is obtained with existing
> get_user_pages().
> - For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to
> userspace. Userspace then can convert memory between private/shared
> in host's view and retry the fault.
>
> Co-developed-by: Yu Zhang <yu.c.zhang at linux.intel.com>
> Signed-off-by: Yu Zhang <yu.c.zhang at linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng at linux.intel.com>
> Reviewed-by: Fuad Tabba <tabba at google.com>
> Tested-by: Fuad Tabba <tabba at google.com>
> Signed-off-by: Sean Christopherson <seanjc at google.com>
> ---
> arch/x86/kvm/mmu/mmu.c | 82 +++++++++++++++++++++++++++++++--
> arch/x86/kvm/mmu/mmu_internal.h | 3 ++
> arch/x86/kvm/mmu/mmutrace.h | 1 +
> 3 files changed, 81 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index aefe67185637..4cf73a579ee1 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3179,9 +3179,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
> return level;
> }
>
> -int kvm_mmu_max_mapping_level(struct kvm *kvm,
> - const struct kvm_memory_slot *slot, gfn_t gfn,
> - int max_level)
> +static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
> + const struct kvm_memory_slot *slot,
> + gfn_t gfn, int max_level, bool is_private)
> {
> struct kvm_lpage_info *linfo;
> int host_level;
> @@ -3193,6 +3193,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> break;
> }
>
> + if (is_private)
> + return max_level;
> +
> if (max_level == PG_LEVEL_4K)
> return PG_LEVEL_4K;
>
> @@ -3200,6 +3203,16 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> return min(host_level, max_level);
> }
>
> +int kvm_mmu_max_mapping_level(struct kvm *kvm,
> + const struct kvm_memory_slot *slot, gfn_t gfn,
> + int max_level)
> +{
> + bool is_private = kvm_slot_can_be_private(slot) &&
> + kvm_mem_is_private(kvm, gfn);
> +
> + return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
> +}
> +
> void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> {
> struct kvm_memory_slot *slot = fault->slot;
> @@ -3220,8 +3233,9 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> * Enforce the iTLB multihit workaround after capturing the requested
> * level, which will be used to do precise, accurate accounting.
> */
> - fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
> - fault->gfn, fault->max_level);
> + fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
> + fault->gfn, fault->max_level,
> + fault->is_private);
> if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
> return;
>
> @@ -4304,6 +4318,55 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
> kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
> }
>
> +static inline u8 kvm_max_level_for_order(int order)
> +{
> + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
> +
> + MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
> + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
> + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
> +
> + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
> + return PG_LEVEL_1G;
> +
> + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
> + return PG_LEVEL_2M;
> +
> + return PG_LEVEL_4K;
> +}
> +
> +static int kvm_do_memory_fault_exit(struct kvm_vcpu *vcpu,
> + struct kvm_page_fault *fault)
> +{
> + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> + if (fault->is_private)
> + vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE;
> + else
> + vcpu->run->memory.flags = 0;
> + vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT;
> + vcpu->run->memory.size = PAGE_SIZE;
> + return RET_PF_USER;
> +}
> +
> +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
> + struct kvm_page_fault *fault)
> +{
> + int max_order, r;
> +
> + if (!kvm_slot_can_be_private(fault->slot))
> + return kvm_do_memory_fault_exit(vcpu, fault);
> +
> + r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
> + &max_order);
> + if (r)
> + return r;
> +
> + fault->max_level = min(kvm_max_level_for_order(max_order),
> + fault->max_level);
> + fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
> + return RET_PF_CONTINUE;
> +}
> +
> static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> {
> struct kvm_memory_slot *slot = fault->slot;
> @@ -4336,6 +4399,12 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> return RET_PF_EMULATE;
> }
>
> + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn))
> + return kvm_do_memory_fault_exit(vcpu, fault);
> +
> + if (fault->is_private)
> + return kvm_faultin_pfn_private(vcpu, fault);
> +
> async = false;
> fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
> fault->write, &fault->map_writable,
> @@ -5771,6 +5840,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
> return -EIO;
> }
>
> + if (r == RET_PF_USER)
> + return 0;
> +
> if (r < 0)
> return r;
> if (r != RET_PF_EMULATE)
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index d39af5639ce9..268b517e88cb 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -203,6 +203,7 @@ struct kvm_page_fault {
>
> /* Derived from mmu and global state. */
> const bool is_tdp;
> + const bool is_private;
> const bool nx_huge_page_workaround_enabled;
>
> /*
> @@ -259,6 +260,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
> * RET_PF_RETRY: let CPU fault again on the address.
> * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
> * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
> + * RET_PF_USER: need to exit to userspace to handle this fault.
> * RET_PF_FIXED: The faulting entry has been fixed.
> * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
> *
> @@ -275,6 +277,7 @@ enum {
> RET_PF_RETRY,
> RET_PF_EMULATE,
> RET_PF_INVALID,
> + RET_PF_USER,
> RET_PF_FIXED,
> RET_PF_SPURIOUS,
> };
> diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
> index ae86820cef69..2d7555381955 100644
> --- a/arch/x86/kvm/mmu/mmutrace.h
> +++ b/arch/x86/kvm/mmu/mmutrace.h
> @@ -58,6 +58,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
> TRACE_DEFINE_ENUM(RET_PF_RETRY);
> TRACE_DEFINE_ENUM(RET_PF_EMULATE);
> TRACE_DEFINE_ENUM(RET_PF_INVALID);
> +TRACE_DEFINE_ENUM(RET_PF_USER);
> TRACE_DEFINE_ENUM(RET_PF_FIXED);
> TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
>
Reviewed-by: Paolo Bonzini <pbonzini at redhat.com>
More information about the linux-riscv
mailing list