[PATCH v6 40/64] KVM: arm64: nv: Trap and emulate AT instructions from virtual EL2
Alexandru Elisei
alexandru.elisei at arm.com
Thu Feb 24 07:39:16 PST 2022
Hi,
On Fri, Jan 28, 2022 at 12:18:48PM +0000, Marc Zyngier wrote:
> From: Jintack Lim <jintack.lim at linaro.org>
>
> When supporting nested virtualization a guest hypervisor executing AT
> instructions must be trapped and emulated by the host hypervisor,
> because untrapped AT instructions operating on S1E1 will use the wrong
> translation regieme (the one used to emulate virtual EL2 in EL1 instead
s/regieme/regime/
> of virtual EL1) and AT instructions operating on S12 will not work from
> EL1.
>
> This patch does several things.
I think this is a good hint that the patch can be split into several
patches. The size of the patch, plus the complexity of the emulation
logic, makes this patch rather tedious to review.
>
> 1. List and define all AT system instructions to emulate and document
> the emulation design.
>
> 2. Implement AT instruction handling logic in EL2. This will be used to
> emulate AT instructions executed in the virtual EL2.
>
> AT instruction emulation works by loading the proper processor
> context, which depends on the trapped instruction and the virtual
> HCR_EL2, to the EL1 virtual memory control registers and executing AT
> instructions. Note that ctxt->hw_sys_regs is expected to have the
> proper processor context before calling the handling
> function(__kvm_at_insn) implemented in this patch.
>
> 4. Emulate AT S1E[01] instructions by issuing the same instructions in
Hmm... where's point number 3?
> EL2. We set the physical EL1 registers, NV and NV1 bits as described in
> the AT instruction emulation overview.
>
> 5. Emulate AT A12E[01] instructions in two steps: First, do the stage-1
^
I'm guessing that's AT S12E[01].
> translation by reusing the existing AT emulation functions. Second, do
> the stage-2 translation by walking the guest hypervisor's stage-2 page
> table in software. Record the translation result to PAR_EL1.
>
> 6. Emulate AT S1E2 instructions by issuing the corresponding S1E1
> instructions in EL2. We set the physical EL1 registers and the HCR_EL2
> register as described in the AT instruction emulation overview.
>
> 7. Forward system instruction traps to the virtual EL2 if the corresponding
> virtual AT bit is set in the virtual HCR_EL2.
Looks like points 4-7 make good candidates for individual patches.
>
> [ Much logic above has been reworked by Marc Zyngier ]
>
> Signed-off-by: Jintack Lim <jintack.lim at linaro.org>
> Signed-off-by: Marc Zyngier <maz at kernel.org>
> Signed-off-by: Christoffer Dall <christoffer.dall at arm.com>
> ---
> arch/arm64/include/asm/kvm_arm.h | 2 +
> arch/arm64/include/asm/kvm_asm.h | 2 +
> arch/arm64/include/asm/sysreg.h | 17 +++
> arch/arm64/kvm/Makefile | 2 +-
> arch/arm64/kvm/at.c | 219 +++++++++++++++++++++++++++++
> arch/arm64/kvm/hyp/vhe/switch.c | 13 +-
> arch/arm64/kvm/sys_regs.c | 229 ++++++++++++++++++++++++++++++-
> 7 files changed, 478 insertions(+), 6 deletions(-)
> create mode 100644 arch/arm64/kvm/at.c
>
> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
> index 3675879b53c6..aa3bdce1b166 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -20,6 +20,7 @@
> #define HCR_AMVOFFEN (UL(1) << 51)
> #define HCR_FIEN (UL(1) << 47)
> #define HCR_FWB (UL(1) << 46)
> +#define HCR_AT (UL(1) << 44)
> #define HCR_NV1 (UL(1) << 43)
> #define HCR_NV (UL(1) << 42)
> #define HCR_API (UL(1) << 41)
> @@ -118,6 +119,7 @@
> #define VTCR_EL2_TG0_16K TCR_TG0_16K
> #define VTCR_EL2_TG0_64K TCR_TG0_64K
> #define VTCR_EL2_SH0_MASK TCR_SH0_MASK
> +#define VTCR_EL2_SH0_SHIFT TCR_SH0_SHIFT
> #define VTCR_EL2_SH0_INNER TCR_SH0_INNER
> #define VTCR_EL2_ORGN0_MASK TCR_ORGN0_MASK
> #define VTCR_EL2_ORGN0_WBWA TCR_ORGN0_WBWA
> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
> index d5b0386ef765..e22861ece3c3 100644
> --- a/arch/arm64/include/asm/kvm_asm.h
> +++ b/arch/arm64/include/asm/kvm_asm.h
> @@ -208,6 +208,8 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
> extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
>
> extern void __kvm_timer_set_cntvoff(u64 cntvoff);
> +extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
> +extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
>
> extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
>
> diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
> index 61c990e5591a..ea17d1eabfc5 100644
> --- a/arch/arm64/include/asm/sysreg.h
> +++ b/arch/arm64/include/asm/sysreg.h
> @@ -658,6 +658,23 @@
>
> #define SYS_SP_EL2 sys_reg(3, 6, 4, 1, 0)
>
> +/* AT instructions */
> +#define AT_Op0 1
> +#define AT_CRn 7
> +
> +#define OP_AT_S1E1R sys_insn(AT_Op0, 0, AT_CRn, 8, 0)
> +#define OP_AT_S1E1W sys_insn(AT_Op0, 0, AT_CRn, 8, 1)
> +#define OP_AT_S1E0R sys_insn(AT_Op0, 0, AT_CRn, 8, 2)
> +#define OP_AT_S1E0W sys_insn(AT_Op0, 0, AT_CRn, 8, 3)
> +#define OP_AT_S1E1RP sys_insn(AT_Op0, 0, AT_CRn, 9, 0)
> +#define OP_AT_S1E1WP sys_insn(AT_Op0, 0, AT_CRn, 9, 1)
> +#define OP_AT_S1E2R sys_insn(AT_Op0, 4, AT_CRn, 8, 0)
> +#define OP_AT_S1E2W sys_insn(AT_Op0, 4, AT_CRn, 8, 1)
> +#define OP_AT_S12E1R sys_insn(AT_Op0, 4, AT_CRn, 8, 4)
> +#define OP_AT_S12E1W sys_insn(AT_Op0, 4, AT_CRn, 8, 5)
> +#define OP_AT_S12E0R sys_insn(AT_Op0, 4, AT_CRn, 8, 6)
> +#define OP_AT_S12E0W sys_insn(AT_Op0, 4, AT_CRn, 8, 7)
> +
> /* Common SCTLR_ELx flags. */
> #define SCTLR_ELx_DSSBS (BIT(44))
> #define SCTLR_ELx_ATA (BIT(43))
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index dbaf42ff65f1..b800dcbd157f 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -14,7 +14,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
> inject_fault.o va_layout.o handle_exit.o \
> guest.o debug.o reset.o sys_regs.o \
> vgic-sys-reg-v3.o fpsimd.o pmu.o pkvm.o \
> - arch_timer.o trng.o emulate-nested.o nested.o \
> + arch_timer.o trng.o emulate-nested.o nested.o at.o \
> vgic/vgic.o vgic/vgic-init.o \
> vgic/vgic-irqfd.o vgic/vgic-v2.o \
> vgic/vgic-v3.o vgic/vgic-v4.o \
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> new file mode 100644
> index 000000000000..574c664e984b
> --- /dev/null
> +++ b/arch/arm64/kvm/at.c
> @@ -0,0 +1,219 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2017 - Linaro Ltd
> + * Author: Jintack Lim <jintack.lim at linaro.org>
> + */
> +
> +#include <asm/kvm_hyp.h>
> +#include <asm/kvm_mmu.h>
> +
> +struct mmu_config {
> + u64 ttbr0;
> + u64 ttbr1;
> + u64 tcr;
> + u64 sctlr;
> + u64 vttbr;
> + u64 vtcr;
> + u64 hcr;
> +};
> +
> +static void __mmu_config_save(struct mmu_config *config)
> +{
> + config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
> + config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
> + config->tcr = read_sysreg_el1(SYS_TCR);
> + config->sctlr = read_sysreg_el1(SYS_SCTLR);
> + config->vttbr = read_sysreg(vttbr_el2);
> + config->vtcr = read_sysreg(vtcr_el2);
KVM saves VTCR_EL2, but the register is never changed between
__mmu_config_{save,restore} sequences. Another comment about this below.
> + config->hcr = read_sysreg(hcr_el2);
> +}
> +
> +static void __mmu_config_restore(struct mmu_config *config)
> +{
> + write_sysreg_el1(config->ttbr0, SYS_TTBR0);
> + write_sysreg_el1(config->ttbr1, SYS_TTBR1);
> + write_sysreg_el1(config->tcr, SYS_TCR);
> + write_sysreg_el1(config->sctlr, SYS_SCTLR);
> + write_sysreg(config->vttbr, vttbr_el2);
> + write_sysreg(config->vtcr, vtcr_el2);
> + write_sysreg(config->hcr, hcr_el2);
> +
> + isb();
> +}
> +
> +void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> +{
> + struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
> + struct mmu_config config;
> + struct kvm_s2_mmu *mmu;
> +
> + spin_lock(&vcpu->kvm->mmu_lock);
> +
> + /*
> + * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
> + * the right one (as we trapped from vEL2).
> + */
> + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))
> + goto skip_mmu_switch;
> +
> + /*
> + * FIXME: Obtaining the S2 MMU for a guest guest is horribly
> + * racy, and we may not find it (evicted by another vcpu, for
> + * example).
> + */
I think the "horribly racy" part deserves some elaboration. As far as I can
tell, get_s2_mmu_nested() and lookup_s2_mmu() are always called with
kvm->mmu_lock held.
I suppose "evicted by another vcpu" means that get_s2_mmu_nested() decided
to reuse the MMU that KVM needs, in which case it's impossible to execute
the AT instruction, as there is no shadow stage 2 for the context.
I wonder if that's something that KVM should try to avoid. One obvious
solution would be never to reuse MMUs, but that comes at the cost of
allowing an L1 guest to eat up the L0 host's memory with kvm_s2_mmu structs
by creating many L2 guests.
Another solution would be to have a software stage 1 translation table
walker which populates the shadow S2 during the walk, if and only if the
IPA is present in the virtual stage 2 with the right permissions.
What do you think?
> + mmu = lookup_s2_mmu(vcpu->kvm,
> + vcpu_read_sys_reg(vcpu, VTTBR_EL2),
> + vcpu_read_sys_reg(vcpu, HCR_EL2));
> +
> + if (WARN_ON(!mmu))
> + goto out;
If I'm not mistaken, in this case, guest PAR_EL1 is left untouched by KVM.
Shouldn't KVM set PAR_EL1 to SYS_PAR_EL1_F so the L1 guest doesn't treat
the stale PAR_EL1 value as valid?
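Something like this, perhaps (only a sketch; SYS_PAR_EL1_F is the
PAR_EL1.F bit already defined in sysreg.h):

	if (WARN_ON(!mmu)) {
		/* Report the translation as aborted instead of
		 * leaving whatever was in PAR_EL1 behind. */
		ctxt_sys_reg(ctxt, PAR_EL1) = SYS_PAR_EL1_F;
		goto out;
	}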
> +
> + /* We've trapped, so everything is live on the CPU. */
> + __mmu_config_save(&config);
> +
> + write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL1), SYS_TTBR0);
> + write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR1_EL1), SYS_TTBR1);
> + write_sysreg_el1(ctxt_sys_reg(ctxt, TCR_EL1), SYS_TCR);
> + write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR_EL1), SYS_SCTLR);
> + write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);
> + /*
> + * REVISIT: do we need anything from the guest's VTCR_EL2? If
> + * looks like keeping the hosts configuration is the right
> + * thing to do at this stage (and we could avoid save/restore
> + * it. Keep the host's version for now.
> + */
I also don't think it's necessary to load the L1 guest's VTCR_EL2 register.
The register controls virtual stage 2, which is never used because KVM will
always use the shadow stage 2.
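If both of the above hold, the vtcr field could be dropped from
mmu_config altogether, something like this (a sketch, assuming nothing
else writes VTCR_EL2 between save and restore):

	struct mmu_config {
		u64	ttbr0;
		u64	ttbr1;
		u64	tcr;
		u64	sctlr;
		u64	vttbr;
		u64	hcr;
	};

with __mmu_config_{save,restore} no longer touching vtcr_el2.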
> + write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2);
> +
> + isb();
> +
> +skip_mmu_switch:
> +
> + switch (op) {
> + case OP_AT_S1E1R:
> + case OP_AT_S1E1RP:
> + asm volatile("at s1e1r, %0" : : "r" (vaddr));
> + break;
> + case OP_AT_S1E1W:
> + case OP_AT_S1E1WP:
> + asm volatile("at s1e1w, %0" : : "r" (vaddr));
> + break;
> + case OP_AT_S1E0R:
> + asm volatile("at s1e0r, %0" : : "r" (vaddr));
> + break;
> + case OP_AT_S1E0W:
> + asm volatile("at s1e0w, %0" : : "r" (vaddr));
> + break;
> + default:
> + WARN_ON_ONCE(1);
> + break;
> + }
> +
> + isb();
> +
> + ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg(par_el1);
> +
> + /*
> + * Failed? let's leave the building now.
> + *
> + * FIXME: how about a failed translation because the shadow S2
> + * wasn't populated? We may need to perform a SW PTW,
> + * populating our shadow S2 and retry the instruction.
> + */
> + if (ctxt_sys_reg(ctxt, PAR_EL1) & 1)
> + goto nopan;
> +
> + /* No PAN? No problem. */
> + if (!(*vcpu_cpsr(vcpu) & PSR_PAN_BIT))
> + goto nopan;
> +
> + /*
> + * For PAN-involved AT operations, perform the same
> + * translation, using EL0 this time.
> + */
The description for FEAT_PAN is:
"When the value of this PAN state bit is 1, any privileged data access from
EL1, or EL2 when HCR_EL2.E2H is 1, to a virtual memory address that is
accessible to data accesses at EL0, generates a Permission fault."
I assume KVM executes the AT to make sure there is a valid translation
for the guest virtual address, right?
> + switch (op) {
> + case OP_AT_S1E1RP:
> + asm volatile("at s1e0r, %0" : : "r" (vaddr));
> + break;
> + case OP_AT_S1E1WP:
> + asm volatile("at s1e0w, %0" : : "r" (vaddr));
> + break;
> + default:
> + goto nopan;
> + }
> +
> + /*
> + * If the EL0 translation has succeeded, we need to pretend
> + * the AT operation has failed, as the PAN setting forbids
> + * such a translation.
Hmm... according to the description of FEAT_PAN, the AT translation fails
because of PAN=1 when CurrentEL=EL2 && HCR_EL2.E2H=1. So if the VCPU is at
virtual EL2 and virtual HCR_EL2.E2H=0, then it is allowed to succeed.
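If that reading is correct, maybe the PAN sequence wants a guard along
these lines (a sketch, assuming the vcpu_is_el2() helper from earlier in
the series):

	/* PSTATE.PAN has no effect at EL2 when HCR_EL2.E2H == 0 */
	if (vcpu_is_el2(vcpu) && !vcpu_el2_e2h_is_set(vcpu))
		goto nopan;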
Thanks,
Alex
> + *
> + * FIXME: we hardcode a Level-3 permission fault. We really
> + * should return the real fault level.
> + */
> + if (!(read_sysreg(par_el1) & 1))
> + ctxt_sys_reg(ctxt, PAR_EL1) = 0x1f;
> +
> +nopan:
> + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
> + __mmu_config_restore(&config);
> +
> +out:
> + spin_unlock(&vcpu->kvm->mmu_lock);
> +}
> +
> +void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> +{
> + struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
> + struct mmu_config config;
> + struct kvm_s2_mmu *mmu;
> + u64 val;
> +
> + spin_lock(&vcpu->kvm->mmu_lock);
> +
> + mmu = &vcpu->kvm->arch.mmu;
> +
> + /* We've trapped, so everything is live on the CPU. */
> + __mmu_config_save(&config);
> +
> + if (vcpu_el2_e2h_is_set(vcpu)) {
> + write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL2), SYS_TTBR0);
> + write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR1_EL2), SYS_TTBR1);
> + write_sysreg_el1(ctxt_sys_reg(ctxt, TCR_EL2), SYS_TCR);
> + write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR_EL2), SYS_SCTLR);
> +
> + val = config.hcr;
> + } else {
> + write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL2), SYS_TTBR0);
> + val = translate_tcr_el2_to_tcr_el1(ctxt_sys_reg(ctxt, TCR_EL2));
> + write_sysreg_el1(val, SYS_TCR);
> + val = translate_sctlr_el2_to_sctlr_el1(ctxt_sys_reg(ctxt, SCTLR_EL2));
> + write_sysreg_el1(val, SYS_SCTLR);
> +
> + val = config.hcr | HCR_NV | HCR_NV1;
> + }
> +
> + write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);
> + /* FIXME: write S2 MMU VTCR_EL2? */
> + write_sysreg((val & ~HCR_TGE) | HCR_VM, hcr_el2);
> +
> + isb();
> +
> + switch (op) {
> + case OP_AT_S1E2R:
> + asm volatile("at s1e1r, %0" : : "r" (vaddr));
> + break;
> + case OP_AT_S1E2W:
> + asm volatile("at s1e1w, %0" : : "r" (vaddr));
> + break;
> + default:
> + WARN_ON_ONCE(1);
> + break;
> + }
> +
> + isb();
> +
> + /* FIXME: handle failed translation due to shadow S2 */
> + ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg(par_el1);
> +
> + __mmu_config_restore(&config);
> + spin_unlock(&vcpu->kvm->mmu_lock);
> +}
> diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
> index 28845f907cfc..b7790d3c4122 100644
> --- a/arch/arm64/kvm/hyp/vhe/switch.c
> +++ b/arch/arm64/kvm/hyp/vhe/switch.c
> @@ -41,9 +41,10 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
> if (!vcpu_el2_e2h_is_set(vcpu)) {
> /*
> * For a guest hypervisor on v8.0, trap and emulate
> - * the EL1 virtual memory control register accesses.
> + * the EL1 virtual memory control register accesses
> + * as well as the AT S1 operations.
> */
> - hcr |= HCR_TVM | HCR_TRVM | HCR_NV1;
> + hcr |= HCR_TVM | HCR_TRVM | HCR_AT | HCR_NV1;
> } else {
> /*
> * For a guest hypervisor on v8.1 (VHE), allow to
> @@ -68,6 +69,14 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
> hcr &= ~HCR_TVM;
>
> hcr |= vhcr_el2 & (HCR_TVM | HCR_TRVM);
> +
> + /*
> + * If we're using the EL1 translation regime
> + * (TGE clear), then ensure that AT S1 ops are
> + * trapped too.
> + */
> + if (!vcpu_el2_tge_is_set(vcpu))
> + hcr |= HCR_AT;
> }
> }
>
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index f669618f966b..7be57e1b7019 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -1704,7 +1704,6 @@ static bool access_sp_el1(struct kvm_vcpu *vcpu,
> return true;
> }
>
> -
> static bool access_elr(struct kvm_vcpu *vcpu,
> struct sys_reg_params *p,
> const struct sys_reg_desc *r)
> @@ -2236,12 +2235,236 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> EL2_REG(SP_EL2, NULL, reset_unknown, 0),
> };
>
> -#define SYS_INSN_TO_DESC(insn, access_fn, forward_fn) \
> - { SYS_DESC((insn)), (access_fn), NULL, 0, 0, NULL, NULL, (forward_fn) }
> +static bool handle_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> + const struct sys_reg_desc *r)
> +{
> + int sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
> +
> + if (vcpu_has_nv(vcpu) && forward_traps(vcpu, HCR_AT))
> + return false;
> +
> + __kvm_at_s1e01(vcpu, sys_encoding, p->regval);
> +
> + return true;
> +}
> +
> +static bool handle_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> + const struct sys_reg_desc *r)
> +{
> + int sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
> +
> + if (vcpu_has_nv(vcpu) && forward_nv_traps(vcpu))
> + return false;
> +
> + __kvm_at_s1e2(vcpu, sys_encoding, p->regval);
> +
> + return true;
> +}
> +
> +static u64 setup_par_aborted(u32 esr)
> +{
> + u64 par = 0;
> +
> + /* S [9]: fault in the stage 2 translation */
> + par |= (1 << 9);
> + /* FST [6:1]: Fault status code */
> + par |= (esr << 1);
> + /* F [0]: translation is aborted */
> + par |= 1;
> +
> + return par;
> +}
> +
> +static u64 setup_par_completed(struct kvm_vcpu *vcpu, struct kvm_s2_trans *out)
> +{
> + u64 par, vtcr_sh0;
> +
> + /* F [0]: Translation is completed successfully */
> + par = 0;
> + /* ATTR [63:56] */
> + par |= out->upper_attr;
> + /* PA [47:12] */
> + par |= out->output & GENMASK_ULL(11, 0);
> + /* RES1 [11] */
> + par |= (1UL << 11);
> + /* SH [8:7]: Shareability attribute */
> + vtcr_sh0 = vcpu_read_sys_reg(vcpu, VTCR_EL2) & VTCR_EL2_SH0_MASK;
> + par |= (vtcr_sh0 >> VTCR_EL2_SH0_SHIFT) << 7;
> +
> + return par;
> +}
> +
> +static bool handle_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> + const struct sys_reg_desc *r, bool write)
> +{
> + u64 par, va;
> + u32 esr, op;
> + phys_addr_t ipa;
> + struct kvm_s2_trans out;
> + int ret;
> +
> + if (vcpu_has_nv(vcpu) && forward_nv_traps(vcpu))
> + return false;
> +
> + /* Do the stage-1 translation */
> + va = p->regval;
> + op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
> + switch (op) {
> + case OP_AT_S12E1R:
> + op = OP_AT_S1E1R;
> + break;
> + case OP_AT_S12E1W:
> + op = OP_AT_S1E1W;
> + break;
> + case OP_AT_S12E0R:
> + op = OP_AT_S1E0R;
> + break;
> + case OP_AT_S12E0W:
> + op = OP_AT_S1E0W;
> + break;
> + default:
> + WARN_ON_ONCE(1);
> + return true;
> + }
> +
> + __kvm_at_s1e01(vcpu, op, va);
> + par = vcpu_read_sys_reg(vcpu, PAR_EL1);
> + if (par & 1) {
> + /* The stage-1 translation aborted */
> + return true;
> + }
> +
> + /* Do the stage-2 translation */
> + ipa = (par & GENMASK_ULL(47, 12)) | (va & GENMASK_ULL(11, 0));
> + out.esr = 0;
> + ret = kvm_walk_nested_s2(vcpu, ipa, &out);
> + if (ret < 0)
> + return false;
> +
> + /* Check if the stage-2 PTW is aborted */
> + if (out.esr) {
> + esr = out.esr;
> + goto s2_trans_abort;
> + }
> +
> + /* Check the access permission */
> + if ((!write && !out.readable) || (write && !out.writable)) {
> + esr = ESR_ELx_FSC_PERM;
> + esr |= out.level & 0x3;
> + goto s2_trans_abort;
> + }
> +
> + vcpu_write_sys_reg(vcpu, setup_par_completed(vcpu, &out), PAR_EL1);
> + return true;
> +
> +s2_trans_abort:
> + vcpu_write_sys_reg(vcpu, setup_par_aborted(esr), PAR_EL1);
> + return true;
> +}
> +
> +static bool handle_s12r(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> + const struct sys_reg_desc *r)
> +{
> + return handle_s12(vcpu, p, r, false);
> +}
> +
> +static bool handle_s12w(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> + const struct sys_reg_desc *r)
> +{
> + return handle_s12(vcpu, p, r, true);
> +}
> +
> +/*
> + * AT instruction emulation
> + *
> + * We emulate AT instructions executed in the virtual EL2.
> + * Basic strategy for the stage-1 translation emulation is to load proper
> + * context, which depends on the trapped instruction and the virtual HCR_EL2,
> + * to the EL1 virtual memory control registers and execute S1E[01] instructions
> + * in EL2. See below for more detail.
> + *
> + * For the stage-2 translation, which is necessary for S12E[01] emulation,
> + * we walk the guest hypervisor's stage-2 page table in software.
> + *
> + * The stage-1 translation emulations can be divided into two groups depending
> + * on the translation regime.
> + *
> + * 1. EL2 AT instructions: S1E2x
> + * +-----------------------------------------------------------------------+
> + * | | Setting for the emulation |
> + * | Virtual HCR_EL2.E2H on trap |-----------------------------------------+
> + * | | Phys EL1 regs | Phys NV, NV1 | Phys TGE |
> + * |-----------------------------------------------------------------------|
> + * | 0 | vEL2 | (1, 1) | 0 |
> + * | 1 | vEL2 | (0, 0) | 0 |
> + * +-----------------------------------------------------------------------+
> + *
> + * We emulate the EL2 AT instructions by loading virtual EL2 context
> + * to the EL1 virtual memory control registers and executing corresponding
> + * EL1 AT instructions.
> + *
> + * We set physical NV and NV1 bits to use EL2 page table format for non-VHE
> + * guest hypervisor (i.e. HCR_EL2.E2H == 0). As a VHE guest hypervisor uses the
> + * EL1 page table format, we don't set those bits.
> + *
> + * We should clear physical TGE bit not to use the EL2 translation regime when
> + * the host uses the VHE feature.
> + *
> + *
> + * 2. EL0/EL1 AT instructions: S1E[01]x, S12E1x
> + * +----------------------------------------------------------------------+
> + * | Virtual HCR_EL2 on trap | Setting for the emulation |
> + * |----------------------------------------------------------------------+
> + * | (vE2H, vTGE) | (vNV, vNV1) | Phys EL1 regs | Phys NV, NV1 | Phys TGE |
> + * |----------------------------------------------------------------------|
> + * | (0, 0)* | (0, 0) | vEL1 | (0, 0) | 0 |
> + * | (0, 0) | (1, 1) | vEL1 | (1, 1) | 0 |
> + * | (1, 1) | (0, 0) | vEL2 | (0, 0) | 0 |
> + * | (1, 1) | (1, 1) | vEL2 | (1, 1) | 0 |
> + * +----------------------------------------------------------------------+
> + *
> + * *For (0, 0) in the 'Virtual HCR_EL2 on trap' column, it actually means
> + * (1, 1). Keep them (0, 0) just for the readability.
> + *
> + * We set physical EL1 virtual memory control registers depending on
> + * (vE2H, vTGE) pair. When the pair is (0, 0) where AT instructions are
> + * supposed to use EL0/EL1 translation regime, we load the EL1 registers with
> + * the virtual EL1 registers (i.e. EL1 registers from the guest hypervisor's
> + * point of view). When the pair is (1, 1), however, AT instructions are defined
> + * to apply EL2 translation regime. To emulate this behavior, we load the EL1
> + * registers with the virtual EL2 context. (i.e the shadow registers)
> + *
> + * We respect the virtual NV and NV1 bit for the emulation. When those bits are
> + * set, it means that a guest hypervisor would like to use EL2 page table format
> + * for the EL1 translation regime. We emulate this by setting the physical
> + * NV and NV1 bits.
> + */
> +
> +#define SYS_INSN(insn, access_fn) \
> + { \
> + SYS_DESC(OP_##insn), \
> + .access = (access_fn), \
> + }
> +
> static struct sys_reg_desc sys_insn_descs[] = {
> { SYS_DESC(SYS_DC_ISW), access_dcsw },
> +
> + SYS_INSN(AT_S1E1R, handle_s1e01),
> + SYS_INSN(AT_S1E1W, handle_s1e01),
> + SYS_INSN(AT_S1E0R, handle_s1e01),
> + SYS_INSN(AT_S1E0W, handle_s1e01),
> + SYS_INSN(AT_S1E1RP, handle_s1e01),
> + SYS_INSN(AT_S1E1WP, handle_s1e01),
> +
> { SYS_DESC(SYS_DC_CSW), access_dcsw },
> { SYS_DESC(SYS_DC_CISW), access_dcsw },
> +
> + SYS_INSN(AT_S1E2R, handle_s1e2),
> + SYS_INSN(AT_S1E2W, handle_s1e2),
> + SYS_INSN(AT_S12E1R, handle_s12r),
> + SYS_INSN(AT_S12E1W, handle_s12w),
> + SYS_INSN(AT_S12E0R, handle_s12r),
> + SYS_INSN(AT_S12E0W, handle_s12w),
> };
>
> static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
> --
> 2.30.2
>