[PATCH 10/12] KVM: arm64: nv: Add SW walker for AT S1 emulation

Alexandru Elisei alexandru.elisei at arm.com
Thu Jul 18 08:16:19 PDT 2024


Hi,

Managed to have a look at the AT handling for stage 1; I've been comparing it
with AArch64.AT().

On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and share similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged in the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> 
> Signed-off-by: Marc Zyngier <maz at kernel.org>
> ---
>  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 520 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 71e3390b43b4c..8452273cbff6d 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -4,9 +4,305 @@
>   * Author: Jintack Lim <jintack.lim at linaro.org>
>   */
>  
> +#include <linux/kvm_host.h>
> +
> +#include <asm/esr.h>
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
>  
> +struct s1_walk_info {
> +	u64	     baddr;
> +	unsigned int max_oa_bits;
> +	unsigned int pgshift;
> +	unsigned int txsz;
> +	int 	     sl;
> +	bool	     hpd;
> +	bool	     be;
> +	bool	     nvhe;
> +	bool	     s2;
> +};
> +
> +struct s1_walk_result {
> +	union {
> +		struct {
> +			u64	desc;
> +			u64	pa;
> +			s8	level;
> +			u8	APTable;
> +			bool	UXNTable;
> +			bool	PXNTable;
> +		};
> +		struct {
> +			u8	fst;
> +			bool	ptw;
> +			bool	s2;
> +		};
> +	};
> +	bool	failed;
> +};
> +
> +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> +{
> +	wr->fst		= fst;
> +	wr->ptw		= ptw;
> +	wr->s2		= s2;
> +	wr->failed	= true;
> +}
> +
> +#define S1_MMU_DISABLED		(-127)
> +
> +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> +			 struct s1_walk_result *wr, const u64 va, const int el)
> +{
> +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> +	unsigned int stride, x;
> +	bool va55, tbi;
> +
> +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> +
> +	va55 = va & BIT(55);
> +
> +	if (wi->nvhe && va55)
> +		goto addrsz;
> +
> +	wi->s2 = el < 2 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
> +
> +	switch (el) {
> +	case 1:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
> +		break;
> +	case 2:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	/* Let's put the MMU disabled case aside immediately */
> +	if (!(sctlr & SCTLR_ELx_M) ||
> +	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> +		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
> +			goto addrsz;
> +
> +		wr->level = S1_MMU_DISABLED;
> +		wr->desc = va;
> +		return 0;
> +	}
> +
> +	wi->be = sctlr & SCTLR_ELx_EE;
> +
> +	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
> +	wi->hpd &= (wi->nvhe ?
> +		    FIELD_GET(TCR_EL2_HPD, tcr) :
> +		    (va55 ?
> +		     FIELD_GET(TCR_HPD1, tcr) :
> +		     FIELD_GET(TCR_HPD0, tcr)));
> +
> +	tbi = (wi->nvhe ?
> +	       FIELD_GET(TCR_EL2_TBI, tcr) :
> +	       (va55 ?
> +		FIELD_GET(TCR_TBI1, tcr) :
> +		FIELD_GET(TCR_TBI0, tcr)));
> +
> +	if (!tbi && sign_extend64(va, 55) != (s64)va)
> +		goto addrsz;
> +
> +	/* Someone was silly enough to encode TG0/TG1 differently */
> +	if (va55) {
> +		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG1_MASK, tcr);
> +
> +		switch (tg << TCR_TG1_SHIFT) {
> +		case TCR_TG1_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG1_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG1_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	} else {
> +		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG0_MASK, tcr);
> +
> +		switch (tg << TCR_TG0_SHIFT) {
> +		case TCR_TG0_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG0_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG0_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	}
> +
> +	ia_bits = 64 - wi->txsz;

get_ia_size()?

> +
> +	/* AArch64.S1StartLevel() */
> +	stride = wi->pgshift - 3;
> +	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
> +
> +	/* Check for SL mandating LPA2 (which we don't support yet) */
> +	switch (BIT(wi->pgshift)) {
> +	case SZ_4K:
> +		if (wi->sl == -1 &&
> +		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
> +			goto addrsz;
> +		break;
> +	case SZ_16K:
> +		if (wi->sl == 0 &&
> +		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
> +			goto addrsz;
> +		break;
> +	}
> +
> +	ps = (wi->nvhe ?
> +	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
> +
> +	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
> +
> +	/* Compute minimal alignment */
> +	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
> +
> +	wi->baddr = ttbr & TTBRx_EL1_BADDR;
> +	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
> +
> +	return 0;
> +
> +addrsz:	/* Address Size Fault level 0 */
> +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ, false, false);
> +
> +	return -EFAULT;
> +}

The function seems to be missing checks for:

- valid TxSZ
- VA is not larger than the maximum input size, as defined by TxSZ
- EPD{0,1}
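
Roughly, something like the below (completely untested, assuming get_ia_size()
is usable here and reusing the existing TCR_EPD{0,1}_MASK definitions; a level 0
translation fault would also need its own exit label):

	/* TxSZ limits, ignoring FEAT_LVA/FEAT_TTST for now */
	if (wi->txsz > 39 || wi->txsz < 16)
		goto transfault_l0;

	ia_bits = get_ia_size(wi);

	/*
	 * VA outside the input address range defined by TxSZ
	 * (with TBI set, the tag bits would need to be ignored first).
	 */
	if ((!va55 && va > GENMASK_ULL(ia_bits - 1, 0)) ||
	    (va55 && va < GENMASK_ULL(63, ia_bits)))
		goto transfault_l0;

	/* EPD{0,1}: walks from this TTBR are disabled */
	if (!wi->nvhe &&
	    ((va55 && (tcr & TCR_EPD1_MASK)) ||
	     (!va55 && (tcr & TCR_EPD0_MASK))))
		goto transfault_l0;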

> +
> +static int get_ia_size(struct s1_walk_info *wi)
> +{
> +	return 64 - wi->txsz;
> +}

This looks a lot like get_ia_size() from nested.c.

> +
> +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> +		   struct s1_walk_result *wr, u64 va)
> +{
> +	u64 va_top, va_bottom, baddr, desc;
> +	int level, stride, ret;
> +
> +	level = wi->sl;
> +	stride = wi->pgshift - 3;
> +	baddr = wi->baddr;

AArch64.S1Walk() also checks that baddr is not larger than the OA size.
check_output_size() from nested.c looks almost like what you want here.
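
Something like this before starting the walk, perhaps (untested, reusing
wi->max_oa_bits instead of calling into nested.c):

	/* Address size fault if the table base is outside the OA range */
	if (wi->max_oa_bits < 48 &&
	    (baddr & GENMASK_ULL(47, wi->max_oa_bits))) {
		fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ, false, false);
		return -EINVAL;
	}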

> +
> +	va_top = get_ia_size(wi) - 1;
> +
> +	while (1) {
> +		u64 index, ipa;
> +
> +		va_bottom = (3 - level) * stride + wi->pgshift;
> +		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
> +
> +		ipa = baddr | index;
> +
> +		if (wi->s2) {
> +			struct kvm_s2_trans s2_trans = {};
> +
> +			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
> +			if (ret) {
> +				fail_s1_walk(wr,
> +					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
> +					     true, true);
> +				return ret;
> +			}
> +
> +			if (!kvm_s2_trans_readable(&s2_trans)) {
> +				fail_s1_walk(wr, ESR_ELx_FSC_PERM | level,
> +					     true, true);
> +
> +				return -EPERM;
> +			}
> +
> +			ipa = kvm_s2_trans_output(&s2_trans);
> +		}
> +
> +		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
> +		if (ret) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
> +				     true, false);
> +			return ret;
> +		}
> +
> +		if (wi->be)
> +			desc = be64_to_cpu((__force __be64)desc);
> +		else
> +			desc = le64_to_cpu((__force __le64)desc);
> +
> +		if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
> +				     true, false);
> +			return -ENOENT;
> +		}
> +
> +		/* We found a leaf, handle that */
> +		if ((desc & 3) == 1 || level == 3)
> +			break;
> +
> +		if (!wi->hpd) {
> +			wr->APTable  |= FIELD_GET(PMD_TABLE_AP, desc);
> +			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
> +			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
> +		}
> +
> +		baddr = GENMASK_ULL(47, wi->pgshift);

Where is baddr updated with the value read from the descriptor? Am I missing
something obvious here?
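
Presumably this wants something like (untested):

	baddr = desc & GENMASK_ULL(47, wi->pgshift);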

> +
> +		/* Check for out-of-range OA */
> +		if (wi->max_oa_bits < 48 &&
> +		    (baddr & GENMASK_ULL(47, wi->max_oa_bits))) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ | level,
> +				     true, false);
> +			return -EINVAL;
> +		}

This looks very much like check_output_size() from nested.c.

> +
> +		/* Prepare for next round */
> +		va_top = va_bottom - 1;
> +		level++;
> +	}
> +
> +	/* Block mapping, check the validity of the level */
> +	if (!(desc & BIT(1))) {
> +		bool valid_block = false;
> +
> +		switch (BIT(wi->pgshift)) {
> +		case SZ_4K:
> +			valid_block = level == 1 || level == 2;
> +			break;
> +		case SZ_16K:
> +		case SZ_64K:
> +			valid_block = level == 2;
> +			break;
> +		}
> +
> +		if (!valid_block) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
> +				     true, false);
> +			return -EINVAL;
> +		}
> +	}

Matches AArch64.BlockDescSupported(), with the caveat that the walker currently
doesn't support 52 bit PAs.

> +
> +	wr->failed = false;
> +	wr->level = level;
> +	wr->desc = desc;
> +	wr->pa = desc & GENMASK(47, va_bottom);

No output size check for final PA.
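
I would expect something mirroring the intermediate OA check above, e.g.
(untested):

	/* Address size fault if the final OA is outside the OA range */
	if (wi->max_oa_bits < 48 &&
	    (wr->pa & GENMASK_ULL(47, wi->max_oa_bits))) {
		fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ | level, true, false);
		return -EINVAL;
	}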

> +	if (va_bottom > 12)
> +		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
> +
> +	return 0;
> +}
> +
>  struct mmu_config {
>  	u64	ttbr0;
>  	u64	ttbr1;
> @@ -234,6 +530,177 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
>  	return par;
>  }
>  
> +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr)
> +{
> +	u64 par;
> +
> +	if (wr->failed) {
> +		par = SYS_PAR_EL1_RES1;
> +		par |= SYS_PAR_EL1_F;
> +		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
> +		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
> +		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
> +	} else if (wr->level == S1_MMU_DISABLED) {
> +		/* MMU off or HCR_EL2.DC == 1 */
> +		par = wr->pa & GENMASK_ULL(47, 12);

That's interesting: setup_s1_walk() sets wr->desc = va and leaves wr->pa
unchanged (it's 0 from the initialization in handle_at_slow()).
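
Presumably either compute_par_s1() should use wr->desc for this case, or the
MMU-disabled path should also set wr->pa, e.g. (untested):

		par = wr->desc & GENMASK_ULL(47, 12);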

> +
> +		if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
> +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */
> +		} else {
> +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
> +					  MEMATTR(WbRaWa, WbRaWa));
> +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */
> +		}

This matches AArch64.S1DisabledOutput().

> +	} else {
> +		u64 mair, sctlr;
> +		int el;
> +		u8 sh;
> +
> +		el = (vcpu_el2_e2h_is_set(vcpu) &&
> +		      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> +
> +		mair = ((el == 2) ?
> +			vcpu_read_sys_reg(vcpu, MAIR_EL2) :
> +			vcpu_read_sys_reg(vcpu, MAIR_EL1));
> +
> +		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
> +		mair &= 0xff;
> +
> +		sctlr = ((el == 2) ?
> +			vcpu_read_sys_reg(vcpu, SCTLR_EL2) :
> +			vcpu_read_sys_reg(vcpu, SCTLR_EL1));
> +
> +		/* Force NC for memory if SCTLR_ELx.C is clear */
> +		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
> +			mair = MEMATTR(NC, NC);

This matches the compute memory attributes part of AArch64.S1Translate().

> +
> +		par  = FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
> +		par |= wr->pa & GENMASK_ULL(47, 12);
> +
> +		sh = compute_sh(mair, wr->desc);
> +		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
> +	}
> +
> +	return par;
> +}
> +
> +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> +{
> +	bool perm_fail, ur, uw, ux, pr, pw, pan;
> +	struct s1_walk_result wr = {};
> +	struct s1_walk_info wi = {};
> +	int ret, idx, el;
> +
> +	/*
> +	 * We only get here from guest EL2, so the translation regime
> +	 * AT applies to is solely defined by {E2H,TGE}.
> +	 */
> +	el = (vcpu_el2_e2h_is_set(vcpu) &&
> +	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> +
> +	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr, el);
> +	if (ret)
> +		goto compute_par;
> +
> +	if (wr.level == S1_MMU_DISABLED)
> +		goto compute_par;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +
> +	ret = walk_s1(vcpu, &wi, &wr, vaddr);
> +
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +
> +	if (ret)
> +		goto compute_par;
> +
> +	/* FIXME: revisit when adding indirect permission support */
> +	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3) &&
> +	    !wi.nvhe) {

Just FYI, the 'if' statement fits on one line without going over the old 80
character limit.

> +		u64 sctlr;
> +
> +		if (el == 1)
> +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		else
> +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +
> +		ux = (sctlr & SCTLR_EL1_EPAN) && !(wr.desc & PTE_UXN);

I don't understand this. UnprivExecute is true for the memory location if and
only if **SCTLR_ELx.EPAN** && !UXN?

> +	} else {
> +		ux = false;
> +	}
> +
> +	pw = !(wr.desc & PTE_RDONLY);
> +
> +	if (wi.nvhe) {
> +		ur = uw = false;
> +		pr = true;
> +	} else {
> +		if (wr.desc & PTE_USER) {
> +			ur = pr = true;
> +			uw = pw;
> +		} else {
> +			ur = uw = false;
> +			pr = true;
> +		}
> +	}
> +
> +	/* Apply the Hierarchical Permission madness */
> +	if (wi.nvhe) {
> +		wr.APTable &= BIT(1);
> +		wr.PXNTable = wr.UXNTable;
> +	}
> +
> +	ur &= !(wr.APTable & BIT(0));
> +	uw &= !(wr.APTable != 0);
> +	ux &= !wr.UXNTable;
> +
> +	pw &= !(wr.APTable & BIT(1));

Would it make sense here to compute the resulting permissions like in
AArch64.S1DirectBasePermissions()? I.e., look at the AP bits first, have
a switch statement for all 4 values (which also makes it very easy to
cross-reference with Table D8-60), then apply the hierarchical
permissions/PAN/EPAN. I do admit that I have a very selfish reason to propose
this - it makes reviewing easier.

> +
> +	pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
> +
> +	perm_fail = false;
> +
> +	switch (op) {
> +	case OP_AT_S1E1RP:
> +		perm_fail |= pan && (ur || uw || ux);

I had a very hard time understanding what the code is trying to do here.  How
about rewriting it to something like the pseudocode below:

  // ux = !(desc and UXN) and !UXNTable
  perm_fail |= pan && (ur || uw || ((sctlr & SCTLR_EL1_EPAN) && ux));

... which maps more closely to AArch64.S1DirectBasePermissions().

Thanks,
Alex

> +		fallthrough;
> +	case OP_AT_S1E1R:
> +	case OP_AT_S1E2R:
> +		perm_fail |= !pr;
> +		break;
> +	case OP_AT_S1E1WP:
> +		perm_fail |= pan && (ur || uw || ux);
> +		fallthrough;
> +	case OP_AT_S1E1W:
> +	case OP_AT_S1E2W:
> +		perm_fail |= !pw;
> +		break;
> +	case OP_AT_S1E0R:
> +		perm_fail |= !ur;
> +		break;
> +	case OP_AT_S1E0W:
> +		perm_fail |= !uw;
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	if (perm_fail) {
> +		struct s1_walk_result tmp;
> +
> +		tmp.failed = true;
> +		tmp.fst = ESR_ELx_FSC_PERM | wr.level;
> +		tmp.s2 = false;
> +		tmp.ptw = false;
> +
> +		wr = tmp;
> +	}
> +
> +compute_par:
> +	return compute_par_s1(vcpu, &wr);
> +}
> [..]


