[RFC PATCH v4 36/39] KVM: arm64: Implement userspace API to stop a VCPU profiling

Alexandru Elisei alexandru.elisei at arm.com
Wed Aug 25 09:18:12 PDT 2021


When userspace requests that a VCPU is not allowed to profile anymore via the
KVM_ARM_VCPU_SPE_STOP attribute, keep all the register state in memory and
trap all registers, not just the buffer registers, and don't copy any of
this shadow state on the hardware.

Signed-off-by: Alexandru Elisei <alexandru.elisei at arm.com>
---
 arch/arm64/include/asm/kvm_hyp.h   |  2 +
 arch/arm64/include/asm/kvm_spe.h   | 14 +++++++
 arch/arm64/include/uapi/asm/kvm.h  |  3 ++
 arch/arm64/kvm/arm.c               |  9 ++++
 arch/arm64/kvm/debug.c             | 13 ++++--
 arch/arm64/kvm/hyp/nvhe/debug-sr.c |  4 +-
 arch/arm64/kvm/hyp/nvhe/spe-sr.c   | 24 +++++++++++
 arch/arm64/kvm/hyp/vhe/spe-sr.c    | 56 +++++++++++++++++++++++++
 arch/arm64/kvm/spe.c               | 67 ++++++++++++++++++++++++++++++
 arch/arm64/kvm/sys_regs.c          |  2 +-
 10 files changed, 188 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 03bc51049996..ce365427b483 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -86,8 +86,10 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu);
 #ifdef __KVM_NVHE_HYPERVISOR__
 void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu,
 				    struct kvm_cpu_context *host_ctxt);
+void __debug_save_spe(u64 *pmscr_el1);
 void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu,
 				       struct kvm_cpu_context *host_ctxt);
+void __debug_restore_spe(u64 pmscr_el1);
 #ifdef CONFIG_KVM_ARM_SPE
 void __spe_save_host_state_nvhe(struct kvm_vcpu *vcpu,
 				struct kvm_cpu_context *host_ctxt);
diff --git a/arch/arm64/include/asm/kvm_spe.h b/arch/arm64/include/asm/kvm_spe.h
index d7d7b9e243de..f51561e3b43f 100644
--- a/arch/arm64/include/asm/kvm_spe.h
+++ b/arch/arm64/include/asm/kvm_spe.h
@@ -16,13 +16,23 @@ static __always_inline bool kvm_supports_spe(void)
 	return static_branch_likely(&kvm_spe_available);
 }
 
+/* Guest profiling disabled by the user. */
+#define KVM_VCPU_SPE_STOP_USER		(1 << 0)
+/* Stop profiling and exit to userspace when guest starts profiling. */
+#define KVM_VCPU_SPE_STOP_USER_EXIT	(1 << 1)
+
 struct kvm_vcpu_spe {
 	bool initialized;	/* SPE initialized for the VCPU */
 	int irq_num;		/* Buffer management interrut number */
 	bool irq_level;		/* 'true' if the interrupt is asserted at the VGIC */
 	bool hwirq_level;	/* 'true' if the SPE hardware is asserting the interrupt */
+	u64 flags;
 };
 
+#define kvm_spe_profiling_stopped(vcpu)					\
+	(((vcpu)->arch.spe.flags & KVM_VCPU_SPE_STOP_USER) ||		\
+	 ((vcpu)->arch.spe.flags & KVM_VCPU_SPE_STOP_USER_EXIT))	\
+
 struct kvm_spe {
 	bool perfmon_capable;	/* Is the VM perfmon_capable()? */
 };
@@ -31,6 +41,7 @@ void kvm_spe_init_supported_cpus(void);
 void kvm_spe_vm_init(struct kvm *kvm);
 int kvm_spe_vcpu_first_run_init(struct kvm_vcpu *vcpu);
 void kvm_spe_sync_hwstate(struct kvm_vcpu *vcpu);
+bool kvm_spe_exit_to_user(struct kvm_vcpu *vcpu);
 
 void kvm_spe_write_sysreg(struct kvm_vcpu *vcpu, int reg, u64 val);
 u64 kvm_spe_read_sysreg(struct kvm_vcpu *vcpu, int reg);
@@ -48,6 +59,8 @@ int kvm_spe_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
 struct kvm_vcpu_spe {
 };
 
+#define kvm_spe_profiling_stopped(vcpu)		(false)
+
 struct kvm_spe {
 };
 
@@ -55,6 +68,7 @@ static inline void kvm_spe_init_supported_cpus(void) {}
 static inline void kvm_spe_vm_init(struct kvm *kvm) {}
 static inline int kvm_spe_vcpu_first_run_init(struct kvm_vcpu *vcpu) { return -ENOEXEC; }
 static inline void kvm_spe_sync_hwstate(struct kvm_vcpu *vcpu) {}
+static inline bool kvm_spe_exit_to_user(struct kvm_vcpu *vcpu) { return false; }
 
 static inline void kvm_spe_write_sysreg(struct kvm_vcpu *vcpu, int reg, u64 val) {}
 static inline u64 kvm_spe_read_sysreg(struct kvm_vcpu *vcpu, int reg) { return 0; }
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 75a5113f610e..63599ee39a7b 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -376,6 +376,9 @@ struct kvm_arm_copy_mte_tags {
 #define     KVM_ARM_VCPU_SPE_STOP_EXIT		(1 << 1)
 #define     KVM_ARM_VCPU_SPE_RESUME		(1 << 2)
 
+/* run->fail_entry.hardware_entry_failure_reason codes. */
+#define KVM_EXIT_FAIL_ENTRY_SPE		(1 << 0)
+
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_VCPU2_SHIFT		28
 #define KVM_ARM_IRQ_VCPU2_MASK		0xf
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index ec449bc5f811..b7aae25bb9da 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -873,6 +873,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 			continue;
 		}
 
+		if (unlikely(kvm_spe_exit_to_user(vcpu))) {
+			run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+			run->fail_entry.hardware_entry_failure_reason
+				= KVM_EXIT_FAIL_ENTRY_SPE;
+			ret = -EAGAIN;
+			preempt_enable();
+			continue;
+		}
+
 		kvm_pmu_flush_hwstate(vcpu);
 
 		local_irq_disable();
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 6e5fc1887215..6a4277a23bbb 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -96,11 +96,18 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
 	if (kvm_supports_spe() && kvm_vcpu_has_spe(vcpu)) {
 		/*
 		 * Use EL1&0 for the profiling buffer translation regime and
-		 * trap accesses to the buffer control registers; leave
-		 * MDCR_EL2.TPMS unset and do not trap accesses to the profiling
-		 * control registers.
+		 * trap accesses to the buffer control registers; if profiling
+		 * is stopped, also set MSCR_EL2.TMPS to trap accesses to the
+		 * rest of the registers, otherwise leave it clear.
+		 *
+		 * Leaving MDCR_EL2.E2P unset, like we do when the VCPU does not
+		 * have SPE, means that the PMBIDR_EL1.P (which KVM does not
+		 * trap) will be set and the guest will detect SPE as being
+		 * unavailable.
 		 */
 		vcpu->arch.mdcr_el2 |= MDCR_EL2_E2PB_EL1_TRAP << MDCR_EL2_E2PB_SHIFT;
+		if (kvm_spe_profiling_stopped(vcpu))
+			vcpu->arch.mdcr_el2 |= MDCR_EL2_TPMS;
 	} else {
 		/*
 		 * Trap accesses to the profiling control registers; leave
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index 1622615954b2..944972de0944 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -14,7 +14,7 @@
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 
-static void __debug_save_spe(u64 *pmscr_el1)
+void __debug_save_spe(u64 *pmscr_el1)
 {
 	u64 reg;
 
@@ -40,7 +40,7 @@ static void __debug_save_spe(u64 *pmscr_el1)
 	dsb(nsh);
 }
 
-static void __debug_restore_spe(u64 pmscr_el1)
+void __debug_restore_spe(u64 pmscr_el1)
 {
 	if (!pmscr_el1)
 		return;
diff --git a/arch/arm64/kvm/hyp/nvhe/spe-sr.c b/arch/arm64/kvm/hyp/nvhe/spe-sr.c
index b74131486a75..8ed03aa4f965 100644
--- a/arch/arm64/kvm/hyp/nvhe/spe-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/spe-sr.c
@@ -23,6 +23,11 @@ void __spe_save_host_state_nvhe(struct kvm_vcpu *vcpu,
 {
 	u64 pmblimitr;
 
+	if (kvm_spe_profiling_stopped(vcpu)) {
+		__debug_save_spe(__ctxt_sys_reg(host_ctxt, PMSCR_EL1));
+		return;
+	}
+
 	pmblimitr = read_sysreg_s(SYS_PMBLIMITR_EL1);
 	if (pmblimitr & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)) {
 		psb_csync();
@@ -49,6 +54,13 @@ void __spe_save_guest_state_nvhe(struct kvm_vcpu *vcpu,
 {
 	u64 pmbsr;
 
+	/*
+	 * Profiling is stopped and all register accesses are trapped, nothing
+	 * to save here.
+	 */
+	if (kvm_spe_profiling_stopped(vcpu))
+		return;
+
 	if (read_sysreg_s(SYS_PMBLIMITR_EL1) & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)) {
 		psb_csync();
 		dsb(nsh);
@@ -82,6 +94,11 @@ void __spe_save_guest_state_nvhe(struct kvm_vcpu *vcpu,
 void __spe_restore_host_state_nvhe(struct kvm_vcpu *vcpu,
 				   struct kvm_cpu_context *host_ctxt)
 {
+	if (kvm_spe_profiling_stopped(vcpu)) {
+		__debug_restore_spe(ctxt_sys_reg(host_ctxt, PMSCR_EL1));
+		return;
+	}
+
 	__spe_restore_common_state(host_ctxt);
 
 	write_sysreg_s(ctxt_sys_reg(host_ctxt, PMBPTR_EL1), SYS_PMBPTR_EL1);
@@ -94,6 +111,13 @@ void __spe_restore_host_state_nvhe(struct kvm_vcpu *vcpu,
 void __spe_restore_guest_state_nvhe(struct kvm_vcpu *vcpu,
 				    struct kvm_cpu_context *guest_ctxt)
 {
+	/*
+	 * Profiling is stopped and all register accesses are trapped, nothing
+	 * to restore here.
+	 */
+	if (kvm_spe_profiling_stopped(vcpu))
+		return;
+
 	__spe_restore_common_state(guest_ctxt);
 
 	write_sysreg_s(ctxt_sys_reg(guest_ctxt, PMBPTR_EL1), SYS_PMBPTR_EL1);
diff --git a/arch/arm64/kvm/hyp/vhe/spe-sr.c b/arch/arm64/kvm/hyp/vhe/spe-sr.c
index ea4b3b69bb32..024a4c0618cc 100644
--- a/arch/arm64/kvm/hyp/vhe/spe-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/spe-sr.c
@@ -10,6 +10,34 @@
 
 #include <hyp/spe-sr.h>
 
+static void __spe_save_host_buffer(u64 *pmscr_el2)
+{
+	u64 pmblimitr;
+
+	/* Disable guest profiling. */
+	write_sysreg_el1(0, SYS_PMSCR);
+
+	pmblimitr = read_sysreg_s(SYS_PMBLIMITR_EL1);
+	if (!(pmblimitr & BIT(SYS_PMBLIMITR_EL1_E_SHIFT))) {
+		*pmscr_el2 = 0;
+		return;
+	}
+
+	*pmscr_el2 = read_sysreg_el2(SYS_PMSCR);
+
+	/* Disable profiling at EL2 so we can drain the buffer. */
+	write_sysreg_el2(0, SYS_PMSCR);
+	isb();
+
+	/*
+	 * We're going to change the buffer owning exception level when we
+	 * activate traps, drain the buffer now.
+	 */
+	psb_csync();
+	dsb(nsh);
+}
+NOKPROBE_SYMBOL(__spe_save_host_buffer);
+
 /*
  * Disable host profiling, drain the buffer and save the host SPE context.
  * Extra care must be taken because profiling might be in progress.
@@ -19,6 +47,11 @@ void __spe_save_host_state_vhe(struct kvm_vcpu *vcpu,
 {
 	u64 pmblimitr, pmscr_el2;
 
+	if (kvm_spe_profiling_stopped(vcpu)) {
+		__spe_save_host_buffer(__ctxt_sys_reg(host_ctxt, PMSCR_EL2));
+		return;
+	}
+
 	/* Disable profiling while the SPE context is being switched. */
 	pmscr_el2 = read_sysreg_el2(SYS_PMSCR);
 	write_sysreg_el2(__vcpu_sys_reg(vcpu, PMSCR_EL2), SYS_PMSCR);
@@ -50,6 +83,9 @@ void __spe_save_guest_state_vhe(struct kvm_vcpu *vcpu,
 {
 	u64 pmblimitr, pmbsr;
 
+	if (kvm_spe_profiling_stopped(vcpu))
+		return;
+
 	/*
 	 * We're at EL2 and the buffer owning regime is EL1, which means that
 	 * profiling is disabled. After we disable traps and restore the host's
@@ -78,6 +114,18 @@ void __spe_save_guest_state_vhe(struct kvm_vcpu *vcpu,
 }
 NOKPROBE_SYMBOL(__spe_save_guest_state_vhe);
 
+static void __spe_restore_host_buffer(u64 pmscr_el2)
+{
+	if (!pmscr_el2)
+		return;
+
+	/* Synchronize MDCR_EL2 write. */
+	isb();
+
+	write_sysreg_el2(pmscr_el2, SYS_PMSCR);
+}
+NOKPROBE_SYMBOL(__spe_restore_host_buffer);
+
 /*
  * Restore the host SPE context. Special care must be taken because we're
  * potentially resuming a profiling session which was stopped when we saved the
@@ -86,6 +134,11 @@ NOKPROBE_SYMBOL(__spe_save_guest_state_vhe);
 void __spe_restore_host_state_vhe(struct kvm_vcpu *vcpu,
 				  struct kvm_cpu_context *host_ctxt)
 {
+	if (kvm_spe_profiling_stopped(vcpu)) {
+		__spe_restore_host_buffer(ctxt_sys_reg(host_ctxt, PMSCR_EL2));
+		return;
+	}
+
 	__spe_restore_common_state(host_ctxt);
 
 	write_sysreg_s(ctxt_sys_reg(host_ctxt, PMBPTR_EL1), SYS_PMBPTR_EL1);
@@ -115,6 +168,9 @@ NOKPROBE_SYMBOL(__spe_restore_host_state_vhe);
 void __spe_restore_guest_state_vhe(struct kvm_vcpu *vcpu,
 				   struct kvm_cpu_context *guest_ctxt)
 {
+	if (kvm_spe_profiling_stopped(vcpu))
+		return;
+
 	__spe_restore_common_state(guest_ctxt);
 
 	/*
diff --git a/arch/arm64/kvm/spe.c b/arch/arm64/kvm/spe.c
index 2630e777fe1d..69ca731ba9d3 100644
--- a/arch/arm64/kvm/spe.c
+++ b/arch/arm64/kvm/spe.c
@@ -140,6 +140,28 @@ void kvm_spe_sync_hwstate(struct kvm_vcpu *vcpu)
 	kvm_spe_update_irq(vcpu, true);
 }
 
+static bool kvm_spe_buffer_enabled(struct kvm_vcpu *vcpu)
+{
+	return !vcpu->arch.spe.irq_level  &&
+		(__vcpu_sys_reg(vcpu, PMBLIMITR_EL1) & BIT(SYS_PMBLIMITR_EL1_E_SHIFT));
+}
+
+bool kvm_spe_exit_to_user(struct kvm_vcpu *vcpu)
+{
+	u64 pmscr_enabled_mask = BIT(SYS_PMSCR_EL1_E0SPE_SHIFT) |
+				 BIT(SYS_PMSCR_EL1_E1SPE_SHIFT);
+
+	if (!(vcpu->arch.spe.flags & KVM_VCPU_SPE_STOP_USER_EXIT))
+		return false;
+
+	/*
+	 * We don't trap the guest dropping to EL0, so exit even if profiling is
+	 * disabled at EL1, but enabled at EL0.
+	 */
+	return kvm_spe_buffer_enabled(vcpu) &&
+		(__vcpu_sys_reg(vcpu, PMSCR_EL1) & pmscr_enabled_mask);
+}
+
 void kvm_spe_write_sysreg(struct kvm_vcpu *vcpu, int reg, u64 val)
 {
 	__vcpu_sys_reg(vcpu, reg) = val;
@@ -215,6 +237,31 @@ static bool kvm_spe_irq_is_valid(struct kvm *kvm, int irq)
 	return true;
 }
 
+static int kvm_spe_stop_user(struct kvm_vcpu *vcpu, int flags)
+{
+	struct kvm_vcpu_spe *spe = &vcpu->arch.spe;
+
+	if (flags & KVM_ARM_VCPU_SPE_STOP_TRAP) {
+		if (flags & ~KVM_ARM_VCPU_SPE_STOP_TRAP)
+			return -EINVAL;
+		spe->flags = KVM_VCPU_SPE_STOP_USER;
+	}
+
+	if (flags & KVM_ARM_VCPU_SPE_STOP_EXIT) {
+		if (flags & ~KVM_ARM_VCPU_SPE_STOP_EXIT)
+			return -EINVAL;
+		spe->flags = KVM_VCPU_SPE_STOP_USER_EXIT;
+	}
+
+	if (flags & KVM_ARM_VCPU_SPE_RESUME) {
+		if (flags & ~KVM_ARM_VCPU_SPE_RESUME)
+			return -EINVAL;
+		spe->flags = 0;
+	}
+
+	return 0;
+}
+
 int kvm_spe_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 {
 	if (!kvm_vcpu_supports_spe(vcpu))
@@ -268,6 +315,8 @@ int kvm_spe_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 
 		if (!flags)
 			return -EINVAL;
+
+		return kvm_spe_stop_user(vcpu, flags);
 	}
 	}
 
@@ -293,6 +342,24 @@ int kvm_spe_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 
 		return 0;
 	}
+	case KVM_ARM_VCPU_SPE_STOP: {
+		int __user *uaddr = (int __user *)(long)attr->addr;
+		struct kvm_vcpu_spe *spe = &vcpu->arch.spe;
+		int flag = 0;
+
+		if (!vcpu->arch.spe.initialized)
+			return -EAGAIN;
+
+		if (spe->flags & KVM_VCPU_SPE_STOP_USER)
+			flag = KVM_ARM_VCPU_SPE_STOP_TRAP;
+		else if (spe->flags & KVM_VCPU_SPE_STOP_USER_EXIT)
+			flag = KVM_ARM_VCPU_SPE_STOP_EXIT;
+
+		if (put_user(flag, uaddr))
+			return -EFAULT;
+
+		return 0;
+	}
 	}
 
 	return -ENXIO;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 064742cee425..cc711b081f31 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -608,7 +608,7 @@ static bool access_spe_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 {	int reg = r->reg;
 	u64 val = p->regval;
 
-	if (reg < PMBLIMITR_EL1) {
+	if (reg < PMBLIMITR_EL1 && !kvm_spe_profiling_stopped(vcpu)) {
 		print_sys_reg_msg(p, "Unsupported guest SPE register access at: %lx [%08lx]\n",
 				  *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
 	}
-- 
2.33.0




More information about the linux-arm-kernel mailing list