[RFC PATCH v6 22/35] KVM: arm64: VHE: Context switch SPE state
Alexandru Elisei
alexandru.elisei at arm.com
Fri Nov 14 08:07:03 PST 2025
Save and restore the SPE register state at the appropriate points in the
VCPU life cycle:
1. On VCPU load/put:
* The sampling registers for the host and guest.
* The buffer registers for the host.
2. On VCPU entry/exit:
* The buffer registers for the guest.
Note that, as a consequence, when a VM has SPE enabled, KVM disables host
profiling when the VCPU is scheduled in and resumes it when the VCPU is
scheduled out. This differs from the case of a VM without SPE, where host
sampling merely stops when the exception level changes to EL1.
Signed-off-by: Alexandru Elisei <alexandru.elisei at arm.com>
---
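As a quick reviewer aid, here is a tiny standalone model, in plain C, of
where each register group changes hands. It is only an illustration: none
of the names below exist in the kernel, and it deliberately ignores the
draining and synchronisation details handled by spe.c and spe-sr.c in this
patch.

#include <stdio.h>

struct spe_regs {
	int sampling;	/* models PMSCR_EL1, PMSICR_EL1, PMSIRR_EL1, ... */
	int buffer;	/* models PMBPTR_EL1, PMBLIMITR_EL1, PMBSR_EL1 */
};

static struct spe_regs host = { 1, 1 }, guest = { 2, 2 }, hw;

/* 1. vcpu_load: switch the sampling registers, save the host buffer. */
static void model_vcpu_load(void)
{
	host.sampling = hw.sampling;	/* save host sampling registers */
	host.buffer   = hw.buffer;	/* save host buffer registers */
	hw.sampling   = guest.sampling;	/* restore guest sampling registers */
}

/* 2. Guest entry/exit: only the guest buffer registers are switched here. */
static void model_guest_entry(void)
{
	hw.buffer = guest.buffer;	/* restore guest buffer registers */
}

static void model_guest_exit(void)
{
	guest.buffer = hw.buffer;	/* save guest buffer registers */
}

/* 1'. vcpu_put: mirror image of vcpu_load. */
static void model_vcpu_put(void)
{
	guest.sampling = hw.sampling;	/* save guest sampling registers */
	hw.sampling    = host.sampling;	/* restore host sampling registers */
	hw.buffer      = host.buffer;	/* restore host buffer registers */
}

int main(void)
{
	hw = host;		/* the host is profiling before the VCPU runs */

	model_vcpu_load();
	model_guest_entry();
	/* ... guest runs ... */
	model_guest_exit();
	model_vcpu_put();

	printf("host state restored: %d\n",
	       hw.sampling == host.sampling && hw.buffer == host.buffer);
	return 0;
}

Compiling and running it prints "host state restored: 1", mirroring the
requirement that host profiling resumes untouched after the VCPU is
scheduled out.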
arch/arm64/include/asm/kvm_hyp.h | 16 +++-
arch/arm64/include/asm/kvm_spe.h | 17 +++++
arch/arm64/kvm/arm.c | 10 +++
arch/arm64/kvm/debug.c | 10 ++-
arch/arm64/kvm/hyp/vhe/Makefile | 1 +
arch/arm64/kvm/hyp/vhe/spe-sr.c | 80 ++++++++++++++++++++
arch/arm64/kvm/hyp/vhe/switch.c | 2 +
arch/arm64/kvm/spe.c | 125 +++++++++++++++++++++++++++++++
8 files changed, 259 insertions(+), 2 deletions(-)
create mode 100644 arch/arm64/kvm/hyp/vhe/spe-sr.c
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index e6be1f5d0967..93ebe3c0d417 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -109,7 +109,21 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu);
#ifdef __KVM_NVHE_HYPERVISOR__
void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu);
void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu);
-#endif
+#else
+#ifdef CONFIG_KVM_ARM_SPE
+void __kvm_spe_save_guest_buffer(struct kvm_vcpu *vcpu, struct kvm_cpu_context *guest_ctxt);
+void __kvm_spe_restore_guest_buffer(struct kvm_vcpu *vcpu, struct kvm_cpu_context *guest_ctxt);
+#else
+static inline void
+__kvm_spe_save_guest_buffer(struct kvm_vcpu *vcpu, struct kvm_cpu_context *guest_ctxt)
+{
+}
+static inline void
+__kvm_spe_restore_guest_buffer(struct kvm_vcpu *vcpu, struct kvm_cpu_context *guest_ctxt)
+{
+}
+#endif /* CONFIG_KVM_ARM_SPE */
+#endif /* __KVM_NVHE_HYPERVISOR__ */
void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
diff --git a/arch/arm64/include/asm/kvm_spe.h b/arch/arm64/include/asm/kvm_spe.h
index 6bc728723897..077ca1e596b8 100644
--- a/arch/arm64/include/asm/kvm_spe.h
+++ b/arch/arm64/include/asm/kvm_spe.h
@@ -16,6 +16,7 @@ struct kvm_spe {
};
struct kvm_vcpu_spe {
+ u64 host_pmscr_el2; /* Host PMSCR_EL2 register, context switched. */
int irq_num; /* Buffer management interrupt number */
bool initialized; /* SPE initialized for the VCPU */
};
@@ -30,6 +31,13 @@ static __always_inline bool kvm_supports_spe(void)
#define vcpu_has_spe(vcpu) \
(vcpu_has_feature(vcpu, KVM_ARM_VCPU_SPE))
+/* Implements the function ProfilingBufferEnabled() from ARM DDI0487K.a */
+static inline bool kvm_spe_profiling_buffer_enabled(u64 pmblimitr_el1, u64 pmbsr_el1)
+{
+ return !FIELD_GET(PMBSR_EL1_S, pmbsr_el1) &&
+ FIELD_GET(PMBLIMITR_EL1_E, pmblimitr_el1);
+}
+
void kvm_spe_init_vm(struct kvm *kvm);
int kvm_spe_vcpu_first_run_init(struct kvm_vcpu *vcpu);
@@ -44,6 +52,9 @@ u64 kvm_spe_read_sysreg(struct kvm_vcpu *vcpu, int reg, u32 encoding);
bool kvm_spe_has_feat_spe_fne(struct kvm *kvm);
bool kvm_spe_has_feat_spe_fds(struct kvm *kvm);
+
+void kvm_vcpu_spe_load(struct kvm_vcpu *vcpu);
+void kvm_vcpu_spe_put(struct kvm_vcpu *vcpu);
#else
struct kvm_spe {
};
@@ -93,6 +104,12 @@ static inline bool kvm_spe_has_feat_spe_fds(struct kvm *kvm)
{
return false;
}
+static inline void kvm_vcpu_spe_load(struct kvm_vcpu *vcpu)
+{
+}
+static inline void kvm_vcpu_spe_put(struct kvm_vcpu *vcpu)
+{
+}
#endif /* CONFIG_KVM_ARM_SPE */
#endif /* __ARM64_KVM_SPE_H__ */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index f5c846c16cb8..c5f5d5dbd695 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -631,6 +631,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
kvm_timer_vcpu_load(vcpu);
kvm_vgic_load(vcpu);
+ /*
+ * Drain the host profiling buffer before the buffer owning exception
+ * level is changed in kvm_vcpu_load_debug().
+ */
+ kvm_vcpu_spe_load(vcpu);
kvm_vcpu_load_debug(vcpu);
kvm_vcpu_load_fgt(vcpu);
if (has_vhe())
@@ -670,6 +675,11 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
}
kvm_vcpu_put_debug(vcpu);
+ /*
+ * Restore the host profiling session after the owning exception level
+ * is restored in kvm_vcpu_put_debug().
+ */
+ kvm_vcpu_spe_put(vcpu);
kvm_arch_vcpu_put_fp(vcpu);
if (has_vhe())
kvm_vcpu_put_vhe(vcpu);
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 0821ebfb03fa..d6357784730d 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -75,8 +75,16 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
kvm_nested_setup_mdcr_el2(vcpu);
/* Write MDCR_EL2 directly if we're already at EL2 */
- if (has_vhe())
+ if (has_vhe()) {
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+ if (vcpu_has_spe(vcpu)) {
+ /*
+ * Synchronize the write that changes the owning regime
+ * to EL1&0.
+ */
+ isb();
+ }
+ }
preempt_enable();
}
diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile
index afc4aed9231a..49496139b156 100644
--- a/arch/arm64/kvm/hyp/vhe/Makefile
+++ b/arch/arm64/kvm/hyp/vhe/Makefile
@@ -11,3 +11,4 @@ CFLAGS_switch.o += -Wno-override-init
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o
+obj-$(CONFIG_KVM_ARM_SPE) += spe-sr.o
diff --git a/arch/arm64/kvm/hyp/vhe/spe-sr.c b/arch/arm64/kvm/hyp/vhe/spe-sr.c
new file mode 100644
index 000000000000..fb8614435069
--- /dev/null
+++ b/arch/arm64/kvm/hyp/vhe/spe-sr.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 - ARM Ltd
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_hyp.h>
+#include <asm/kprobes.h>
+#include <asm/kvm_spe.h>
+
+/*
+ * The following are true when the guest buffer is restored or saved:
+ * - Sampling is disabled.
+ * - The buffer owning regime is EL1&0.
+ * - Stage 2 is enabled.
+ */
+
+static bool kvm_spe_profiling_buffer_enabled_ctxt(struct kvm_cpu_context *ctxt)
+{
+ return kvm_spe_profiling_buffer_enabled(ctxt_sys_reg(ctxt, PMBLIMITR_EL1),
+ ctxt_sys_reg(ctxt, PMBSR_EL1));
+}
+
+void __kvm_spe_restore_guest_buffer(struct kvm_vcpu *vcpu, struct kvm_cpu_context *guest_ctxt)
+{
+ if (!vcpu_has_spe(vcpu))
+ return;
+
+ /*
+ * If StatisticalProfilingEnabled() is false or the buffer is in
+ * discard mode, the hardware value for PMBPTR_EL1 won't change while
+ * the guest is running, so no point in writing the registers to
+ * hardware.
+ *
+ * This is also about correctness: KVM runs the guest with the hardware
+ * service bit clear. If the in-memory service bit is set, the only way
+ * to keep profiling stopped while the guest is running is to leave the
+ * hardware buffer enable bit clear.
+ */
+ if (!kvm_spe_profiling_buffer_enabled_ctxt(guest_ctxt))
+ return;
+
+ write_sysreg_s(ctxt_sys_reg(guest_ctxt, PMBPTR_EL1), SYS_PMBPTR_EL1);
+ isb();
+ write_sysreg_s(ctxt_sys_reg(guest_ctxt, PMBLIMITR_EL1), SYS_PMBLIMITR_EL1);
+}
+NOKPROBE_SYMBOL(__kvm_spe_restore_guest_buffer);
+
+void __kvm_spe_save_guest_buffer(struct kvm_vcpu *vcpu, struct kvm_cpu_context *guest_ctxt)
+{
+ u64 pmbsr_el1;
+
+ if (!vcpu_has_spe(vcpu))
+ return;
+
+ /* See __kvm_spe_restore_guest_buffer() */
+ if (!kvm_spe_profiling_buffer_enabled_ctxt(guest_ctxt))
+ return;
+
+ psb_csync();
+ dsb(nsh);
+ /* Make the advanced PMBPTR_EL1 value visible to the read below. */
+ isb();
+ write_sysreg_s(0, SYS_PMBLIMITR_EL1);
+ isb();
+
+ ctxt_sys_reg(guest_ctxt, PMBPTR_EL1) = read_sysreg_s(SYS_PMBPTR_EL1);
+
+ pmbsr_el1 = read_sysreg_s(SYS_PMBSR_EL1);
+ if (!FIELD_GET(PMBSR_EL1_S, pmbsr_el1))
+ return;
+
+ /* Stop the SPU from asserting PMBIRQ. */
+ write_sysreg_s(0, SYS_PMBSR_EL1);
+ isb();
+ /* PMBSR_EL1 changed while the VCPU was running, save it */
+ ctxt_sys_reg(guest_ctxt, PMBSR_EL1) = pmbsr_el1;
+}
+NOKPROBE_SYMBOL(__kvm_spe_save_guest_buffer);
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 9984c492305a..14449d568405 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -593,6 +593,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
__kvm_adjust_pc(vcpu);
sysreg_restore_guest_state_vhe(guest_ctxt);
+ __kvm_spe_restore_guest_buffer(vcpu, guest_ctxt);
__debug_switch_to_guest(vcpu);
do {
@@ -603,6 +604,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
} while (fixup_guest_exit(vcpu, &exit_code));
sysreg_save_guest_state_vhe(guest_ctxt);
+ __kvm_spe_save_guest_buffer(vcpu, guest_ctxt);
__deactivate_traps(vcpu);
diff --git a/arch/arm64/kvm/spe.c b/arch/arm64/kvm/spe.c
index fa24e47a1e73..32156e43f454 100644
--- a/arch/arm64/kvm/spe.c
+++ b/arch/arm64/kvm/spe.c
@@ -169,6 +169,131 @@ u64 kvm_spe_read_sysreg(struct kvm_vcpu *vcpu, int reg, u32 encoding)
return val;
}
+static void kvm_spe_save_sampling_regs(struct kvm_vcpu *vcpu, struct kvm_cpu_context *ctxt)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ ctxt_sys_reg(ctxt, PMSCR_EL1) = read_sysreg_el1(SYS_PMSCR);
+ if (kvm_spe_has_feat_spe_fne(kvm))
+ ctxt_sys_reg(ctxt, PMSNEVFR_EL1) = read_sysreg_s(SYS_PMSNEVFR_EL1);
+ ctxt_sys_reg(ctxt, PMSICR_EL1) = read_sysreg_s(SYS_PMSICR_EL1);
+ ctxt_sys_reg(ctxt, PMSIRR_EL1) = read_sysreg_s(SYS_PMSIRR_EL1);
+ ctxt_sys_reg(ctxt, PMSFCR_EL1) = read_sysreg_s(SYS_PMSFCR_EL1);
+ ctxt_sys_reg(ctxt, PMSEVFR_EL1) = read_sysreg_s(SYS_PMSEVFR_EL1);
+ ctxt_sys_reg(ctxt, PMSLATFR_EL1) = read_sysreg_s(SYS_PMSLATFR_EL1);
+ if (kvm_spe_has_feat_spe_fds(kvm))
+ ctxt_sys_reg(ctxt, PMSDSFR_EL1) = read_sysreg_s(SYS_PMSDSFR_EL1);
+}
+
+static void kvm_spe_restore_sampling_regs(struct kvm_vcpu *vcpu, struct kvm_cpu_context *ctxt)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ write_sysreg_el1(ctxt_sys_reg(ctxt, PMSCR_EL1), SYS_PMSCR);
+ if (kvm_spe_has_feat_spe_fne(kvm))
+ write_sysreg_s(ctxt_sys_reg(ctxt, PMSNEVFR_EL1), SYS_PMSNEVFR_EL1);
+ write_sysreg_s(ctxt_sys_reg(ctxt, PMSICR_EL1), SYS_PMSICR_EL1);
+ write_sysreg_s(ctxt_sys_reg(ctxt, PMSIRR_EL1), SYS_PMSIRR_EL1);
+ write_sysreg_s(ctxt_sys_reg(ctxt, PMSFCR_EL1), SYS_PMSFCR_EL1);
+ write_sysreg_s(ctxt_sys_reg(ctxt, PMSEVFR_EL1), SYS_PMSEVFR_EL1);
+ write_sysreg_s(ctxt_sys_reg(ctxt, PMSLATFR_EL1), SYS_PMSLATFR_EL1);
+ if (kvm_spe_has_feat_spe_fds(kvm))
+ write_sysreg_s(ctxt_sys_reg(ctxt, PMSDSFR_EL1), SYS_PMSDSFR_EL1);
+}
+
+void kvm_vcpu_spe_load(struct kvm_vcpu *vcpu)
+{
+ u64 host_pmblimitr_el1, host_pmscr_el2, host_pmbsr_el1;
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_cpu_context *guest_ctxt;
+
+ if (!vcpu_has_spe(vcpu) || unlikely(vcpu_on_unsupported_cpu(vcpu)))
+ return;
+
+ host_ctxt = host_data_ptr(host_ctxt);
+ guest_ctxt = &vcpu->arch.ctxt;
+
+ /* Disable interrupts to prevent races with the perf interrupt handler. */
+ local_irq_disable();
+
+ host_pmscr_el2 = read_sysreg_el2(SYS_PMSCR);
+ write_sysreg_el2(0, SYS_PMSCR);
+ /* Host was profiling, synchronize the write to PMSCR_EL2. */
+ if (FIELD_GET(PMSCR_EL2_E2SPE, host_pmscr_el2))
+ isb();
+
+ host_pmblimitr_el1 = read_sysreg_s(SYS_PMBLIMITR_EL1);
+ if (FIELD_GET(PMBLIMITR_EL1_E, host_pmblimitr_el1)) {
+ psb_csync();
+ dsb(nsh);
+ /*
+ * Disable the buffer, to avoid the wrong translation table
+ * entries being cached while KVM restores the guest context.
+ */
+ write_sysreg_s(0, SYS_PMBLIMITR_EL1);
+ /*
+ * The ISB here has two uses: hardware updates to the host's
+ * PMBPTR_EL1 register are made visible, and the write to
+ * PMBLIMITR_EL1 is synchronized.
+ */
+ isb();
+ }
+
+ host_pmbsr_el1 = read_sysreg_s(SYS_PMBSR_EL1);
+ if (FIELD_GET(PMBSR_EL1_S, host_pmbsr_el1)) {
+ /*
+ * If the GIC asserts the interrupt after the local_irq_enable()
+ * below, the perf interrupt handler will read PMBSR_EL1.S as zero
+ * and treat the interrupt as spurious.
+ */
+ write_sysreg_s(0, SYS_PMBSR_EL1);
+ isb();
+ }
+
+ local_irq_enable();
+
+ ctxt_sys_reg(host_ctxt, PMBPTR_EL1) = read_sysreg_s(SYS_PMBPTR_EL1);
+ ctxt_sys_reg(host_ctxt, PMBLIMITR_EL1) = host_pmblimitr_el1;
+ ctxt_sys_reg(host_ctxt, PMBSR_EL1) = host_pmbsr_el1;
+ vcpu->arch.vcpu_spe.host_pmscr_el2 = host_pmscr_el2;
+
+ kvm_spe_save_sampling_regs(vcpu, host_ctxt);
+ kvm_spe_restore_sampling_regs(vcpu, guest_ctxt);
+}
+
+void kvm_vcpu_spe_put(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_cpu_context *guest_ctxt;
+ u64 host_pmblimitr_el1;
+ bool buffer_enabled;
+
+ if (!vcpu_has_spe(vcpu) || unlikely(vcpu_on_unsupported_cpu(vcpu)))
+ return;
+
+ guest_ctxt = &vcpu->arch.ctxt;
+ host_ctxt = host_data_ptr(host_ctxt);
+
+ kvm_spe_save_sampling_regs(vcpu, guest_ctxt);
+ kvm_spe_restore_sampling_regs(vcpu, host_ctxt);
+
+ write_sysreg_el2(vcpu->arch.vcpu_spe.host_pmscr_el2, SYS_PMSCR);
+ write_sysreg_s(ctxt_sys_reg(host_ctxt, PMBPTR_EL1), SYS_PMBPTR_EL1);
+ write_sysreg_s(ctxt_sys_reg(host_ctxt, PMBSR_EL1), SYS_PMBSR_EL1);
+
+ host_pmblimitr_el1 = ctxt_sys_reg(host_ctxt, PMBLIMITR_EL1);
+ buffer_enabled = FIELD_GET(PMBLIMITR_EL1_E, host_pmblimitr_el1);
+
+ /* Synchronise above writes before enabling the buffer. */
+ if (buffer_enabled)
+ isb();
+
+ write_sysreg_s(host_pmblimitr_el1, SYS_PMBLIMITR_EL1);
+ /* Everything is on the hardware, re-enable the host buffer. */
+ if (buffer_enabled)
+ isb();
+}
+
static u64 max_buffer_size_to_pmbidr_el1(u64 size)
{
u64 msb_idx, num_bits;
--
2.51.2