[PATCH v6 08/25] KVM: arm64: iommu: Shadow host stage-2 page table
Mostafa Saleh
smostafa at google.com
Fri May 1 04:19:10 PDT 2026
Create a page-table for the IOMMU that shadows the host CPU stage-2
to establish DMA isolation.
An initial snapshot is created after the driver init; then, on every
permission change, a callback is invoked so the IOMMU driver can
update its page table.
There are 3 different ways to add the callback:
1) In the high-level memory transitions (__pkvm_host_donate_hyp(),
   __pkvm_host_donate_guest(), ...), which needs one hook per
   transition.
2) In lower-level functions covering all transitions:
   - host_stage2_set_owner_metadata_locked(), which covers:
     - __pkvm_host_donate_hyp()
     - __pkvm_host_donate_guest()
     - __pkvm_guest_unshare_host()
   - host_stage2_set_owner_locked(), only for ID_HOST, which covers:
     - __pkvm_hyp_donate_host()
     - __pkvm_host_force_reclaim_page_guest()
     - __pkvm_host_reclaim_page_guest()
     - __pkvm_guest_share_host()
3) In the lowest-level function, __host_update_page_state(), which
   requires only one callback. However, at that level the new page
   state alone is not enough; we might also need to know the old state.
Option #3 is what is implemented here; for contrast, a sketch of
option #1 follows below.
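For illustration, a rough sketch of what option #1 would have looked
like; the transition body and the IOMMU hook names below are
hypothetical, invented only to show the per-transition duplication
this option implies:

	/*
	 * Hypothetical option #1: every high-level transition grows its
	 * own IOMMU hook. kvm_iommu_host_unmap() and do_donate_hyp() are
	 * made-up names, used purely for illustration.
	 */
	int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
	{
		int ret;

		/* Existing ownership transition logic. */
		ret = do_donate_hyp(pfn, nr_pages);
		if (ret)
			return ret;

		/* The host loses access: mirror the unmap into the IOMMU. */
		return kvm_iommu_host_unmap(hyp_pfn_to_phys(pfn),
					    nr_pages * PAGE_SIZE);
	}

Every other transition (donate to guest, reclaim, share/unshare) would
need an equivalent hook, which is why the single low-level callback of
option #3 was preferred.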
In some cases, an SMMUv3 may be able to share the host CPU stage-2
page-table directly. However, that is too restrictive: it requires
changes to the core hypervisor page-table code, and it would require
the hypervisor to handle IOMMU page-faults. This can be added later as
an optimization for SMMUv3.
Signed-off-by: Mostafa Saleh <smostafa at google.com>
---
arch/arm64/kvm/hyp/include/nvhe/iommu.h | 4 +
arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 +
arch/arm64/kvm/hyp/nvhe/iommu/iommu.c | 108 +++++++++++++++++-
arch/arm64/kvm/hyp/nvhe/mem_protect.c | 35 ++++++
4 files changed, 146 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h
index 1ac70cc28a9e..6277d845cdcf 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h
@@ -3,11 +3,15 @@
#define __ARM64_KVM_NVHE_IOMMU_H__
#include <asm/kvm_host.h>
+#include <asm/kvm_pgtable.h>
struct kvm_iommu_ops {
int (*init)(void);
+ int (*host_stage2_idmap)(phys_addr_t start, phys_addr_t end, int prot);
};
int kvm_iommu_init(void);
+int kvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end,
+ enum kvm_pgtable_prot prot);
#endif /* __ARM64_KVM_NVHE_IOMMU_H__ */
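As a usage illustration (not part of this patch), a driver would
populate the new callback next to init; all function names below are
hypothetical:

	/* Hypothetical nVHE IOMMU driver providing the new callback. */
	static int my_iommu_init(void)
	{
		/* Probe the hardware and allocate the shadow page-table. */
		return 0;
	}

	static int my_iommu_host_stage2_idmap(phys_addr_t start,
					      phys_addr_t end, int prot)
	{
		/*
		 * Identity-map [start, end) with the given IOMMU_* prot,
		 * or unmap the range when prot == 0.
		 */
		return 0;
	}

	static struct kvm_iommu_ops my_iommu_ops = {
		.init = my_iommu_init,
		.host_stage2_idmap = my_iommu_host_stage2_idmap,
	};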
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index ff440204d2c7..f7faecc3b70a 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -54,6 +54,8 @@ int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct
int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu);
bool addr_is_memory(phys_addr_t phys);
+u64 find_mem_range_from(u64 start, bool *is_memory);
+
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id);
int kvm_host_prepare_stage2(void *pgt_pool_base);
diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu/iommu.c
index 406c8fb9b3b9..1db52bd87c38 100644
--- a/arch/arm64/kvm/hyp/nvhe/iommu/iommu.c
+++ b/arch/arm64/kvm/hyp/nvhe/iommu/iommu.c
@@ -4,17 +4,119 @@
*
* Copyright (C) 2022 Linaro Ltd.
*/
+#include <linux/iommu.h>
+
#include <nvhe/iommu.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/spinlock.h>
/* Only one set of ops supported */
struct kvm_iommu_ops *kvm_iommu_ops;
+/* Protected by host_mmu.lock */
+static bool kvm_idmap_initialized;
+
+static inline int pkvm_to_iommu_prot(enum kvm_pgtable_prot prot)
+{
+ int iommu_prot = 0;
+
+ if (prot & KVM_PGTABLE_PROT_R)
+ iommu_prot |= IOMMU_READ;
+ if (prot & KVM_PGTABLE_PROT_W)
+ iommu_prot |= IOMMU_WRITE;
+
+ /* Warn on permission bits we don't understand, as they might be dangerous. */
+ WARN_ON(prot & ~PKVM_HOST_MEM_PROT);
+ return iommu_prot;
+}
+
+static int __snapshot_host_stage2(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ u64 start = ctx->addr;
+ u64 end = start + kvm_granule_size(ctx->level);
+ kvm_pte_t pte = *ctx->ptep;
+ bool is_memory;
+ u64 region_end;
+ int prot;
+ int ret;
+
+ /*
+ * Keep annotated PTEs unmapped, and map everything else, even lazily
+ * mapped MMIO with pte == 0, as the IOMMU can't handle page faults.
+ * This maps the whole address space, which can be large, but it doesn't
+ * use much memory as it is mostly covered by large blocks (1GB with 4KB
+ * pages).
+ */
+ if (pte && !kvm_pte_valid(pte))
+ return 0;
+
+ if (kvm_pte_valid(pte)) {
+ prot = pkvm_to_iommu_prot(kvm_pgtable_stage2_pte_prot(pte));
+ /* If the range is mapped in a single PTE, it must be the same type. */
+ if (!addr_is_memory(start))
+ prot |= IOMMU_MMIO;
+
+ return kvm_iommu_ops->host_stage2_idmap(start, end, prot);
+ }
+
+ /* For an invalid PTE, figure out which parts of the range are MMIO. */
+ do {
+ prot = IOMMU_READ | IOMMU_WRITE;
+ region_end = find_mem_range_from(start, &is_memory);
+ region_end = min(end, region_end);
+ if (!is_memory)
+ prot |= IOMMU_MMIO;
+
+ ret = kvm_iommu_ops->host_stage2_idmap(start, region_end, prot);
+ if (ret)
+ return ret;
+
+ start = region_end;
+ } while (start < end);
+
+ return 0;
+}
+
+static int kvm_iommu_snapshot_host_stage2(void)
+{
+ int ret;
+ struct kvm_pgtable_walker walker = {
+ .cb = __snapshot_host_stage2,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+ struct kvm_pgtable *pgt = &host_mmu.pgt;
+
+ hyp_spin_lock(&host_mmu.lock);
+ ret = kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker);
+ /* Start receiving calls to host_stage2_idmap. */
+ kvm_idmap_initialized = !ret;
+ hyp_spin_unlock(&host_mmu.lock);
+
+ return ret;
+}
int kvm_iommu_init(void)
{
- /* Keep DMA isolation optional. */
- if (!kvm_iommu_ops || !kvm_iommu_ops->init)
+ int ret;
+
+ if (!kvm_iommu_ops || !kvm_iommu_ops->init ||
+ !kvm_iommu_ops->host_stage2_idmap)
+ return 0;
+
+ ret = kvm_iommu_ops->init();
+ if (ret)
+ return ret;
+
+ return kvm_iommu_snapshot_host_stage2();
+}
+
+int kvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end,
+ enum kvm_pgtable_prot prot)
+{
+ hyp_assert_lock_held(&host_mmu.lock);
+
+ if (!kvm_idmap_initialized)
return 0;
- return kvm_iommu_ops->init();
+ return kvm_iommu_ops->host_stage2_idmap(start, end, pkvm_to_iommu_prot(prot));
}
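To make the snapshot walk concrete: suppose it visits an invalid 1GiB
block PTE covering [0x40000000, 0x80000000) and, per the memblock
list, DRAM starts at 0x60000000 (addresses are made up for
illustration). The do/while loop in __snapshot_host_stage2() would
then emit two calls:

	/* MMIO part of the block, below the start of DRAM. */
	kvm_iommu_ops->host_stage2_idmap(0x40000000, 0x60000000,
					 IOMMU_READ | IOMMU_WRITE | IOMMU_MMIO);
	/* DRAM part of the block. */
	kvm_iommu_ops->host_stage2_idmap(0x60000000, 0x80000000,
					 IOMMU_READ | IOMMU_WRITE);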
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 2fb20a63a417..b54cb72ed88c 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -15,6 +15,7 @@
#include <hyp/fault.h>
#include <nvhe/gfp.h>
+#include <nvhe/iommu.h>
#include <nvhe/memory.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
@@ -481,6 +482,14 @@ static int check_range_allowed_memory(u64 start, u64 end)
return 0;
}
+u64 find_mem_range_from(u64 start, bool *is_memory)
+{
+ struct kvm_mem_range r;
+
+ *is_memory = !!find_mem_range(start, &r);
+ return r.end;
+}
+
static bool range_is_memory(u64 start, u64 end)
{
struct kvm_mem_range r;
@@ -577,8 +586,34 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state)
{
+ enum pkvm_page_state old = get_host_state(hyp_phys_to_page(addr));
+ enum kvm_pgtable_prot prot = 0;
+
for_each_hyp_page(page, addr, size)
set_host_state(page, state);
+
+ /*
+ * Any transition to PKVM_NOPAGE unmaps the page from the host.
+ * Any transition to PKVM_PAGE_SHARED_BORROWED maps the page in the host.
+ * Any transition to PKVM_PAGE_SHARED_OWNED is ignored, as the page is
+ * already mapped.
+ * Transitions to PKVM_PAGE_OWNED from anything but PKVM_NOPAGE are
+ * ignored; transitions from PKVM_NOPAGE map the page.
+ */
+ if ((state == PKVM_PAGE_SHARED_OWNED) ||
+ ((state == PKVM_PAGE_OWNED) && (old != PKVM_NOPAGE)))
+ return;
+
+ if ((state == PKVM_PAGE_SHARED_BORROWED) ||
+ (state == PKVM_PAGE_OWNED))
+ prot = PKVM_HOST_MEM_PROT;
+
+ /*
+ * Only update the IOMMU from here for now, as MMIO can't transition
+ * ownership after de-privilege; that will need to change when device
+ * assignment is supported.
+ * Also, WARN on failure, as we can't unwind at this point.
+ */
+ WARN_ON(kvm_iommu_host_stage2_idmap(addr, addr + size, prot));
}
#define KVM_HOST_DONATION_PTE_OWNER_MASK GENMASK(3, 1)
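For review convenience, a few concrete transitions through
__host_update_page_state() and the IOMMU updates they trigger, derived
from the rules in the comment above (no new behaviour):

	/*
	 * old state -> new state                    => IOMMU action
	 * PKVM_PAGE_OWNED -> PKVM_NOPAGE            => idmap(addr, addr + size, 0), i.e. unmap
	 * PKVM_NOPAGE -> PKVM_PAGE_SHARED_BORROWED  => idmap(..., PKVM_HOST_MEM_PROT), i.e. map
	 * PKVM_PAGE_OWNED -> PKVM_PAGE_SHARED_OWNED => no call, page already mapped
	 * PKVM_NOPAGE -> PKVM_PAGE_OWNED            => idmap(..., PKVM_HOST_MEM_PROT), i.e. map
	 */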
--
2.54.0.545.g6539524ca2-goog