[PATCH v2 17/18] KVM: arm64: Introduce the EL1 pKVM MMU

Marc Zyngier maz at kernel.org
Thu Dec 12 03:35:09 PST 2024


On Tue, 03 Dec 2024 10:37:34 +0000,
Quentin Perret <qperret at google.com> wrote:
> 
> Introduce a set of helper functions allowing the pKVM guest stage-2
> page-tables to be manipulated from EL1 using pKVM's HVC interface.
> 
> Each helper has an exact one-to-one correspondence with the traditional
> kvm_pgtable_stage2_*() functions from pgtable.c, with a strictly
> matching prototype. This will ease plumbing later on in mmu.c.
> 
> These callbacks track the gfn->pfn mappings in a simple rb_tree indexed
> by IPA in lieu of a page-table. This rb-tree is kept in sync with pKVM's
> state and is protected by a new rwlock -- the existing mmu_lock
> protection does not suffice in the map() path where the tree must be
> modified while user_mem_abort() only acquires a read_lock.
> 
> Signed-off-by: Quentin Perret <qperret at google.com>
> ---
> 
> The embedded union inside struct kvm_pgtable is arguably a bit horrible
> currently... I considered making the pgt argument to all kvm_pgtable_*()
> functions an opaque void * ptr, and moving the definition of
> struct kvm_pgtable to pgtable.c and the pkvm version into pkvm.c. Given
> that the allocation of that data-structure is done by the caller, that
> means we'd need to expose kvm_pgtable_get_pgd_size() or something that
> each MMU (pgtable.c and pkvm.c) would have to implement and things like
> that. But that felt like a bigger surgery, so I went with the simpler
> option. Thoughts welcome :-)

I really don't think it is too bad, and I'd rather keep some typing
than go the void * route. Some comments below.

> 
> Similarly, happy to drop the mappings_lock if we want to teach
> user_mem_abort() about taking a write lock on the mmu_lock in the pKVM
> case, but again this implementation is the least invasive into normal
> KVM so that felt like a reasonable starting point.
> ---
>  arch/arm64/include/asm/kvm_host.h    |   1 +
>  arch/arm64/include/asm/kvm_pgtable.h |  27 ++--
>  arch/arm64/include/asm/kvm_pkvm.h    |  28 ++++
>  arch/arm64/kvm/pkvm.c                | 195 +++++++++++++++++++++++++++
>  4 files changed, 242 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index f75988e3515b..05936b57a3a4 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -85,6 +85,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
>  struct kvm_hyp_memcache {
>  	phys_addr_t head;
>  	unsigned long nr_pages;
> +	struct pkvm_mapping *mapping; /* only used from EL1 */
>  };
>  
>  static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 04418b5e3004..d24d18874015 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -412,15 +412,24 @@ static inline bool kvm_pgtable_walk_lock_held(void)
>   *			be used instead of block mappings.
>   */
>  struct kvm_pgtable {
> -	u32					ia_bits;
> -	s8					start_level;
> -	kvm_pteref_t				pgd;
> -	struct kvm_pgtable_mm_ops		*mm_ops;
> -
> -	/* Stage-2 only */
> -	struct kvm_s2_mmu			*mmu;
> -	enum kvm_pgtable_stage2_flags		flags;
> -	kvm_pgtable_force_pte_cb_t		force_pte_cb;
> +	union {
> +		struct {
> +			u32					ia_bits;
> +			s8					start_level;
> +			kvm_pteref_t				pgd;
> +			struct kvm_pgtable_mm_ops		*mm_ops;
> +
> +			/* Stage-2 only */
> +			struct kvm_s2_mmu			*mmu;
> +			enum kvm_pgtable_stage2_flags		flags;
> +			kvm_pgtable_force_pte_cb_t		force_pte_cb;
> +		};
> +		struct {
> +			struct kvm				*kvm;

Given that the kvm_s2_mmu already has a back-pointer to kvm_arch,
maybe you could keep that one common and use it?

There is also a baked-in assumption that non-NV always uses the
s2_mmu that's embedded in kvm_arch.
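
Completely untested sketch of what I have in mind, keeping the mmu
pointer outside of the union and deriving the kvm pointer from it
(pkvm_pgt_to_handle() adjusted to match):

struct kvm_pgtable {
	union {
		struct {
			u32				ia_bits;
			s8				start_level;
			kvm_pteref_t			pgd;
			struct kvm_pgtable_mm_ops	*mm_ops;

			/* Stage-2 only */
			enum kvm_pgtable_stage2_flags	flags;
			kvm_pgtable_force_pte_cb_t	force_pte_cb;
		};
		struct {
			struct rb_root			mappings;
			rwlock_t			mappings_lock;
		} pkvm;
	};

	/* Common to both flavours */
	struct kvm_s2_mmu			*mmu;
};

static inline pkvm_handle_t pkvm_pgt_to_handle(struct kvm_pgtable *pgt)
{
	return kvm_s2_mmu_to_kvm(pgt->mmu)->arch.pkvm.handle;
}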

> +			struct rb_root				mappings;
> +			rwlock_t				mappings_lock;
> +		} pkvm;
> +	};
>  };
>  
>  /**
> diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
> index cd56acd9a842..84211d5daf87 100644
> --- a/arch/arm64/include/asm/kvm_pkvm.h
> +++ b/arch/arm64/include/asm/kvm_pkvm.h
> @@ -11,6 +11,12 @@
>  #include <linux/scatterlist.h>
>  #include <asm/kvm_pgtable.h>
>  
> +struct pkvm_mapping {
> +	u64 gfn;
> +	u64 pfn;
> +	struct rb_node node;

nit: make the node the first field.
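
i.e. something like:

struct pkvm_mapping {
	struct rb_node	node;
	u64		gfn;
	u64		pfn;
};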

> +};
> +
>  /* Maximum number of VMs that can co-exist under pKVM. */
>  #define KVM_MAX_PVMS 255
>  
> @@ -137,4 +143,26 @@ static inline size_t pkvm_host_sve_state_size(void)
>  			SVE_SIG_REGS_SIZE(sve_vq_from_vl(kvm_host_sve_max_vl)));
>  }
>  
> +static inline pkvm_handle_t pkvm_pgt_to_handle(struct kvm_pgtable *pgt)
> +{
> +	return pgt->pkvm.kvm->arch.pkvm.handle;
> +}
> +
> +int pkvm_pgtable_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops);
> +void pkvm_pgtable_destroy(struct kvm_pgtable *pgt);
> +int pkvm_pgtable_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +			   u64 phys, enum kvm_pgtable_prot prot,
> +			   void *mc, enum kvm_pgtable_walk_flags flags);
> +int pkvm_pgtable_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
> +int pkvm_pgtable_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
> +int pkvm_pgtable_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
> +bool pkvm_pgtable_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold);
> +int pkvm_pgtable_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
> +			     enum kvm_pgtable_walk_flags flags);
> +void pkvm_pgtable_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags);
> +int pkvm_pgtable_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc);
> +void pkvm_pgtable_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level);
> +kvm_pte_t *pkvm_pgtable_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
> +					enum kvm_pgtable_prot prot, void *mc, bool force_pte);
> +
>  #endif	/* __ARM64_KVM_PKVM_H__ */
> diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
> index 85117ea8f351..9c648a510671 100644
> --- a/arch/arm64/kvm/pkvm.c
> +++ b/arch/arm64/kvm/pkvm.c
> @@ -7,6 +7,7 @@
>  #include <linux/init.h>
>  #include <linux/kmemleak.h>
>  #include <linux/kvm_host.h>
> +#include <asm/kvm_mmu.h>
>  #include <linux/memblock.h>
>  #include <linux/mutex.h>
>  #include <linux/sort.h>
> @@ -268,3 +269,197 @@ static int __init finalize_pkvm(void)
>  	return ret;
>  }
>  device_initcall_sync(finalize_pkvm);
> +
> +static int cmp_mappings(struct rb_node *node, const struct rb_node *parent)
> +{
> +	struct pkvm_mapping *a = rb_entry(node, struct pkvm_mapping, node);
> +	struct pkvm_mapping *b = rb_entry(parent, struct pkvm_mapping, node);
> +
> +	if (a->gfn < b->gfn)
> +		return -1;
> +	if (a->gfn > b->gfn)
> +		return 1;
> +	return 0;
> +}
> +
> +static struct rb_node *find_first_mapping_node(struct rb_root *root, u64 gfn)
> +{
> +	struct rb_node *node = root->rb_node, *prev = NULL;
> +	struct pkvm_mapping *mapping;
> +
> +	while (node) {
> +		mapping = rb_entry(node, struct pkvm_mapping, node);
> +		if (mapping->gfn == gfn)
> +			return node;
> +		prev = node;
> +		node = (gfn < mapping->gfn) ? node->rb_left : node->rb_right;
> +	}
> +
> +	return prev;
> +}
> +
> +#define for_each_mapping_in_range(pgt, start_ipa, end_ipa, mapping, tmp)				\
> +	for (tmp = find_first_mapping_node(&pgt->pkvm.mappings, ((start_ipa) >> PAGE_SHIFT));		\
> +	     tmp && ({ mapping = rb_entry(tmp, struct pkvm_mapping, node); tmp = rb_next(tmp); 1; });)	\
> +		if (mapping->gfn < ((start_ipa) >> PAGE_SHIFT))						\
> +			continue;									\
> +		else if (mapping->gfn >= ((end_ipa) >> PAGE_SHIFT))					\
> +			break;										\
> +		else

Oh gawd... This makes my head spin, and it can't be said that I'm
averse to the most bizarre macro constructs. I came up with this:

diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 9c648a5106717..b1b8501cae8f7 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -298,13 +298,19 @@ static struct rb_node *find_first_mapping_node(struct rb_root *root, u64 gfn)
 	return prev;
 }
 
-#define for_each_mapping_in_range(pgt, start_ipa, end_ipa, mapping, tmp)				\
-	for (tmp = find_first_mapping_node(&pgt->pkvm.mappings, ((start_ipa) >> PAGE_SHIFT));		\
-	     tmp && ({ mapping = rb_entry(tmp, struct pkvm_mapping, node); tmp = rb_next(tmp); 1; });)	\
-		if (mapping->gfn < ((start_ipa) >> PAGE_SHIFT))						\
-			continue;									\
-		else if (mapping->gfn >= ((end_ipa) >> PAGE_SHIFT))					\
-			break;										\
+#define for_each_mapping_in_range(__pgt, __start, __end, __map)				 \
+	for (struct rb_node *__tmp = find_first_mapping_node(&__pgt->pkvm.mappings, 	 \
+							     ((__start) >> PAGE_SHIFT)); \
+	     __tmp && ({								 \
+			     __map = rb_entry(__tmp, struct pkvm_mapping, node); 	 \
+			     __tmp = rb_next(__tmp);					 \
+			     true;							 \
+		     });								 \
+	     )										 \
+		if (__map->gfn < ((__start) >> PAGE_SHIFT))				 \
+			continue;							 \
+		else if (__map->gfn >= ((__end) >> PAGE_SHIFT))				 \
+			break;								 \
 		else
 
 int pkvm_pgtable_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops)
@@ -371,11 +377,10 @@ int pkvm_pgtable_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
 	pkvm_handle_t handle = pkvm_pgt_to_handle(pgt);
 	struct pkvm_mapping *mapping;
-	struct rb_node *tmp;
 	int ret = 0;
 
 	write_lock(&pgt->pkvm.mappings_lock);
-	for_each_mapping_in_range(pgt, addr, addr + size, mapping, tmp) {
+	for_each_mapping_in_range(pgt, addr, addr + size, mapping) {
 		ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn);
 		if (WARN_ON(ret))
 			break;
@@ -392,11 +397,10 @@ int pkvm_pgtable_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
 	pkvm_handle_t handle = pkvm_pgt_to_handle(pgt);
 	struct pkvm_mapping *mapping;
-	struct rb_node *tmp;
 	int ret = 0;
 
 	read_lock(&pgt->pkvm.mappings_lock);
-	for_each_mapping_in_range(pgt, addr, addr + size, mapping, tmp) {
+	for_each_mapping_in_range(pgt, addr, addr + size, mapping) {
 		ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn);
 		if (WARN_ON(ret))
 			break;
@@ -409,10 +413,9 @@ int pkvm_pgtable_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
 int pkvm_pgtable_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
 	struct pkvm_mapping *mapping;
-	struct rb_node *tmp;
 
 	read_lock(&pgt->pkvm.mappings_lock);
-	for_each_mapping_in_range(pgt, addr, addr + size, mapping, tmp)
+	for_each_mapping_in_range(pgt, addr, addr + size, mapping)
 		__clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), PAGE_SIZE);
 	read_unlock(&pgt->pkvm.mappings_lock);
 
@@ -423,11 +426,10 @@ bool pkvm_pgtable_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size,
 {
 	pkvm_handle_t handle = pkvm_pgt_to_handle(pgt);
 	struct pkvm_mapping *mapping;
-	struct rb_node *tmp;
 	bool young = false;
 
 	read_lock(&pgt->pkvm.mappings_lock);
-	for_each_mapping_in_range(pgt, addr, addr + size, mapping, tmp)
+	for_each_mapping_in_range(pgt, addr, addr + size, mapping)
 		young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
 					   mkold);
 	read_unlock(&pgt->pkvm.mappings_lock);

Should be semantically equivalent, but I find it way more readable.

Also, maybe add a comment indicating why __tmp needs to be updated
*before* the body of the loop gets executed (to handle the case where
the mapping is freed from within the body).
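
Something like this on top of the macro, for instance:

	/*
	 * __tmp is advanced before the body of the loop runs, so that
	 * the body can safely erase and free __map without breaking
	 * the iteration.
	 */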

> +
> +int pkvm_pgtable_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops)
> +{
> +	pgt->pkvm.kvm		= kvm_s2_mmu_to_kvm(mmu);
> +	pgt->pkvm.mappings	= RB_ROOT;
> +	rwlock_init(&pgt->pkvm.mappings_lock);

We talked about this f2f: given that this lock is semantically
equivalent to the MMU lock, maybe just use that instead, upgrading it
to be taken for write when pKVM is enabled.

It should be easy enough to wrap that in helpers that DTRT, and all
this code could become devoid of any extra locking.
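
Something along these lines, maybe (names invented, completely
untested):

/*
 * The fault path would use these instead of a bare
 * read_lock(&kvm->mmu_lock): pKVM needs the write lock in order to
 * update its rb-tree from user_mem_abort().
 */
static inline void kvm_fault_lock(struct kvm *kvm)
{
	if (is_protected_kvm_enabled())
		write_lock(&kvm->mmu_lock);
	else
		read_lock(&kvm->mmu_lock);
}

static inline void kvm_fault_unlock(struct kvm *kvm)
{
	if (is_protected_kvm_enabled())
		write_unlock(&kvm->mmu_lock);
	else
		read_unlock(&kvm->mmu_lock);
}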

[...]

> +int pkvm_pgtable_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
> +			     enum kvm_pgtable_walk_flags flags)
> +{
> +	return kvm_call_hyp_nvhe(__pkvm_host_relax_guest_perms, addr >> PAGE_SHIFT, prot);
> +}
> +
> +void pkvm_pgtable_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags)
> +{
> +	WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT));
> +}
> +
> +void pkvm_pgtable_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
> +{
> +	WARN_ON(1);
> +}
> +
> +kvm_pte_t *pkvm_pgtable_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
> +					enum kvm_pgtable_prot prot, void *mc, bool force_pte)
> +{
> +	WARN_ON(1);
> +	return NULL;
> +}
> +
> +int pkvm_pgtable_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc)
> +{
> +	WARN_ON(1);
> +	return -EINVAL;
> +}

Maybe turn these warnings into their _ONCE versions. If we end up here,
seeing it once should be enough to realise we're toast.
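
i.e. something like:

int pkvm_pgtable_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}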

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.


