[RFC PATCH v3 2/6] kvm: gmem: add flag to remove memory from kernel direct map

Mike Day michael.day at amd.com
Thu Oct 31 06:56:05 PDT 2024



On 10/30/24 08:49, Patrick Roy wrote:
> Add a new flag, KVM_GMEM_NO_DIRECT_MAP, to KVM_CREATE_GUEST_MEMFD, which
> causes KVM to remove the folios backing this guest_memfd from the direct
> map after preparation/population. This flag is only exposed on
> architectures where direct map entries can be modified at runtime (the
> notable exception being ARM64 when the direct map is not set up at 4K
> granularity); otherwise, EOPNOTSUPP is returned.
> 
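For readers following along, a minimal sketch of how a VMM might exercise
the new flag, assuming a vm_fd obtained via KVM_CREATE_VM (error handling
elided, values illustrative):

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	static int create_no_direct_map_memfd(int vm_fd, __u64 size)
	{
		struct kvm_create_guest_memfd args = {
			.size  = size,			 /* must be > 0 and page-aligned */
			.flags = KVM_GMEM_NO_DIRECT_MAP, /* EOPNOTSUPP if the arch
							  * cannot modify the direct map */
		};

		/* returns a guest_memfd file descriptor on success */
		return ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
	}
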
> This patch also implements infrastructure for tracking (temporary)
> reinsertion of memory ranges into the direct map (more accurately: it
> allows recording that specific memory ranges deviate from the default
> direct map setup. Currently, the default setup is always "direct map
> entries removed", but it is trivial to extend this with some
> "default_state_for_vm_type" mechanism to cover the pKVM use case of
> memory starting off with direct map entries present). An xarray
> tracks this at page granularity, to be compatible with future
> hugepage use cases that might require subranges of hugetlb folios to
> have direct map entries restored. This xarray holds entries for each
> page whose direct map state deviates from the default, and holes
> for all pages whose direct map state matches the default, the idea being
> that these "deviations" will be rare.
> kvm_gmem_folio_configure_direct_map applies the configuration stored in
> the xarray to a given folio, and is called for each new gmem folio after
> preparation/population.
> 
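To illustrate the tracking scheme: a present xarray index means "this page
deviates from default_direct_map_state", a hole means "the default
applies". The helpers below are hypothetical (the real manipulation
routines only appear in the next patch of the series); xa_mk_value() keeps
the entries allocation-free:

	/* hypothetical: mark [first, last] as deviating from the default */
	static int gmem_mark_deviation(struct xarray *xa, pgoff_t first,
				       pgoff_t last)
	{
		pgoff_t i;
		int r;

		for (i = first; i <= last; i++) {
			r = xa_err(xa_store(xa, i, xa_mk_value(1), GFP_KERNEL));
			if (r)
				return r;
		}
		return 0;
	}

	/* hypothetical: revert [first, last] back to the default state */
	static void gmem_clear_deviation(struct xarray *xa, pgoff_t first,
					 pgoff_t last)
	{
		pgoff_t i;

		for (i = first; i <= last; i++)
			xa_erase(xa, i);
	}
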
> Storing direct map state in the gmem inode has two advantages:
> 1) We can track direct map state at page granularity even for huge
>     folios (see also Ackerley's series on hugetlbfs support in
>     guest_memfd [1])
> 2) We can pre-configure the direct map state of not-yet-faulted-in
>     folios. This would, for example, be needed if a VMM receives a
>     virtio buffer that the guest has requested it to fill. In this case,
>     the pages backing the guest physical address range of the buffer
>     might not be faulted in yet; they would be faulted in when the VMM
>     tries to write to them, at which point we would need to ensure that
>     direct map entries are present.
> 
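To make advantage 2) above concrete, a hypothetical kernel-side flow
(kvm_gmem_set_direct_map() does not exist in this patch; the actual
manipulation routines are only sketched in the following patch):

	/*
	 * Hypothetical: pre-mark the gmem offsets backing a virtio buffer
	 * as "direct map entries present" before any folio exists. Once
	 * the backing folios are faulted in and prepared,
	 * kvm_gmem_folio_configure_direct_map() applies this state.
	 */
	static int prepare_buffer_range(struct inode *gmem_inode,
					pgoff_t first, pgoff_t last)
	{
		return kvm_gmem_set_direct_map(gmem_inode, first, last,
					       true /* present */);
	}
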
> Note that this patch does not include operations for manipulating the
> direct map state xarray, or for changing the direct map state of already
> existing folios. These routines are sketched out in the following patch,
> although they are not needed in this initial patch series.
> 
> When a gmem folio is freed, it is reinserted into the direct map (and,
> failing this, marked as HWPOISON to avoid any other part of the kernel
> accidentally touching folios with incomplete direct map entries). The
> direct map configuration stored in the xarray is _not_ reset when the
> folio is freed (although this could be implemented by storing a
> reference to the xarray in the folio's private data instead of only in
> the inode).
> 
> [1]: https://lore.kernel.org/kvm/cover.1726009989.git.ackerleytng@google.com/
> 
> Signed-off-by: Patrick Roy <roypat at amazon.co.uk>
> ---
>   include/uapi/linux/kvm.h |   2 +
>   virt/kvm/guest_memfd.c   | 150 +++++++++++++++++++++++++++++++++++----
>   2 files changed, 137 insertions(+), 15 deletions(-)
> 
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 637efc0551453..81b0f4a236b8c 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1564,6 +1564,8 @@ struct kvm_create_guest_memfd {
>   	__u64 reserved[6];
>   };
>   
> +#define KVM_GMEM_NO_DIRECT_MAP			(1ULL << 0)
> +
>   #define KVM_PRE_FAULT_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
>   
>   struct kvm_pre_fault_memory {
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 47a9f68f7b247..50ffc2ad73eda 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
>   #include <linux/kvm_host.h>
>   #include <linux/pagemap.h>
>   #include <linux/anon_inodes.h>
> +#include <linux/set_memory.h>
>   
>   #include "kvm_mm.h"
>   
> @@ -13,6 +14,88 @@ struct kvm_gmem {
>   	struct list_head entry;
>   };
>   
> +struct kvm_gmem_inode_private {
> +	unsigned long long flags;
> +
> +	/*
> +	 * direct map configuration of the gmem instance this private data
> +	 * is associated with. present indices indicate a desired direct map
> +	 * configuration deviating from default_direct_map_state (e.g. if
> +	 * default_direct_map_state is false/not present, then the xarray
> +	 * contains all indices for which direct map entries are restored).
> +	 */
> +	struct xarray direct_map_state;
> +	bool default_direct_map_state;
> +};
> +
> +static bool kvm_gmem_test_no_direct_map(struct kvm_gmem_inode_private *gmem_priv)
> +{
> +	return ((unsigned long)gmem_priv->flags & KVM_GMEM_NO_DIRECT_MAP) != 0;
> +}
> +
> +/*
> + * Configure the direct map present/not present state of @folio based on
> + * the xarray stored in the associated inode's private data.
> + *
> + * Assumes the folio lock is held.
> + */
> +static int kvm_gmem_folio_configure_direct_map(struct folio *folio)
> +{
> +	struct inode *inode = folio_inode(folio);
> +	struct kvm_gmem_inode_private *gmem_priv = inode->i_private;
> +	bool default_state = gmem_priv->default_direct_map_state;
> +
> +	pgoff_t start = folio_index(folio);
> +	pgoff_t last = start + folio_nr_pages(folio) - 1;

	pgoff_t last = folio_next_index(folio) - 1;

thanks,
Mike

> +
> +	struct xarray *xa = &gmem_priv->direct_map_state;
> +	unsigned long index;
> +	void *entry;
> +
> +	pgoff_t range_start = start;
> +	unsigned long npages = 1;
> +	int r = 0;
> +
> +	if (!kvm_gmem_test_no_direct_map(gmem_priv))
> +		goto out;
> +
> +	r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio),
> +					 default_state);
> +	if (r)
> +		goto out;
> +
> +	if (!xa_find_after(xa, &range_start, last, XA_PRESENT))
> +		goto out_flush;
> +
> +	xa_for_each_range(xa, index, entry, range_start, last) {
> +		++npages;
> +
> +		if (index == range_start + npages)
> +			continue;
> +
> +		r = set_direct_map_valid_noflush(folio_file_page(folio, range_start), npages - 1,
> +						 !default_state);
> +		if (r)
> +			goto out_flush;
> +
> +		range_start = index;
> +		npages = 1;
> +	}
> +
> +	r = set_direct_map_valid_noflush(folio_file_page(folio, range_start), npages,
> +					 !default_state);
> +
> +out_flush:
> +	/*
> +	 * Use PG_private to track that this folio has had potentially some of
> +	 * its direct map entries modified, so that we can restore them in free_folio.
> +	 */
> +	folio_set_private(folio);
> +	flush_tlb_kernel_range(start, start + folio_size(folio));
> +out:
> +	return r;
> +}
> +
>   /**
>    * folio_file_pfn - like folio_file_page, but return a pfn.
>    * @folio: The folio which contains this index.
> @@ -42,9 +125,19 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
>   	return 0;
>   }
>   
> -static inline void kvm_gmem_mark_prepared(struct folio *folio)
> +
> +static inline int kvm_gmem_finalize_folio(struct folio *folio)
>   {
> +	int r = kvm_gmem_folio_configure_direct_map(folio);
> +
> +	/*
> +	 * Parts of the direct map might have been punched out, mark this folio
> +	 * as prepared even in the error case to avoid touching parts without
> +	 * direct map entries in a potential re-preparation.
> +	 */
>   	folio_mark_uptodate(folio);
> +
> +	return r;
>   }
>   
>   /*
> @@ -82,11 +175,10 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>   	index = ALIGN_DOWN(index, 1 << folio_order(folio));
>   	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
>   	if (!r)
> -		kvm_gmem_mark_prepared(folio);
> +		r = kvm_gmem_finalize_folio(folio);
>   
>   	return r;
>   }
> -
>   /*
>    * Returns a locked folio on success.  The caller is responsible for
>    * setting the up-to-date flag before the memory is mapped into the guest.
> @@ -249,6 +341,7 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>   static int kvm_gmem_release(struct inode *inode, struct file *file)
>   {
>   	struct kvm_gmem *gmem = file->private_data;
> +	struct kvm_gmem_inode_private *gmem_priv;
>   	struct kvm_memory_slot *slot;
>   	struct kvm *kvm = gmem->kvm;
>   	unsigned long index;
> @@ -279,13 +372,17 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
>   
>   	list_del(&gmem->entry);
>   
> +	gmem_priv = inode->i_private;
> +
>   	filemap_invalidate_unlock(inode->i_mapping);
>   
>   	mutex_unlock(&kvm->slots_lock);
> -
>   	xa_destroy(&gmem->bindings);
>   	kfree(gmem);
>   
> +	xa_destroy(&gmem_priv->direct_map_state);
> +	kfree(gmem_priv);
> +
>   	kvm_put_kvm(kvm);
>   
>   	return 0;
> @@ -357,24 +454,37 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
>   	return MF_DELAYED;
>   }
>   
> -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
>   static void kvm_gmem_free_folio(struct folio *folio)
>   {
> +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
>   	struct page *page = folio_page(folio, 0);
>   	kvm_pfn_t pfn = page_to_pfn(page);
>   	int order = folio_order(folio);
>   
>   	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
> -}
>   #endif
>   
> +	if (folio_test_private(folio)) {
> +		unsigned long start = (unsigned long)folio_address(folio);
> +
> +		int r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio),
> +						     true);
> +		/*
> +		 * There might be holes left in the folio, better make sure
> +		 * nothing tries to touch it again.
> +		 */
> +		if (r)
> +			folio_set_hwpoison(folio);
> +
> +		flush_tlb_kernel_range(start, start + folio_size(folio));
> +	}
> +}
> +
>   static const struct address_space_operations kvm_gmem_aops = {
>   	.dirty_folio = noop_dirty_folio,
>   	.migrate_folio	= kvm_gmem_migrate_folio,
>   	.error_remove_folio = kvm_gmem_error_folio,
> -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
>   	.free_folio = kvm_gmem_free_folio,
> -#endif
>   };
>   
>   static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
> @@ -401,6 +511,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>   {
>   	const char *anon_name = "[kvm-gmem]";
>   	struct kvm_gmem *gmem;
> +	struct kvm_gmem_inode_private *gmem_priv;
>   	struct inode *inode;
>   	struct file *file;
>   	int fd, err;
> @@ -409,11 +520,14 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>   	if (fd < 0)
>   		return fd;
>   
> +	err = -ENOMEM;
>   	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
> -	if (!gmem) {
> -		err = -ENOMEM;
> +	if (!gmem)
> +		goto err_fd;
> +
> +	gmem_priv = kzalloc(sizeof(*gmem_priv), GFP_KERNEL);
> +	if (!gmem_priv)
>   		goto err_fd;
> -	}
>   
>   	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
>   					 O_RDWR, NULL);
> @@ -427,7 +541,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>   	inode = file->f_inode;
>   	WARN_ON(file->f_mapping != inode->i_mapping);
>   
> -	inode->i_private = (void *)(unsigned long)flags;
> +	inode->i_private = gmem_priv;
>   	inode->i_op = &kvm_gmem_iops;
>   	inode->i_mapping->a_ops = &kvm_gmem_aops;
>   	inode->i_mode |= S_IFREG;
> @@ -442,6 +556,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>   	xa_init(&gmem->bindings);
>   	list_add(&gmem->entry, &inode->i_mapping->i_private_list);
>   
> +	xa_init(&gmem_priv->direct_map_state);
> +	gmem_priv->flags = flags;
> +
>   	fd_install(fd, file);
>   	return fd;
>   
> @@ -456,11 +573,14 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
>   {
>   	loff_t size = args->size;
>   	u64 flags = args->flags;
> -	u64 valid_flags = 0;
> +	u64 valid_flags = KVM_GMEM_NO_DIRECT_MAP;
>   
>   	if (flags & ~valid_flags)
>   		return -EINVAL;
>   
> +	if ((flags & KVM_GMEM_NO_DIRECT_MAP) && !can_set_direct_map())
> +		return -EOPNOTSUPP;
> +
>   	if (size <= 0 || !PAGE_ALIGNED(size))
>   		return -EINVAL;
>   
> @@ -679,7 +799,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>   			break;
>   		}
>   
> -		folio_unlock(folio);
>   		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
>   			(npages - i) < (1 << max_order));
>   
> @@ -695,7 +814,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>   		p = src ? src + i * PAGE_SIZE : NULL;
>   		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
>   		if (!ret)
> -			kvm_gmem_mark_prepared(folio);
> +			ret = kvm_gmem_finalize_folio(folio);
> +		folio_unlock(folio);
>   
>   put_folio_and_exit:
>   		folio_put(folio);


