[RFC PATCH v3 2/6] kvm: gmem: add flag to remove memory from kernel direct map
Mike Day
michael.day at amd.com
Thu Oct 31 06:56:05 PDT 2024
On 10/30/24 08:49, Patrick Roy wrote:
> Add a new flag, KVM_GMEM_NO_DIRECT_MAP, to KVM_CREATE_GUEST_MEMFD, which
> causes KVM to remove the folios backing this guest_memfd from the direct
> map after preparation/population. This flag is only exposed on
> architectures that can modify the direct map (the notable exception
> being ARM64 when the direct map is not set up at 4K granularity);
> otherwise EOPNOTSUPP is returned.
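
(For readers following along: a minimal userspace sketch of opting in to
this, untested, where vm_fd and guest_mem_size are placeholders for an
existing VM fd and a page-aligned size, would look roughly like

	struct kvm_create_guest_memfd args = {
		.size  = guest_mem_size,
		.flags = KVM_GMEM_NO_DIRECT_MAP,
	};
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);

	if (gmem_fd < 0 && errno == EOPNOTSUPP) {
		/* arch cannot modify the direct map; retry with .flags = 0 */
	}

with <linux/kvm.h>, <sys/ioctl.h> and <errno.h> included.)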
>
> This patch also implements infrastructure for tracking (temporary)
> reinsertion of memory ranges into the direct map (more accurately, it
> allows recording that specific memory ranges deviate from the default
> direct map setup. Currently the default setup is always "direct map
> entries removed", but it would be trivial to extend this with some
> "default_state_for_vm_type" mechanism to cover the pKVM use case of
> memory starting off with direct map entries present). An xarray
> tracks this at page granularity, to be compatible with future
> hugepage use cases that might require subranges of hugetlb folios to
> have direct map entries restored. The xarray holds an entry for each
> page whose direct map state deviates from the default, and a hole
> for each page whose direct map state matches the default, the idea
> being that these "deviations" will be rare.
> kvm_gmem_folio_configure_direct_map applies the configuration stored in
> the xarray to a given folio, and is called for each new gmem folio after
> preparation/population.
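
(To make the "holes mean default state" encoding concrete: recording that
a single page deviates from default_direct_map_state would presumably be
something along the lines of the hypothetical helper below; it is not part
of this patch, and the real manipulation routines are only sketched in the
next patch.

	/*
	 * Hypothetical: mark the page at @index as deviating from
	 * default_direct_map_state, e.g. "direct map entry restored"
	 * while the default is "removed".
	 */
	static int kvm_gmem_mark_deviation(struct kvm_gmem_inode_private *priv,
					   pgoff_t index)
	{
		return xa_err(xa_store(&priv->direct_map_state, index,
				       xa_mk_value(1), GFP_KERNEL));
	}

Erasing the entry again with xa_erase() would put that page back to the
default as far as kvm_gmem_folio_configure_direct_map is concerned.)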
>
> Storing direct map state in the gmem inode has two advantages:
> 1) We can track direct map state at page granularity even for huge
> folios (see also Ackerley's series on hugetlbfs support in
> guest_memfd [1])
> 2) We can pre-configure the direct map state of not-yet-faulted-in
> folios. This would for example be needed if a VMM receives a
> virtio buffer that the guest has asked it to fill. In this case,
> the pages backing the guest physical address range of the buffer
> might not be faulted in yet, and thus would only be faulted in when
> the VMM tries to write to them; at that point we need to ensure
> direct map entries are present.
>
> Note that this patch does not include operations for manipulating the
> direct map state xarray, or for changing the direct map state of
> already-existing folios. These routines are sketched out in the
> following patch, although they are not needed in this initial patch
> series.
>
> When a gmem folio is freed, it is reinserted into the direct map (and,
> failing that, marked as HWPOISON to avoid any other part of the kernel
> accidentally touching folios with incomplete direct map entries). The
> direct map configuration stored in the xarray is _not_ reset when the
> folio is freed (although this could be implemented by storing a
> reference to the xarray in the folio's private data instead of only in
> the inode).
>
> [1]: https://lore.kernel.org/kvm/cover.1726009989.git.ackerleytng@google.com/
>
> Signed-off-by: Patrick Roy <roypat at amazon.co.uk>
> ---
> include/uapi/linux/kvm.h | 2 +
> virt/kvm/guest_memfd.c | 150 +++++++++++++++++++++++++++++++++++----
> 2 files changed, 137 insertions(+), 15 deletions(-)
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 637efc0551453..81b0f4a236b8c 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1564,6 +1564,8 @@ struct kvm_create_guest_memfd {
> __u64 reserved[6];
> };
>
> +#define KVM_GMEM_NO_DIRECT_MAP (1ULL << 0)
> +
> #define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
>
> struct kvm_pre_fault_memory {
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 47a9f68f7b247..50ffc2ad73eda 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
> #include <linux/kvm_host.h>
> #include <linux/pagemap.h>
> #include <linux/anon_inodes.h>
> +#include <linux/set_memory.h>
>
> #include "kvm_mm.h"
>
> @@ -13,6 +14,88 @@ struct kvm_gmem {
> struct list_head entry;
> };
>
> +struct kvm_gmem_inode_private {
> + unsigned long long flags;
> +
> + /*
> + * direct map configuration of the gmem instance this private data
> + * is associated with. present indices indicate a desired direct map
> + * configuration deviating from default_direct_map_state (e.g. if
> + * default_direct_map_state is false/not present, then the xarray
> + * contains all indices for which direct map entries are restored).
> + */
> + struct xarray direct_map_state;
> + bool default_direct_map_state;
> +};
> +
> +static bool kvm_gmem_test_no_direct_map(struct kvm_gmem_inode_private *gmem_priv)
> +{
> + return ((unsigned long)gmem_priv->flags & KVM_GMEM_NO_DIRECT_MAP) != 0;
> +}
> +
> +/*
> + * Configure the direct map present/not present state of @folio based on
> + * the xarray stored in the associated inode's private data.
> + *
> + * Assumes the folio lock is held.
> + */
> +static int kvm_gmem_folio_configure_direct_map(struct folio *folio)
> +{
> + struct inode *inode = folio_inode(folio);
> + struct kvm_gmem_inode_private *gmem_priv = inode->i_private;
> + bool default_state = gmem_priv->default_direct_map_state;
> +
> + pgoff_t start = folio_index(folio);
> + pgoff_t last = start + folio_nr_pages(folio) - 1;
This could use the existing folio_next_index() helper instead:

	pgoff_t last = folio_next_index(folio) - 1;

thanks,
Mike
> +
> + struct xarray *xa = &gmem_priv->direct_map_state;
> + unsigned long index;
> + void *entry;
> +
> + pgoff_t range_start = start;
> + unsigned long npages = 1;
> + int r = 0;
> +
> + if (!kvm_gmem_test_no_direct_map(gmem_priv))
> + goto out;
> +
> + r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio),
> + default_state);
> + if (r)
> + goto out;
> +
> + if (!xa_find_after(xa, &range_start, last, XA_PRESENT))
> + goto out_flush;
> +
> + xa_for_each_range(xa, index, entry, range_start, last) {
> + ++npages;
> +
> + if (index == range_start + npages)
> + continue;
> +
> + r = set_direct_map_valid_noflush(folio_file_page(folio, range_start), npages - 1,
> + !default_state);
> + if (r)
> + goto out_flush;
> +
> + range_start = index;
> + npages = 1;
> + }
> +
> + r = set_direct_map_valid_noflush(folio_file_page(folio, range_start), npages,
> + !default_state);
> +
> +out_flush:
> + /*
> + * Use PG_private to track that this folio has had potentially some of
> + * its direct map entries modified, so that we can restore them in free_folio.
> + */
> + folio_set_private(folio);
> + flush_tlb_kernel_range(start, start + folio_size(folio));
> +out:
> + return r;
> +}
> +
> /**
> * folio_file_pfn - like folio_file_page, but return a pfn.
> * @folio: The folio which contains this index.
> @@ -42,9 +125,19 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
> return 0;
> }
>
> -static inline void kvm_gmem_mark_prepared(struct folio *folio)
> +
> +static inline int kvm_gmem_finalize_folio(struct folio *folio)
> {
> + int r = kvm_gmem_folio_configure_direct_map(folio);
> +
> + /*
> + * Parts of the direct map might have been punched out, mark this folio
> + * as prepared even in the error case to avoid touching parts without
> + * direct map entries in a potential re-preparation.
> + */
> folio_mark_uptodate(folio);
> +
> + return r;
> }
>
> /*
> @@ -82,11 +175,10 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> index = ALIGN_DOWN(index, 1 << folio_order(folio));
> r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
> if (!r)
> - kvm_gmem_mark_prepared(folio);
> + r = kvm_gmem_finalize_folio(folio);
>
> return r;
> }
> -
> /*
> * Returns a locked folio on success. The caller is responsible for
> * setting the up-to-date flag before the memory is mapped into the guest.
> @@ -249,6 +341,7 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
> static int kvm_gmem_release(struct inode *inode, struct file *file)
> {
> struct kvm_gmem *gmem = file->private_data;
> + struct kvm_gmem_inode_private *gmem_priv;
> struct kvm_memory_slot *slot;
> struct kvm *kvm = gmem->kvm;
> unsigned long index;
> @@ -279,13 +372,17 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
>
> list_del(&gmem->entry);
>
> + gmem_priv = inode->i_private;
> +
> filemap_invalidate_unlock(inode->i_mapping);
>
> mutex_unlock(&kvm->slots_lock);
> -
> xa_destroy(&gmem->bindings);
> kfree(gmem);
>
> + xa_destroy(&gmem_priv->direct_map_state);
> + kfree(gmem_priv);
> +
> kvm_put_kvm(kvm);
>
> return 0;
> @@ -357,24 +454,37 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
> return MF_DELAYED;
> }
>
> -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> static void kvm_gmem_free_folio(struct folio *folio)
> {
> +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> struct page *page = folio_page(folio, 0);
> kvm_pfn_t pfn = page_to_pfn(page);
> int order = folio_order(folio);
>
> kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
> -}
> #endif
>
> + if (folio_test_private(folio)) {
> + unsigned long start = (unsigned long)folio_address(folio);
> +
> + int r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio),
> + true);
> + /*
> + * There might be holes left in the folio, better make sure
> + * nothing tries to touch it again.
> + */
> + if (r)
> + folio_set_hwpoison(folio);
> +
> + flush_tlb_kernel_range(start, start + folio_size(folio));
> + }
> +}
> +
> static const struct address_space_operations kvm_gmem_aops = {
> .dirty_folio = noop_dirty_folio,
> .migrate_folio = kvm_gmem_migrate_folio,
> .error_remove_folio = kvm_gmem_error_folio,
> -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> .free_folio = kvm_gmem_free_folio,
> -#endif
> };
>
> static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
> @@ -401,6 +511,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> {
> const char *anon_name = "[kvm-gmem]";
> struct kvm_gmem *gmem;
> + struct kvm_gmem_inode_private *gmem_priv;
> struct inode *inode;
> struct file *file;
> int fd, err;
> @@ -409,11 +520,14 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> if (fd < 0)
> return fd;
>
> + err = -ENOMEM;
> gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
> - if (!gmem) {
> - err = -ENOMEM;
> + if (!gmem)
> + goto err_fd;
> +
> + gmem_priv = kzalloc(sizeof(*gmem_priv), GFP_KERNEL);
> + if (!gmem_priv)
> goto err_fd;
> - }
>
> file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
> O_RDWR, NULL);
> @@ -427,7 +541,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> inode = file->f_inode;
> WARN_ON(file->f_mapping != inode->i_mapping);
>
> - inode->i_private = (void *)(unsigned long)flags;
> + inode->i_private = gmem_priv;
> inode->i_op = &kvm_gmem_iops;
> inode->i_mapping->a_ops = &kvm_gmem_aops;
> inode->i_mode |= S_IFREG;
> @@ -442,6 +556,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> xa_init(&gmem->bindings);
> list_add(&gmem->entry, &inode->i_mapping->i_private_list);
>
> + xa_init(&gmem_priv->direct_map_state);
> + gmem_priv->flags = flags;
> +
> fd_install(fd, file);
> return fd;
>
> @@ -456,11 +573,14 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> {
> loff_t size = args->size;
> u64 flags = args->flags;
> - u64 valid_flags = 0;
> + u64 valid_flags = KVM_GMEM_NO_DIRECT_MAP;
>
> if (flags & ~valid_flags)
> return -EINVAL;
>
> + if ((flags & KVM_GMEM_NO_DIRECT_MAP) && !can_set_direct_map())
> + return -EOPNOTSUPP;
> +
> if (size <= 0 || !PAGE_ALIGNED(size))
> return -EINVAL;
>
> @@ -679,7 +799,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> break;
> }
>
> - folio_unlock(folio);
> WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> (npages - i) < (1 << max_order));
>
> @@ -695,7 +814,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> p = src ? src + i * PAGE_SIZE : NULL;
> ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> if (!ret)
> - kvm_gmem_mark_prepared(folio);
> + ret = kvm_gmem_finalize_folio(folio);
> + folio_unlock(folio);
>
> put_folio_and_exit:
> folio_put(folio);