[RFC PATCH v3 3/6] kvm: gmem: implement direct map manipulation routines
Mike Day
michael.day at amd.com
Thu Oct 31 07:19:41 PDT 2024
On 10/30/24 08:49, Patrick Roy wrote:
> Implement (as yet unused) routines for manipulating guest_memfd direct map
> state. This is largely for illustration purposes.
>
> kvm_gmem_set_direct_map allows manipulating arbitrary pgoff_t
> ranges, even if the covered memory has not yet been faulted in (in which
> case the requested direct map state is recorded in the xarray and will
> be applied by kvm_gmem_folio_configure_direct_map after the folio is
> faulted in and prepared/populated). This can be used to realize
> private/shared conversions on not-yet-faulted in memory, as discussed in
> the guest_memfd upstream call [1].
>
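For illustration, a shared->private conversion built on top of this could
boil down to something like the sketch below (gmem_make_range_private is a
made-up name, and the conversion plumbing is not part of this patch):

	/*
	 * Hypothetical sketch: remove the direct map entries for a range as
	 * part of a shared->private conversion, whether or not the range has
	 * been faulted in yet; false == "not present in the direct map".
	 */
	static int gmem_make_range_private(struct inode *inode,
					   pgoff_t start, pgoff_t end)
	{
		return kvm_gmem_set_direct_map(inode, start, end, false);
	}
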
> kvm_gmem_folio_set_direct_map allows manipulating the direct map entries
> for a gmem folio to which the caller already holds a reference (whereas
> kvm_gmem_set_direct_map needs to look up all folios intersecting the
> given pgoff range in the filemap first).
>
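So with a folio already in hand, a caller could flip a single page in place
without a filemap lookup, roughly (hedged sketch, error handling elided;
"index" stands for the page's offset in the file):

	/* Hypothetical one-page update on an already-held folio. */
	r = kvm_gmem_folio_set_direct_map(folio, index, index, false);
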
> The xa lock serializes calls to kvm_gmem_folio_set_direct_map and
> kvm_gmem_set_direct_map, while the read side
> (kvm_gmem_folio_configure_direct_map) is protected by RCU. This is
> sufficient to ensure consistency between the xarray and the folio's
> actual direct map state: kvm_gmem_folio_configure_direct_map is called
> only for freshly allocated folios, and before the folio lock is dropped
> for the first time, so it always makes its set_direct_map calls before
> either of kvm_gmem_[folio_]set_direct_map gets a chance. Even if a call
> to kvm_gmem_[folio_]set_direct_map races with the initial configuration,
> this guarantees a sort of "eventual consistency" between the xarray and
> the actual direct map configuration by the time
> kvm_gmem_[folio_]set_direct_map exits.
>
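The read side is not visible in this hunk; going by the description above,
the per-page xarray lookup in kvm_gmem_folio_configure_direct_map
presumably looks something like this sketch (variable names and structure
assumed, not the actual code):

	/* Sketch: presence of an entry means "not the default state". */
	rcu_read_lock();
	nondefault = xa_load(&gmem_priv->direct_map_state, index) != NULL;
	rcu_read_unlock();

	state = nondefault ? !gmem_priv->default_direct_map_state
			   : gmem_priv->default_direct_map_state;
	if (!state)
		r = set_direct_map_valid_noflush(page, 1, false);
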
> [1]: https://lore.kernel.org/kvm/4b49248b-1cf1-44dc-9b50-ee551e1671ac@redhat.com/
>
> Signed-off-by: Patrick Roy <roypat at amazon.co.uk>
> ---
> virt/kvm/guest_memfd.c | 125 +++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 125 insertions(+)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 50ffc2ad73eda..54387828dcc6a 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -96,6 +96,131 @@ static int kvm_gmem_folio_configure_direct_map(struct folio *folio)
> return r;
> }
>
> +/*
> + * Updates the range [@start, @end] in @gmem_priv's direct map state xarray to be @state,
> + * i.e. erasing entries in this range if @state is the default state, and creating
> + * entries otherwise.
> + *
> + * Assumes the xa_lock is held.
> + */
> +static int __kvm_gmem_update_xarray(struct kvm_gmem_inode_private *gmem_priv, pgoff_t start,
> +				    pgoff_t end, bool state)
> +{
> +	struct xarray *xa = &gmem_priv->direct_map_state;
> +	int r = 0;
> +
> +	/*
> +	 * Cannot use xa_store_range, as multi-indexes cannot easily
> +	 * be partially updated.
> +	 */
> +	for (pgoff_t index = start; index <= end; ++index) {
> +		if (state == gmem_priv->default_direct_map_state)
> +			__xa_erase(xa, index);
> +		else
> +			/* don't care _what_ we store in the xarray, only care about presence */
> +			r = xa_err(__xa_store(xa, index, gmem_priv, GFP_KERNEL));
> +
> +		if (r)
> +			goto out;
> +	}
> +
> +out:
> +	return r;
> +}
> +
> +static int __kvm_gmem_folio_set_direct_map(struct folio *folio, pgoff_t start, pgoff_t end,
> +					    bool state)
> +{
> +	unsigned long npages = end - start + 1;
> +	struct page *first_page = folio_file_page(folio, start);
> +
> +	int r = set_direct_map_valid_noflush(first_page, npages, state);
> +
> +	flush_tlb_kernel_range((unsigned long)page_address(first_page),
> +			       (unsigned long)page_address(first_page) +
> +			       npages * PAGE_SIZE);
> +	return r;
> +}
> +
> +/*
> + * Updates the direct map status for the given range from @start to @end (inclusive), returning
> + * -EINVAL if this range is not completely contained within @folio. Also updates the
> + * xarray stored in the private data of the inode @folio is attached to.
> + *
> + * Takes and drops the folio lock.
> + */
> +static __always_unused int kvm_gmem_folio_set_direct_map(struct folio *folio, pgoff_t start,
> +							  pgoff_t end, bool state)
> +{
> +	struct inode *inode = folio_inode(folio);
> +	struct kvm_gmem_inode_private *gmem_priv = inode->i_private;
> +	int r = -EINVAL;
> +
> +	if (!folio_contains(folio, start) || !folio_contains(folio, end))
> +		goto out;
> +
> +	xa_lock(&gmem_priv->direct_map_state);
> +	r = __kvm_gmem_update_xarray(gmem_priv, start, end, state);
> +	if (r)
> +		goto unlock_xa;
> +
> +	folio_lock(folio);
> +	r = __kvm_gmem_folio_set_direct_map(folio, start, end, state);
> +	folio_unlock(folio);
> +
> +unlock_xa:
> +	xa_unlock(&gmem_priv->direct_map_state);
> +out:
> +	return r;
> +}
> +
> +/*
> + * Updates the direct map status for the given range from @start to @end (inclusive)
> + * of @inode. Folios in this range have their direct map entries reconfigured,
> + * and the xarray in the @inode's private data is updated.
> + */
> +static __always_unused int kvm_gmem_set_direct_map(struct inode *inode, pgoff_t start,
> +						    pgoff_t end, bool state)
> +{
> +	struct kvm_gmem_inode_private *gmem_priv = inode->i_private;
> +	struct folio_batch fbatch;
> +	pgoff_t index = start;
> +	unsigned int count, i;
> +	int r = 0;
> +
> +	xa_lock(&gmem_priv->direct_map_state);
> +
> +	r = __kvm_gmem_update_xarray(gmem_priv, start, end, state);
> +	if (r)
> +		goto out;
> +
The "goto out" above jumps past the xa_unlock() and so leaks
gmem_priv->direct_map_state's lock on the error path; it needs to drop
the lock first:

	if (r) {
		xa_unlock(&gmem_priv->direct_map_state);
		goto out;
	}

thanks,
Mike
> +	folio_batch_init(&fbatch);
> +	while (filemap_get_folios(inode->i_mapping, &index, end, &fbatch) && !r) {
> +		count = folio_batch_count(&fbatch);
> +		for (i = 0; i < count; i++) {
> +			struct folio *folio = fbatch.folios[i];
> +			pgoff_t folio_start = max(folio_index(folio), start);
> +			pgoff_t folio_end = min(folio_index(folio) + folio_nr_pages(folio) - 1,
> +						end);
> +
> +			folio_lock(folio);
> +			r = __kvm_gmem_folio_set_direct_map(folio, folio_start,
> +							    folio_end, state);
> +			folio_unlock(folio);
> +
> +			if (r)
> +				break;
> +		}
> +		folio_batch_release(&fbatch);
> +	}
> +
> +	xa_unlock(&gmem_priv->direct_map_state);
> +out:
> +	return r;
> +}
> +
> /**
> * folio_file_pfn - like folio_file_page, but return a pfn.
> * @folio: The folio which contains this index.