[RFC PATCH v3 3/6] kvm: gmem: implement direct map manipulation routines

Mike Day michael.day at amd.com
Thu Oct 31 07:19:41 PDT 2024



On 10/30/24 08:49, Patrick Roy wrote:
> Implement (yet unused) routines for manipulating guest_memfd direct map
> state. This is largely for illustration purposes.
> 
> kvm_gmem_set_direct_map allows manipulating arbitrary pgoff_t
> ranges, even if the covered memory has not yet been faulted in (in which
> case the requested direct map state is recorded in the xarray and will
> be applied by kvm_gmem_folio_configure_direct_map after the folio is
> faulted in and prepared/populated). This can be used to realize
> private/shared conversions on not-yet-faulted-in memory, as discussed in
> the guest_memfd upstream call [1].
> 
> kvm_gmem_folio_set_direct_map allows manipulating the direct map entries
> for a gmem folio that the caller already holds a reference to (whereas
> kvm_gmem_set_direct_map needs to look up all folios intersecting the
> given pgoff range in the filemap first).
> 
> The xa lock serializes calls to kvm_gmem_folio_set_direct_map and
> kvm_gmem_set_direct_map, while the read side
> (kvm_gmem_folio_configure_direct_map) is protected by RCU. This is
> sufficient to ensure consistency between the xarray and the folio's
> actual direct map state, as kvm_gmem_folio_configure_direct_map is
> called only for freshly allocated folios, and before the folio lock is
> dropped for the first time, meaning kvm_gmem_folio_configure_direct_map
> always does its set_direct_map calls before either of
> kvm_gmem_[folio_]set_direct_map gets a chance. Even if a concurrent call
> to kvm_gmem_[folio_]set_direct_map happens, this ensures a sort of
> "eventual consistency" between xarray and actual direct map
> configuration by the time kvm_gmem_[folio_]set_direct_map exits.
> 
> [1]: https://lore.kernel.org/kvm/4b49248b-1cf1-44dc-9b50-ee551e1671ac@redhat.com/
> 
> Signed-off-by: Patrick Roy <roypat at amazon.co.uk>
> ---
>   virt/kvm/guest_memfd.c | 125 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 125 insertions(+)
> 
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 50ffc2ad73eda..54387828dcc6a 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -96,6 +96,131 @@ static int kvm_gmem_folio_configure_direct_map(struct folio *folio)
>   	return r;
>   }
>   
> +/*
> + * Updates the range [@start, @end] in @gmem_priv's direct map state xarray to be @state,
> + * e.g. erasing entries in this range if @state is the default state, and creating
> + * entries otherwise.
> + *
> + * Assumes the xa_lock is held.
> + */
> +static int __kvm_gmem_update_xarray(struct kvm_gmem_inode_private *gmem_priv, pgoff_t start,
> +				    pgoff_t end, bool state)
> +{
> +	struct xarray *xa = &gmem_priv->direct_map_state;
> +	int r = 0;
> +
> +	/*
> +	 * Cannot use xa_store_range, as multi-indexes cannot easily
> +	 * be partially updated.
> +	 */
> +	for (pgoff_t index = start; index < end; ++index) {
> +		if (state == gmem_priv->default_direct_map_state)
> +			__xa_erase(xa, index);
> +		else
> +			/* don't care _what_ we store in the xarray, only care about presence */
> +			__xa_store(xa, index, gmem_priv, GFP_KERNEL);
> +
> +		r = xa_err(xa);
> +		if (r)
> +			goto out;
> +	}
> +
> +out:
> +	return r;
> +}
> +
> +static int __kvm_gmem_folio_set_direct_map(struct folio *folio, pgoff_t start, pgoff_t end,
> +					   bool state)
> +{
> +	unsigned long npages = end - start + 1;
> +	struct page *first_page = folio_file_page(folio, start);
> +
> +	int r = set_direct_map_valid_noflush(first_page, npages, state);
> +
> +	flush_tlb_kernel_range((unsigned long)page_address(first_page),
> +			       (unsigned long)page_address(first_page) +
> +				       npages * PAGE_SIZE);
> +	return r;
> +}
> +
> +/*
> + * Updates the direct map status for the given range from @start to @end (inclusive), returning
> + * -EINVAL if this range is not completely contained within @folio. Also updates the
> + * xarray stored in the private data of the inode @folio is attached to.
> + *
> + * Takes and drops the folio lock.
> + */
> +static __always_unused int kvm_gmem_folio_set_direct_map(struct folio *folio, pgoff_t start,
> +								 pgoff_t end, bool state)
> +{
> +	struct inode *inode = folio_inode(folio);
> +	struct kvm_gmem_inode_private *gmem_priv = inode->i_private;
> +	int r = -EINVAL;
> +
> +	if (!folio_contains(folio, start) || !folio_contains(folio, end))
> +		goto out;
> +
> +	xa_lock(&gmem_priv->direct_map_state);
> +	r = __kvm_gmem_update_xarray(gmem_priv, start, end, state);
> +	if (r)
> +		goto unlock_xa;
> +
> +	folio_lock(folio);
> +	r = __kvm_gmem_folio_set_direct_map(folio, start, end, state);
> +	folio_unlock(folio);
> +
> +unlock_xa:
> +	xa_unlock(&gmem_priv->direct_map_state);
> +out:
> +	return r;
> +}
> +
> +/*
> + * Updates the direct map status for the given range from @start to @end (inclusive)
> + * of @inode. Folios in this range have their direct map entries reconfigured,
> + * and the xarray in the @inode's private data is updated.
> + */
> +static __always_unused int kvm_gmem_set_direct_map(struct inode *inode, pgoff_t start,
> +							   pgoff_t end, bool state)
> +{
> +	struct kvm_gmem_inode_private *gmem_priv = inode->i_private;
> +	struct folio_batch fbatch;
> +	pgoff_t index = start;
> +	unsigned int count, i;
> +	int r = 0;
> +
> +	xa_lock(&gmem_priv->direct_map_state);
> +
> +	r = __kvm_gmem_update_xarray(gmem_priv, start, end, state);
> +	if (r)
> +		goto out;
> +
	if (r) {
		xa_unlock(&gmem_priv->direct_map_state);
		goto out;
	}

thanks,

Mike

> +	folio_batch_init(&fbatch);
> +	while (!filemap_get_folios(inode->i_mapping, &index, end, &fbatch) && !r) {
> +		count = folio_batch_count(&fbatch);
> +		for (i = 0; i < count; i++) {
> +			struct folio *folio = fbatch.folios[i];
> +			pgoff_t folio_start = max(folio_index(folio), start);
> +			pgoff_t folio_end =
> +				min(folio_index(folio) + folio_nr_pages(folio),
> +				    end);
> +
> +			folio_lock(folio);
> +			r = __kvm_gmem_folio_set_direct_map(folio, folio_start,
> +							    folio_end, state);
> +			folio_unlock(folio);
> +
> +			if (r)
> +				break;
> +		}
> +		folio_batch_release(&fbatch);
> +	}
> +
> +	xa_unlock(&gmem_priv->direct_map_state);
> +out:
> +	return r;
> +}
> +
>   /**
>    * folio_file_pfn - like folio_file_page, but return a pfn.
>    * @folio: The folio which contains this index.



More information about the linux-riscv mailing list