[PATCH v7 2/8] KVM: Introduce __kvm_follow_pfn function

Zhi Wang zhi.wang.linux at gmail.com
Tue Jul 11 10:37:25 PDT 2023


On Wed, 5 Jul 2023 18:08:17 +0900
David Stevens <stevensd at chromium.org> wrote:

> On Wed, Jul 5, 2023 at 5:47___PM Zhi Wang <zhi.wang.linux at gmail.com> wrote:
> >
> > On Tue,  4 Jul 2023 16:50:47 +0900
> > David Stevens <stevensd at chromium.org> wrote:
> >  
> > > From: David Stevens <stevensd at chromium.org>
> > >
> > > Introduce __kvm_follow_pfn, which will replace __gfn_to_pfn_memslot.
> > > __kvm_follow_pfn refactors the old API's arguments into a struct and,
> > > where possible, combines the boolean arguments into a single flags
> > > argument.
> > >
> > > Signed-off-by: David Stevens <stevensd at chromium.org>
> > > ---
> > >  include/linux/kvm_host.h |  16 ++++
> > >  virt/kvm/kvm_main.c      | 171 ++++++++++++++++++++++-----------------
> > >  virt/kvm/kvm_mm.h        |   3 +-
> > >  virt/kvm/pfncache.c      |   8 +-
> > >  4 files changed, 122 insertions(+), 76 deletions(-)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 9d3ac7720da9..ef2763c2b12e 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -97,6 +97,7 @@
> > >  #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
> > >  #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2)
> > >  #define KVM_PFN_ERR_SIGPENDING       (KVM_PFN_ERR_MASK + 3)
> > > +#define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4)
> > >
> > >  /*
> > >   * error pfns indicate that the gfn is in slot but faild to
> > > @@ -1156,6 +1157,21 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
> > >  void kvm_release_page_clean(struct page *page);
> > >  void kvm_release_page_dirty(struct page *page);
> > >
> > > +struct kvm_follow_pfn {
> > > +     const struct kvm_memory_slot *slot;
> > > +     gfn_t gfn;
> > > +     unsigned int flags;
> > > +     bool atomic;
> > > +     /* Allow a read fault to create a writeable mapping. */
> > > +     bool allow_write_mapping;
> > > +
> > > +     /* Outputs of __kvm_follow_pfn */
> > > +     hva_t hva;
> > > +     bool writable;
> > > +};
> > > +
> > > +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll);
> > > +
> > >  kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
> > >  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
> > >                     bool *writable);
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index 371bd783ff2b..b13f22861d2f 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -2486,24 +2486,22 @@ static inline int check_user_page_hwpoison(unsigned long addr)
> > >   * true indicates success, otherwise false is returned.  It's also the
> > >   * only part that runs if we can in atomic context.
> > >   */
> > > -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > > -                         bool *writable, kvm_pfn_t *pfn)
> > > +static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > >  {
> > >       struct page *page[1];
> > > +     bool write_fault = foll->flags & FOLL_WRITE;
> > >
> > >       /*
> > >        * Fast pin a writable pfn only if it is a write fault request
> > >        * or the caller allows to map a writable pfn for a read fault
> > >        * request.
> > >        */
> > > -     if (!(write_fault || writable))
> > > +     if (!(write_fault || foll->allow_write_mapping))
> > >               return false;
> > >
> > > -     if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
> > > +     if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
> > >               *pfn = page_to_pfn(page[0]);
> > > -
> > > -             if (writable)
> > > -                     *writable = true;
> > > +             foll->writable = foll->allow_write_mapping;
> > >               return true;
> > >       }
> > >
> > > @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > >   * The slow path to get the pfn of the specified host virtual address,
> > >   * 1 indicates success, -errno is returned if error is detected.
> > >   */
> > > -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > > -                        bool interruptible, bool *writable, kvm_pfn_t *pfn)
> > > +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > >  {
> > > -     unsigned int flags = FOLL_HWPOISON;
> > > +     unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
> > >       struct page *page;
> > >       int npages;
> > >
> > >       might_sleep();
> > >
> > > -     if (writable)
> > > -             *writable = write_fault;
> > > -
> > > -     if (write_fault)
> > > -             flags |= FOLL_WRITE;
> > > -     if (async)
> > > -             flags |= FOLL_NOWAIT;
> > > -     if (interruptible)
> > > -             flags |= FOLL_INTERRUPTIBLE;
> > > -
> > > -     npages = get_user_pages_unlocked(addr, 1, &page, flags);
> > > +     npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
> > >       if (npages != 1)
> > >               return npages;
> > >
> > > +     foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > > +
> > >       /* map read fault as writable if possible */
> > > -     if (unlikely(!write_fault) && writable) {
> > > +     if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> > >               struct page *wpage;
> > >
> > > -             if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
> > > -                     *writable = true;
> > > +             if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) {
> > > +                     foll->writable = true;
> > >                       put_page(page);
> > >                       page = wpage;
> > >               }
> > > @@ -2572,23 +2561,23 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
> > >       return get_page_unless_zero(page);
> > >  }
> > >
> > > -static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > > -                            unsigned long addr, bool write_fault,
> > > -                            bool *writable, kvm_pfn_t *p_pfn)
> > > +static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll,
> > > +                            kvm_pfn_t *p_pfn)
> > >  {
> > >       kvm_pfn_t pfn;
> > >       pte_t *ptep;
> > >       spinlock_t *ptl;
> > > +     bool write_fault = foll->flags & FOLL_WRITE;
> > >       int r;
> > >
> > > -     r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> > > +     r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
> > >       if (r) {
> > >               /*
> > >                * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
> > >                * not call the fault handler, so do it here.
> > >                */
> > >               bool unlocked = false;
> > > -             r = fixup_user_fault(current->mm, addr,
> > > +             r = fixup_user_fault(current->mm, foll->hva,
> > >                                    (write_fault ? FAULT_FLAG_WRITE : 0),
> > >                                    &unlocked);
> > >               if (unlocked)
> > > @@ -2596,7 +2585,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > >               if (r)
> > >                       return r;
> > >
> > > -             r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> > > +             r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
> > >               if (r)
> > >                       return r;
> > >       }
> > > @@ -2606,8 +2595,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > >               goto out;
> > >       }
> > >
> > > -     if (writable)
> > > -             *writable = pte_write(*ptep);
> > > +     foll->writable = pte_write(*ptep) && foll->allow_write_mapping;
> > >       pfn = pte_pfn(*ptep);
> > >
> > >       /*
> > > @@ -2652,24 +2640,22 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > >   * 2): @write_fault = false && @writable, @writable will tell the caller
> > >   *     whether the mapping is writable.
> > >   */
> > > -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> > > -                  bool *async, bool write_fault, bool *writable)
> > > +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll)
> > >  {
> > >       struct vm_area_struct *vma;
> > >       kvm_pfn_t pfn;
> > >       int npages, r;
> > >
> > >       /* we can do it either atomically or asynchronously, not both */
> > > -     BUG_ON(atomic && async);
> > > +     BUG_ON(foll->atomic && (foll->flags & FOLL_NOWAIT));
> > >
> > > -     if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
> > > +     if (hva_to_pfn_fast(foll, &pfn))
> > >               return pfn;
> > >
> > > -     if (atomic)
> > > +     if (foll->atomic)
> > >               return KVM_PFN_ERR_FAULT;
> > >
> > > -     npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
> > > -                              writable, &pfn);
> > > +     npages = hva_to_pfn_slow(foll, &pfn);
> > >       if (npages == 1)
> > >               return pfn;
> > >       if (npages == -EINTR)
> > > @@ -2677,83 +2663,122 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> > >
> > >       mmap_read_lock(current->mm);
> > >       if (npages == -EHWPOISON ||
> > > -           (!async && check_user_page_hwpoison(addr))) {
> > > +           (!(foll->flags & FOLL_NOWAIT) && check_user_page_hwpoison(foll->hva))) {
> > >               pfn = KVM_PFN_ERR_HWPOISON;
> > >               goto exit;
> > >       }
> > >
> > >  retry:
> > > -     vma = vma_lookup(current->mm, addr);
> > > +     vma = vma_lookup(current->mm, foll->hva);
> > >
> > >       if (vma == NULL)
> > >               pfn = KVM_PFN_ERR_FAULT;
> > >       else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> > > -             r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
> > > +             r = hva_to_pfn_remapped(vma, foll, &pfn);
> > >               if (r == -EAGAIN)
> > >                       goto retry;
> > >               if (r < 0)
> > >                       pfn = KVM_PFN_ERR_FAULT;
> > >       } else {
> > > -             if (async && vma_is_valid(vma, write_fault))
> > > -                     *async = true;
> > > -             pfn = KVM_PFN_ERR_FAULT;
> > > +             if ((foll->flags & FOLL_NOWAIT) &&
> > > +                 vma_is_valid(vma, foll->flags & FOLL_WRITE))
> > > +                     pfn = KVM_PFN_ERR_NEEDS_IO;
> > > +             else
> > > +                     pfn = KVM_PFN_ERR_FAULT;
> > >       }
> > >  exit:
> > >       mmap_read_unlock(current->mm);
> > >       return pfn;
> > >  }
> > >
> > > -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> > > -                            bool atomic, bool interruptible, bool *async,
> > > -                            bool write_fault, bool *writable, hva_t *hva)
> > > +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll)
> > >  {
> > > -     unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
> > > -
> > > -     if (hva)
> > > -             *hva = addr;
> > > +     foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL,
> > > +                                   foll->flags & FOLL_WRITE);
> > >
> > > -     if (addr == KVM_HVA_ERR_RO_BAD) {
> > > -             if (writable)
> > > -                     *writable = false;
> > > +     if (foll->hva == KVM_HVA_ERR_RO_BAD)
> > >               return KVM_PFN_ERR_RO_FAULT;
> > > -     }
> > >  
> >
> > Can you explain why updating foll->writable = false (previously *writeable
> > = false) is omitted here?
> >
> > In the caller where the struct kvm_follow_pfn is initialized, e.g.
> > __gfn_to_pfn_memslot()/gfn_to_pfn_prot(), .writable is not initialized.
> > IIUC, they expect __kvm_follow_pfn() to update it and return .writable to
> > upper caller.
> >
> > As the one of the output, it would be better to initalize it either in the
> > caller or update it in __kvm_follow_pfn(). Or
> > __gfn_to_pfn_memslot()/gfn_to_pfn_prot() will return random data in the
> > stack to the caller via bool *writable. It doesn't sound nice.  
> 
> Entries omitted from an initializer are initialized to zero, so
> .writable does get initialized in all of the patches in this series
> via designated initializers. Although you're right that explicitly
> setting it to false is a good idea, in case someday someone adds a
> caller that doesn't use an initializer when declaring its
> kvm_follow_pfn.
>

Nice trick and nice to know that. :) Agreed on improving readability and
preventing a risk from the caller. 

> -David




More information about the linux-arm-kernel mailing list