[RFC PATCH v5 03/38] KVM: arm64: Implement the memslot lock/unlock functionality
Reiji Watanabe
reijiw at google.com
Mon Feb 14 23:46:38 PST 2022
Hi Alex,
On Wed, Nov 17, 2021 at 7:37 AM Alexandru Elisei
<alexandru.elisei at arm.com> wrote:
>
> Pin memory in the process address space and map it in the stage 2 tables as
> a result of userspace enabling the KVM_CAP_ARM_LOCK_USER_MEMORY_REGION
> capability; and unpin it from the process address space when the capability
> is used with the KVM_ARM_LOCK_USER_MEMORY_REGION_FLAGS_UNLOCK flag.
>
> The current implementation has two drawbacks which will be fixed in future
> patches:
>
> - The dcache maintenance is done when the memslot is locked, which means
> that it is possible that memory changes made by userspace after the ioctl
> completes won't be visible to a guest running with the MMU off.
>
> - Tag scrubbing is done when the memslot is locked. If the MTE capability
> is enabled after the ioctl, the guest will be able to access unsanitised
> pages. This is prevented by forbidding userspace to enable the MTE
> capability if any memslots are locked.
>
> Only PAGE_SIZE mappings are supported at stage 2.
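(Not a comment on the patch itself, just restating my understanding of the
userspace flow. The argument layout beyond args[0] being the slot, and the
'slot_id'/'lock_flags' names, are my assumptions rather than something I
took from this series:)

  struct kvm_enable_cap cap = {
          .cap     = KVM_CAP_ARM_LOCK_USER_MEMORY_REGION,
          .args[0] = slot_id,      /* memslot to lock */
          .args[1] = lock_flags,   /* e.g. request write access */
  };

  /* Pin the memslot's pages and map them at stage 2. */
  ioctl(vm_fd, KVM_ENABLE_CAP, &cap);

  /* Later: the same capability with
   * KVM_ARM_LOCK_USER_MEMORY_REGION_FLAGS_UNLOCK to unpin. */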
>
> Signed-off-by: Alexandru Elisei <alexandru.elisei at arm.com>
> ---
> Documentation/virt/kvm/api.rst | 4 +-
> arch/arm64/include/asm/kvm_host.h | 11 ++
> arch/arm64/kvm/arm.c | 22 +++-
> arch/arm64/kvm/mmu.c | 204 ++++++++++++++++++++++++++++--
> 4 files changed, 226 insertions(+), 15 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 16aa59eae3d9..0ac12a730013 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6979,8 +6979,8 @@ write permissions are specified for a memslot which logs dirty pages.
>
> Enabling this capability causes the memory pinned when locking the memslot
> specified in args[0] to be unpinned, or, optionally, all memslots to be
> -unlocked. The IPA range is not unmapped from stage 2.
> ->>>>>>> 56641eee289e (KVM: arm64: Add lock/unlock memslot user API)
> +unlocked. The IPA range is not unmapped from stage 2. It is considered an error
> +to attempt to unlock a memslot which is not locked.
>
> 8. Other capabilities.
> ======================
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 733621e41900..7fd70ad90c16 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -99,7 +99,18 @@ struct kvm_s2_mmu {
> struct kvm_arch *arch;
> };
>
> +#define KVM_MEMSLOT_LOCK_READ (1 << 0)
> +#define KVM_MEMSLOT_LOCK_WRITE (1 << 1)
> +#define KVM_MEMSLOT_LOCK_MASK 0x3
> +
> +struct kvm_memory_slot_page {
> + struct list_head list;
> + struct page *page;
> +};
> +
> struct kvm_arch_memory_slot {
> + struct kvm_memory_slot_page pages;
> + u32 flags;
> };
>
> struct kvm_arch {
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index d49905d18cee..b9b8b43835e3 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -106,6 +106,25 @@ static int kvm_lock_user_memory_region_ioctl(struct kvm *kvm,
> }
> }
>
> +static bool kvm_arm_has_locked_memslots(struct kvm *kvm)
> +{
> + struct kvm_memslots *slots = kvm_memslots(kvm);
> + struct kvm_memory_slot *memslot;
> + bool has_locked_memslots = false;
> + int idx;
> +
> + idx = srcu_read_lock(&kvm->srcu);
> + kvm_for_each_memslot(memslot, slots) {
> + if (memslot->arch.flags & KVM_MEMSLOT_LOCK_MASK) {
> + has_locked_memslots = true;
> + break;
> + }
> + }
> + srcu_read_unlock(&kvm->srcu, idx);
> +
> + return has_locked_memslots;
> +}
> +
> int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> struct kvm_enable_cap *cap)
> {
> @@ -120,7 +139,8 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> break;
> case KVM_CAP_ARM_MTE:
> mutex_lock(&kvm->lock);
> - if (!system_supports_mte() || kvm->created_vcpus) {
> + if (!system_supports_mte() || kvm->created_vcpus ||
> + (kvm_arm_lock_memslot_supported() && kvm_arm_has_locked_memslots(kvm))) {
> r = -EINVAL;
> } else {
> r = 0;
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index f65bcbc9ae69..b0a8e61315e4 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -72,6 +72,11 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
> }
>
> +static bool memslot_is_locked(struct kvm_memory_slot *memslot)
> +{
> + return memslot->arch.flags & KVM_MEMSLOT_LOCK_MASK;
> +}
> +
> /**
> * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
> * @kvm: pointer to kvm structure.
> @@ -769,6 +774,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
> if (map_size == PAGE_SIZE)
> return true;
>
> + /* Allow only PAGE_SIZE mappings for locked memslots */
> + if (memslot_is_locked(memslot))
> + return false;
> +
> size = memslot->npages * PAGE_SIZE;
>
> gpa_start = memslot->base_gfn << PAGE_SHIFT;
> @@ -1296,6 +1305,159 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> return ret;
> }
>
> +static int try_rlimit_memlock(unsigned long npages)
> +{
> + unsigned long lock_limit;
> + bool has_lock_cap;
> + int ret = 0;
> +
> + has_lock_cap = capable(CAP_IPC_LOCK);
> + if (has_lock_cap)
> + goto out;
> +
> + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> + mmap_read_lock(current->mm);
> + if (npages + current->mm->locked_vm > lock_limit)
> + ret = -ENOMEM;
> + mmap_read_unlock(current->mm);
> +
> +out:
> + return ret;
> +}
> +
> +static void unpin_memslot_pages(struct kvm_memory_slot *memslot, bool writable)
> +{
> + struct kvm_memory_slot_page *entry, *tmp;
> +
> + list_for_each_entry_safe(entry, tmp, &memslot->arch.pages.list, list) {
> + if (writable)
> + set_page_dirty_lock(entry->page);
> + unpin_user_page(entry->page);
> + kfree(entry);
> + }
> +}
Shouldn't this be done when the memslot is deleted?
(Or should a locked memslot be prevented from being deleted?)
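Just to illustrate what I mean (an untested sketch; assuming the arch flags
and the pins are still valid at deletion time, and that the helpers are
visible from kvm_arch_free_memslot()):

  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
  {
          /* Drop the pins and the locked_vm accounting when the
           * memslot itself goes away. */
          if (memslot_is_locked(slot))
                  unlock_memslot(kvm, slot);
  }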
> +
> +static int lock_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot,
> + u64 flags)
> +{
> + struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
> + struct kvm_memory_slot_page *page_entry;
> + bool writable = flags & KVM_ARM_LOCK_MEM_WRITE;
> + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> + struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
> + struct vm_area_struct *vma;
> + unsigned long npages = memslot->npages;
> + unsigned int pin_flags = FOLL_LONGTERM;
> + unsigned long i, hva, ipa, mmu_seq;
> + int ret;
> +
> + ret = try_rlimit_memlock(npages);
Even if the memory for the hva range described by the memslot is already
locked with mlock() or similar, is this check still needed?
> + if (ret)
> + return -ENOMEM;
> +
> + INIT_LIST_HEAD(&memslot->arch.pages.list);
> +
> + if (writable) {
> + prot |= KVM_PGTABLE_PROT_W;
> + pin_flags |= FOLL_WRITE;
The lock flag is just for the stage 2 mapping, correct?
I wonder if it is appropriate for KVM to set 'pin_flags', which is
passed to pin_user_pages(), based on the lock flag.
> + }
> +
> + hva = memslot->userspace_addr;
> + ipa = memslot->base_gfn << PAGE_SHIFT;
> +
> + mmu_seq = kvm->mmu_notifier_seq;
> + smp_rmb();
> +
> + for (i = 0; i < npages; i++) {
> + page_entry = kzalloc(sizeof(*page_entry), GFP_KERNEL);
> + if (!page_entry) {
> + unpin_memslot_pages(memslot, writable);
> + ret = -ENOMEM;
> + goto out_err;
Nit: It seems we can call unpin_memslot_pages() from 'out_err'
instead of calling it from each of the error cases.
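i.e. roughly something like this (sketch only, not tested):

          if (!page_entry) {
                  ret = -ENOMEM;
                  goto out_err;
          }
          /* ... same for the other failure cases ... */

  out_err:
          unpin_memslot_pages(memslot, writable);
          kvm_mmu_free_memory_cache(&cache);
          return ret;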
> + }
> +
> + mmap_read_lock(current->mm);
> + ret = pin_user_pages(hva, 1, pin_flags, &page_entry->page, &vma);
> + if (ret != 1) {
> + mmap_read_unlock(current->mm);
> + unpin_memslot_pages(memslot, writable);
> + ret = -ENOMEM;
> + goto out_err;
> + }
> + if (kvm_has_mte(kvm)) {
> + if (vma->vm_flags & VM_SHARED) {
> + ret = -EFAULT;
> + } else {
> + ret = sanitise_mte_tags(kvm,
> + page_to_pfn(page_entry->page),
> + PAGE_SIZE);
> + }
> + if (ret) {
> + mmap_read_unlock(current->mm);
> + goto out_err;
> + }
> + }
> + mmap_read_unlock(current->mm);
> +
> + ret = kvm_mmu_topup_memory_cache(&cache, kvm_mmu_cache_min_pages(kvm));
> + if (ret) {
> + unpin_memslot_pages(memslot, writable);
> + goto out_err;
> + }
> +
> + spin_lock(&kvm->mmu_lock);
> + if (mmu_notifier_retry(kvm, mmu_seq)) {
> + spin_unlock(&kvm->mmu_lock);
> + unpin_memslot_pages(memslot, writable);
> + ret = -EAGAIN;
> + goto out_err;
> + }
> +
> + ret = kvm_pgtable_stage2_map(pgt, ipa, PAGE_SIZE,
> + page_to_phys(page_entry->page),
> + prot, &cache);
> + spin_unlock(&kvm->mmu_lock);
> +
> + if (ret) {
> + kvm_pgtable_stage2_unmap(pgt, memslot->base_gfn << PAGE_SHIFT,
> + i << PAGE_SHIFT);
> + unpin_memslot_pages(memslot, writable);
> + goto out_err;
> + }
> + list_add(&page_entry->list, &memslot->arch.pages.list);
> +
> + hva += PAGE_SIZE;
> + ipa += PAGE_SIZE;
> + }
> +
> +
> + /*
> + * Even though we've checked the limit at the start, we can still exceed
> + * it if userspace locked other pages in the meantime or if the
> + * CAP_IPC_LOCK capability has been revoked.
> + */
> + ret = account_locked_vm(current->mm, npages, true);
> + if (ret) {
> + kvm_pgtable_stage2_unmap(pgt, memslot->base_gfn << PAGE_SHIFT,
> + npages << PAGE_SHIFT);
> + unpin_memslot_pages(memslot, writable);
> + goto out_err;
> + }
> +
> + memslot->arch.flags = KVM_MEMSLOT_LOCK_READ;
> + if (writable)
> + memslot->arch.flags |= KVM_MEMSLOT_LOCK_WRITE;
> +
> + kvm_mmu_free_memory_cache(&cache);
> +
> + return 0;
> +
> +out_err:
> + kvm_mmu_free_memory_cache(&cache);
> + return ret;
> +}
> +
> int kvm_mmu_lock_memslot(struct kvm *kvm, u64 slot, u64 flags)
> {
> struct kvm_memory_slot *memslot;
> @@ -1325,7 +1487,12 @@ int kvm_mmu_lock_memslot(struct kvm *kvm, u64 slot, u64 flags)
> goto out_unlock_slots;
> }
>
> - ret = -EINVAL;
> + if (memslot_is_locked(memslot)) {
> + ret = -EBUSY;
> + goto out_unlock_slots;
> + }
> +
> + ret = lock_memslot(kvm, memslot, flags);
>
> out_unlock_slots:
> mutex_unlock(&kvm->slots_lock);
> @@ -1335,11 +1502,22 @@ int kvm_mmu_lock_memslot(struct kvm *kvm, u64 slot, u64 flags)
> return ret;
> }
>
> +static void unlock_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
> +{
> + bool writable = memslot->arch.flags & KVM_MEMSLOT_LOCK_WRITE;
> + unsigned long npages = memslot->npages;
> +
> + unpin_memslot_pages(memslot, writable);
> + account_locked_vm(current->mm, npages, false);
> +
> + memslot->arch.flags &= ~KVM_MEMSLOT_LOCK_MASK;
> +}
What if the memslot was locked read-only, but the memslot itself
has read/write permissions set? Shouldn't the stage 2 mapping be
updated if KVM allows that scenario?
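For example, if KVM wants to allow that scenario, I guess the unlock path
would need to upgrade the existing read-only stage 2 mappings, roughly along
these lines (untested sketch, return values ignored):

          struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
          phys_addr_t ipa = memslot->base_gfn << PAGE_SHIFT;
          unsigned long i;

          spin_lock(&kvm->mmu_lock);
          for (i = 0; i < memslot->npages; i++, ipa += PAGE_SIZE)
                  kvm_pgtable_stage2_relax_perms(pgt, ipa,
                                                 KVM_PGTABLE_PROT_R |
                                                 KVM_PGTABLE_PROT_W);
          spin_unlock(&kvm->mmu_lock);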
Thanks,
Reiji
> +
> int kvm_mmu_unlock_memslot(struct kvm *kvm, u64 slot, u64 flags)
> {
> bool unlock_all = flags & KVM_ARM_UNLOCK_MEM_ALL;
> struct kvm_memory_slot *memslot;
> - int ret;
> + int ret = 0;
>
> if (!unlock_all && slot >= KVM_MEM_SLOTS_NUM)
> return -EINVAL;
> @@ -1347,18 +1525,20 @@ int kvm_mmu_unlock_memslot(struct kvm *kvm, u64 slot, u64 flags)
> mutex_lock(&kvm->slots_lock);
>
> if (unlock_all) {
> - ret = -EINVAL;
> - goto out_unlock_slots;
> - }
> -
> - memslot = id_to_memslot(kvm_memslots(kvm), slot);
> - if (!memslot) {
> - ret = -EINVAL;
> - goto out_unlock_slots;
> + kvm_for_each_memslot(memslot, kvm_memslots(kvm)) {
> + if (!memslot_is_locked(memslot))
> + continue;
> + unlock_memslot(kvm, memslot);
> + }
> + } else {
> + memslot = id_to_memslot(kvm_memslots(kvm), slot);
> + if (!memslot || !memslot_is_locked(memslot)) {
> + ret = -EINVAL;
> + goto out_unlock_slots;
> + }
> + unlock_memslot(kvm, memslot);
> }
>
> - ret = -EINVAL;
> -
> out_unlock_slots:
> mutex_unlock(&kvm->slots_lock);
> return ret;
> --
> 2.33.1
>
> _______________________________________________
> kvmarm mailing list
> kvmarm at lists.cs.columbia.edu
> https://lists.cs.columbia.edu/mailman/listinfo/kvmarm