[PATCH v6 12/26] KVM: arm/arm64: Move HYP IO VAs to the "idmap" range
James Morse
james.morse at arm.com
Thu Mar 15 12:09:44 PDT 2018
Hi Marc,
On 14/03/18 16:50, Marc Zyngier wrote:
> We so far mapped our HYP IO (which is essentially the GICv2 control
> registers) using the same method as for memory. It recently appeared
> that this is a bit unsafe:
>
> We compute the HYP VA using the kern_hyp_va helper, but that helper
> is only designed to deal with kernel VAs coming from the linear map,
> and not from the vmalloc region... This could in turn cause some bad
> aliasing between the two, amplified by the upcoming VA randomisation.
>
> A solution is to come up with our very own basic VA allocator for
> MMIO. Since half of the HYP address space only contains a single
> page (the idmap), we have plenty to borrow from. Let's use the idmap
> as a base, and allocate downwards from it. GICv2 now lives on the
> other side of the great VA barrier.
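To check my understanding of the allocation scheme: IO VAs are handed out
just below the idmap page, growing downwards, and each mapping keeps the
sub-page offset of its physical address. Something like this illustrative
sketch, where the names are mine, not the patch's:

	/*
	 * Illustrative only: a bump allocator growing downwards from
	 * the idmap page. io_base would start at hyp_idmap_start.
	 */
	static unsigned long io_base;

	static unsigned long alloc_io_va(size_t size, phys_addr_t phys)
	{
		/* Whole pages, preserving the sub-page offset of phys */
		size = PAGE_ALIGN(size + offset_in_page(phys));
		io_base -= size;
		return io_base + offset_in_page(phys);
	}

...with the overflow check further down catching the case where we run
out of room under the idmap.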
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 9946f8a38b26..a9577ecc3e6a 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -43,6 +43,9 @@ static unsigned long hyp_idmap_start;
> static unsigned long hyp_idmap_end;
> static phys_addr_t hyp_idmap_vector;
>
> +static DEFINE_MUTEX(io_map_lock);
> +static unsigned long io_map_base;
> +
> #define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t))
> #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
>
> @@ -518,27 +521,37 @@ static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
> *
> * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
> * therefore contains either mappings in the kernel memory area (above
> - * PAGE_OFFSET), or device mappings in the vmalloc range (from
> - * VMALLOC_START to VMALLOC_END).
> + * PAGE_OFFSET), or device mappings in the idmap range.
> *
> - * boot_hyp_pgd should only map two pages for the init code.
> + * boot_hyp_pgd should only map the idmap range, and is only used in
> + * the extended idmap case.
> */
> void free_hyp_pgds(void)
> {
> + pgd_t *id_pgd;
> +
> mutex_lock(&kvm_hyp_pgd_mutex);
>
> + id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
> +
> + if (id_pgd) {
> + mutex_lock(&io_map_lock);
This takes kvm_hyp_pgd_mutex then io_map_lock ...
> + /* In case we never called hyp_mmu_init() */
> + if (!io_map_base)
> + io_map_base = hyp_idmap_start;
> + unmap_hyp_idmap_range(id_pgd, io_map_base,
> + hyp_idmap_start + PAGE_SIZE - io_map_base);
> + mutex_unlock(&io_map_lock);
> + }
> +
> if (boot_hyp_pgd) {
> - unmap_hyp_idmap_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
> free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
> boot_hyp_pgd = NULL;
> }
>
> if (hyp_pgd) {
> - unmap_hyp_idmap_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
> unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
> (uintptr_t)high_memory - PAGE_OFFSET);
> - unmap_hyp_range(hyp_pgd, kern_hyp_va(VMALLOC_START),
> - VMALLOC_END - VMALLOC_START);
>
> free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
> hyp_pgd = NULL;
> @@ -747,11 +761,43 @@ int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
> return 0;
> }
> + mutex_lock(&io_map_lock);
> - start = kern_hyp_va((unsigned long)*kaddr);
> - end = kern_hyp_va((unsigned long)*kaddr + size);
> - ret = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, start, end,
> - __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
> + /*
> + * This assumes that we have enough space below the idmap
> + * page to allocate our VAs. If not, the check below will
> + * kick in. A potential alternative would be to detect that
> + * overflow and switch to an allocation above the idmap.
> + *
> + * The allocated size is always a multiple of PAGE_SIZE.
> + */
> + size = PAGE_ALIGN(size + offset_in_page(phys_addr));
> + base = io_map_base - size;
> +
> + /*
> + * Verify that BIT(VA_BITS - 1) hasn't been flipped by
> + * allocating the new area, as it would indicate we've
> + * overflowed the idmap/IO address range.
> + */
> + if ((base ^ io_map_base) & BIT(VA_BITS - 1)) {
> + ret = -ENOMEM;
> + goto out;
> + }
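The check reads right to me. A quick worked example with made-up numbers,
assuming VA_BITS is 48:

	io_map_base = 0x0000800000000000UL;	/* bit 47 set */
	base = io_map_base - PAGE_SIZE;		/* 0x00007ffffffff000, bit 47 clear */
	/* (base ^ io_map_base) & BIT(47) is non-zero: overflow caught */

so an allocation that walks off the bottom of the idmap half of the VA
space fails with -ENOMEM instead of silently aliasing the other half.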
> +
> + if (__kvm_cpu_uses_extended_idmap())
> + pgd = boot_hyp_pgd;
> +
> + ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
> + base, base + size,
> + __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
And here we hold io_map_lock, but __create_hyp_mappings() takes
kvm_hyp_pgd_mutex. There is another example of this in
create_hyp_exec_mappings(). I think making this the required order makes
sense: if you are mapping into the KVM-IO region, you take io_map_lock
before taking the pgd lock to write to your allocated location, as the
sketch below spells out. (free_hyp_pgds() is always going to need to take
both.)
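Spelled out, the order every IO-mapping path would then follow is
(illustrative only, not the patch's code):

	mutex_lock(&io_map_lock);		/* allocate the IO VA */
	base = io_map_base - size;		/* ...overflow check... */
	ret = __create_hyp_mappings(...);	/* takes kvm_hyp_pgd_mutex */
	if (!ret)
		io_map_base = base;
	mutex_unlock(&io_map_lock);

with free_hyp_pgds() taking the two locks in the same order.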
The deadlock is never going to happen in practice, but it generates a
lockdep splat[1]. The fixup diff[0] below fixes it for me.
> + if (ret)
> + goto out;
> +
> + *haddr = (void __iomem *)base + offset_in_page(phys_addr);
> + io_map_base = base;
> +
> +out:
> + mutex_unlock(&io_map_lock);
>
> if (ret) {
> iounmap(*kaddr);
Thanks,
James
[0]
--------------------------%<--------------------------
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 554ad5493e7d..f7642c96fc56 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -43,6 +43,7 @@ static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;
+/* Take io_map_lock before kvm_hyp_pgd_mutex */
static DEFINE_MUTEX(io_map_lock);
static unsigned long io_map_base;
@@ -530,18 +531,17 @@ void free_hyp_pgds(void)
{
pgd_t *id_pgd;
+ mutex_lock(&io_map_lock);
mutex_lock(&kvm_hyp_pgd_mutex);
id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
if (id_pgd) {
- mutex_lock(&io_map_lock);
/* In case we never called hyp_mmu_init() */
if (!io_map_base)
io_map_base = hyp_idmap_start;
unmap_hyp_idmap_range(id_pgd, io_map_base,
hyp_idmap_start + PAGE_SIZE - io_map_base);
- mutex_unlock(&io_map_lock);
}
if (boot_hyp_pgd) {
@@ -563,6 +563,7 @@ void free_hyp_pgds(void)
}
mutex_unlock(&kvm_hyp_pgd_mutex);
+ mutex_unlock(&io_map_lock);
}
static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
--------------------------%<--------------------------
[1]
[ 2.795711] kvm [1]: 8-bit VMID
[ 2.800456]
[ 2.801944] ======================================================
[ 2.808086] WARNING: possible circular locking dependency detected
[ 2.814230] 4.16.0-rc5-00027-gca4a12e92d2d-dirty #9471 Not tainted
[ 2.820371] ------------------------------------------------------
[ 2.826513] swapper/0/1 is trying to acquire lock:
[ 2.831274] (io_map_lock){+.+.}, at: [<00000000cf644f15>] free_hyp_pgds+0x44/0x148
[ 2.838914]
[ 2.838914] but task is already holding lock:
[ 2.844713] (kvm_hyp_pgd_mutex){+.+.}, at: [<00000000e786face>] free_hyp_pgds+0x20/0x148
[ 2.852863]
[ 2.852863] which lock already depends on the new lock.
[ 2.852863]
[ 2.860995]
[ 2.860995] the existing dependency chain (in reverse order) is:
[ 2.868433]
[ 2.868433] -> #1 (kvm_hyp_pgd_mutex){+.+.}:
[ 2.874174] __mutex_lock+0x70/0x8d0
[ 2.878249] mutex_lock_nested+0x1c/0x28
[ 2.882671] __create_hyp_mappings+0x48/0x3e0
[ 2.887523] __create_hyp_private_mapping+0xa4/0xf8
[ 2.892893] create_hyp_exec_mappings+0x28/0x58
[ 2.897918] kvm_arch_init+0x524/0x6e8
[ 2.902167] kvm_init+0x20/0x2d8
[ 2.905897] arm_init+0x1c/0x28
[ 2.909542] do_one_initcall+0x38/0x128
[ 2.913884] kernel_init_freeable+0x1e0/0x284
[ 2.918737] kernel_init+0x10/0x100
[ 2.922726] ret_from_fork+0x10/0x18
[ 2.926797]
[ 2.926797] -> #0 (io_map_lock){+.+.}:
[ 2.932020] lock_acquire+0x6c/0xb0
[ 2.936008] __mutex_lock+0x70/0x8d0
[ 2.940082] mutex_lock_nested+0x1c/0x28
[ 2.944502] free_hyp_pgds+0x44/0x148
[ 2.948663] teardown_hyp_mode+0x34/0x90
[ 2.953084] kvm_arch_init+0x3b8/0x6e8
[ 2.957332] kvm_init+0x20/0x2d8
[ 2.961061] arm_init+0x1c/0x28
[ 2.964704] do_one_initcall+0x38/0x128
[ 2.969039] kernel_init_freeable+0x1e0/0x284
[ 2.973891] kernel_init+0x10/0x100
[ 2.977880] ret_from_fork+0x10/0x18
[ 2.981950]
[ 2.981950] other info that might help us debug this:
[ 2.981950]
[ 2.989910] Possible unsafe locking scenario:
[ 2.989910]
[ 2.995796]        CPU0                    CPU1
[ 3.000297]        ----                    ----
[ 3.004798]   lock(kvm_hyp_pgd_mutex);
[ 3.008533]                                lock(io_map_lock);
[ 3.014252]                                lock(kvm_hyp_pgd_mutex);
[ 3.020488]   lock(io_map_lock);
[ 3.023705]
[ 3.023705] *** DEADLOCK ***
[ 3.023705]
[ 3.029597] 1 lock held by swapper/0/1:
[ 3.033409] #0: (kvm_hyp_pgd_mutex){+.+.}, at: [<00000000e786face>] free_hyp_pgds+0x20/0x148
[ 3.041993]
[ 3.041993] stack backtrace:
[ 3.046331] CPU: 4 PID: 1 Comm: swapper/0 Not tainted 4.16.0-rc5-00027-gca4a12e92d2d-dirty #9471
[ 3.055062] Hardware name: ARM Juno development board (r1) (DT)
[ 3.060945] Call trace:
[ 3.063383] dump_backtrace+0x0/0x190
[ 3.067027] show_stack+0x14/0x20
[ 3.070326] dump_stack+0xac/0xe8
[ 3.073626] print_circular_bug.isra.19+0x1d4/0x2e0
[ 3.078476] __lock_acquire+0x19f4/0x1a50
[ 3.082464] lock_acquire+0x6c/0xb0
[ 3.085934] __mutex_lock+0x70/0x8d0
[ 3.089491] mutex_lock_nested+0x1c/0x28
[ 3.093394] free_hyp_pgds+0x44/0x148
[ 3.097037] teardown_hyp_mode+0x34/0x90
[ 3.100940] kvm_arch_init+0x3b8/0x6e8
[ 3.104670] kvm_init+0x20/0x2d8
[ 3.107882] arm_init+0x1c/0x28
[ 3.111007] do_one_initcall+0x38/0x128
[ 3.114823] kernel_init_freeable+0x1e0/0x284
[ 3.119158] kernel_init+0x10/0x100
[ 3.122628] ret_from_fork+0x10/0x18