[RFC PATCH] hw/arm/virt: Add support for NUMA on ARM64

Mon Dec 8 05:49:36 PST 2014

On 2 December 2014 at 12:56, Shannon Zhao <zhaoshenglong at huawei.com> wrote:
> Add support for NUMA on ARM64. Tested successfully running a guest
> Linux kernel with the following patch applied:

I'm still hoping for review from somebody who better understands
how QEMU and NUMA should interact, but in the meantime some comments
at a code level:

>  hw/arm/boot.c |   25 ------------
>  hw/arm/virt.c |  120 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 113 insertions(+), 32 deletions(-)
>
> diff --git a/hw/arm/boot.c b/hw/arm/boot.c
> index 0014c34..c20fee4 100644
> --- a/hw/arm/boot.c
> +++ b/hw/arm/boot.c
> @@ -335,7 +335,6 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
>  {
>      void *fdt = NULL;
>      int size, rc;
> -    uint32_t acells, scells;
>
>      if (binfo->dtb_filename) {
>          char *filename;
> @@ -369,30 +368,6 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
>          return 0;
>      }
>
> -    acells = qemu_fdt_getprop_cell(fdt, "/", "#address-cells");
> -    scells = qemu_fdt_getprop_cell(fdt, "/", "#size-cells");
> -    if (acells == 0 || scells == 0) {
> -        fprintf(stderr, "dtb file invalid (#address-cells or #size-cells 0)\n");
> -        goto fail;
> -    }
> -
> -    if (scells < 2 && binfo->ram_size >= (1ULL << 32)) {
> -        /* This is user error so deserves a friendlier error message
> -         * than the failure of setprop_sized_cells would provide
> -         */
> -        fprintf(stderr, "qemu: dtb file not compatible with "
> -                "RAM size > 4GB\n");
> -        goto fail;
> -    }
> -
> -    rc = qemu_fdt_setprop_sized_cells(fdt, "/memory", "reg",
> -                                      acells, binfo->loader_start,
> -                                      scells, binfo->ram_size);
> -    if (rc < 0) {
> -        fprintf(stderr, "couldn't set /memory/reg\n");
> -        goto fail;
> -    }
> -

This patchset seems to be moving the initialization of a lot of
the dtb from this generic code into the virt board. That doesn't
seem right to me -- why would NUMA support be specific to the
virt board? I would expect support for this to be in the generic
code (possibly controlled with a board option for "I support NUMA").
As it is, your patch will break support for every other
board that uses device trees, because they rely on this code,
which you've deleted here.

>      if (binfo->kernel_cmdline && *binfo->kernel_cmdline) {
>          rc = qemu_fdt_setprop_string(fdt, "/chosen", "bootargs",
>                                       binfo->kernel_cmdline);
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 78f618d..9d18a91 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -170,8 +170,6 @@ static void create_fdt(VirtBoardInfo *vbi)
>       * to fill in necessary properties later
>       */
>      qemu_fdt_add_subnode(fdt, "/chosen");
> -    qemu_fdt_add_subnode(fdt, "/memory");
> -    qemu_fdt_setprop_string(fdt, "/memory", "device_type", "memory");
>
>      /* Clock node, for the benefit of the UART. The kernel device tree
>       * binding documentation claims the PL011 node clock properties are
> @@ -235,6 +233,116 @@ static void fdt_add_psci_node(const VirtBoardInfo *vbi)
>      qemu_fdt_setprop_cell(fdt, "/psci", "migrate", migrate_fn);
>  }
>
> +static int virt_memory_init(MachineState *machine,
> +                            MemoryRegion *system_memory,
> +                            const VirtBoardInfo *vbi)
> +{
> +    MemoryRegion *ram = g_new(MemoryRegion, 1);
> +    CPUState *cpu;
> +    int min_cpu = 0, max_cpu = 0;
> +    int i, j, count, len;
> +    uint32_t acells, scells;
> +
> +    acells = qemu_fdt_getprop_cell(vbi->fdt, "/", "#address-cells");
> +    scells = qemu_fdt_getprop_cell(vbi->fdt, "/", "#size-cells");
> +    if (acells == 0 || scells == 0) {
> +        fprintf(stderr, "dtb file invalid (#address-cells or #size-cells 0)\n");
> +        goto fail;
> +    }
> +
> +    if (scells < 2 && machine->ram_size >= (1ULL << 32)) {
> +        /* This is user error so deserves a friendlier error message
> +         * than the failure of setprop_sized_cells would provide
> +         */
> +        fprintf(stderr, "qemu: dtb file not compatible with "
> +                "RAM size > 4GB\n");
> +        goto fail;
> +    }
> +
> +    memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram",
> +                                         machine->ram_size);
> +    memory_region_add_subregion(system_memory, vbi->memmap[VIRT_MEM].base, ram);
> +
> +    hwaddr mem_base = vbi->memmap[VIRT_MEM].base;
> +
> +    if (!nb_numa_nodes) {
> +        qemu_fdt_add_subnode(vbi->fdt, "/memory");
> +        qemu_fdt_setprop_string(vbi->fdt, "/memory", "device_type", "memory");
> +        qemu_fdt_setprop_sized_cells(vbi->fdt, "/memory", "reg",
> +                                      acells, mem_base,
> +                                      scells, machine->ram_size);
> +        return 0;
> +    }
> +
> +    qemu_fdt_add_subnode(vbi->fdt, "/numa-map");
> +    qemu_fdt_setprop_cell(vbi->fdt, "/numa-map", "#address-cells", 0x2);
> +    qemu_fdt_setprop_cell(vbi->fdt, "/numa-map", "#size-cells", 0x1);
> +    qemu_fdt_setprop_cell(vbi->fdt, "/numa-map", "#node-count", 0x2);
> +
> +    uint64_t *mem_map = g_malloc0(nb_numa_nodes * sizeof(uint64_t) * 6);
> +    uint64_t *cpu_map = g_malloc0(nb_numa_nodes * sizeof(uint64_t) * 6);
> +    uint64_t *node_matrix = g_malloc0(nb_numa_nodes * nb_numa_nodes
> +                                      * sizeof(uint64_t) * 6);

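g_new0() is usually better than g_malloc0(count * sizeof(thing)):
it returns a pointer of the right type and takes care of overflow
when calculating the allocation size.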

> +
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        uint64_t buffer[6] = {1, 0x00000000, 1, mem_base, 1, i};
> +        char *nodename;
> +        nodename = g_strdup_printf("/memory@%" PRIx64, mem_base);
> +        qemu_fdt_add_subnode(vbi->fdt, nodename);
> +        qemu_fdt_setprop_string(vbi->fdt, nodename, "device_type", "memory");
> +        qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg",
> +                                      acells, mem_base,
> +                                      scells, numa_info[i].node_mem-1);
> +        memcpy(mem_map + 6 * i, buffer, 6 * sizeof(*buffer));

Why do we create a local buffer array and then immediately do nothing
with it but memcpy() it somewhere else? I suspect this code would
be clearer if you defined a type (possibly a struct) where you're
currently using raw uint64_t array[6]. Then you could make your
mem_map, cpu_map and node_matrix variables have more sensible types
than raw uint64_t*, and you could just set the right element in them
using C array syntax rather than these memcpy calls.

> +        mem_base += numa_info[i].node_mem;
> +        g_free(nodename);
> +    }
> +    qemu_fdt_setprop_sized_cells_from_array(vbi->fdt, "/numa-map", "mem-map",
> +                        (nb_numa_nodes * 6) / 2, mem_map);

Lots of unnecessary hardcoded 6s here (and above and below).

> +
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        CPU_FOREACH(cpu) {
> +            if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
> +                if (cpu->cpu_index < min_cpu) {
> +                    min_cpu = cpu->cpu_index;
> +                }
> +                if (cpu->cpu_index > max_cpu) {
> +                    max_cpu = cpu->cpu_index;
> +                }
> +            }
> +        }
> +
> +        uint64_t buffer[6] = {1, min_cpu, 1, max_cpu, 1, i};
> +        memcpy(cpu_map + 6 * i, buffer, 6 * sizeof(*buffer));
> +        min_cpu = max_cpu + 1;
> +    }
> +
> +    qemu_fdt_setprop_sized_cells_from_array(vbi->fdt, "/numa-map", "cpu-map",
> +                        (nb_numa_nodes * 6) / 2, cpu_map);
> +    count = 0;
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        for (j = 0; j < nb_numa_nodes; j++) {
> +            len = 20;
> +            if (i == j) {
> +                len = 10;
> +            }
> +            uint64_t buffer[6] = {1, i, 1, j, 1, len};
> +            memcpy(node_matrix + 6 * count, buffer, 6 * sizeof(*buffer));
> +            count++;
> +        }
> +    }
> +    qemu_fdt_setprop_sized_cells_from_array(vbi->fdt, "/numa-map",
> +        "node-matrix", (nb_numa_nodes * nb_numa_nodes * 6) / 2, node_matrix);
> +
> +    g_free(mem_map);
> +    g_free(cpu_map);
> +    g_free(node_matrix);
> +
> +    return 0;
> +fail:
> +    return -1;
> +}
> +
>  static void fdt_add_timer_nodes(const VirtBoardInfo *vbi)
>  {
>      /* Note that on A15 h/w these interrupts are level-triggered,
> @@ -532,7 +640,6 @@ static void machvirt_init(MachineState *machine)
>      qemu_irq pic[NUM_IRQS];
>      MemoryRegion *sysmem = get_system_memory();
>      int n;
> -    MemoryRegion *ram = g_new(MemoryRegion, 1);
>      const char *cpu_model = machine->cpu_model;
>      VirtBoardInfo *vbi;
>
> @@ -585,10 +692,9 @@ static void machvirt_init(MachineState *machine)
>      fdt_add_cpu_nodes(vbi);
>      fdt_add_psci_node(vbi);
>
> -    memory_region_init_ram(ram, NULL, "mach-virt.ram", machine->ram_size,
> -                           &error_abort);
> -    vmstate_register_ram_global(ram);
> -    memory_region_add_subregion(sysmem, vbi->memmap[VIRT_MEM].base, ram);
> +    if (virt_memory_init(machine, sysmem, vbi) < 0) {
> +        exit(1);
> +    }
>
>      create_flash(vbi);
>
> --
> 1.7.1

thanks
-- PMM