[PATCH v2 3/4] arm64, acpi, numa: NUMA support based on SRAT and SLIT

Lorenzo Pieralisi lorenzo.pieralisi at arm.com
Fri Dec 18 08:23:36 PST 2015


On Tue, Nov 17, 2015 at 11:55:32PM +0530, Ganapatrao Kulkarni wrote:

[...]

> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index 7987763..555c4a5 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -42,6 +42,7 @@ arm64-obj-$(CONFIG_PCI)			+= pci.o
>  arm64-obj-$(CONFIG_ARMV8_DEPRECATED)	+= armv8_deprecated.o
>  arm64-obj-$(CONFIG_ACPI)		+= acpi.o
>  arm64-obj-$(CONFIG_OF_NUMA)		+= of_numa.o
> +arm64-obj-$(CONFIG_ACPI_NUMA)		+= acpi_numa.o

Isn't it better to merge ACPI and DT support in one file (a bit like
what we did for smp.c) to remove some of this iffdeffery ?

>  
>  obj-y					+= $(arm64-obj-y) vdso/
>  obj-m					+= $(arm64-obj-m)
> diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c
> new file mode 100644
> index 0000000..8aee205
> --- /dev/null
> +++ b/arch/arm64/kernel/acpi_numa.c
> @@ -0,0 +1,215 @@
> +/*
> + * ACPI 5.1 based NUMA setup for ARM64
> + * Lots of code was borrowed from arch/x86/mm/srat.c
> + *
> + * Copyright 2004 Andi Kleen, SuSE Labs.
> + * Copyright (C) 2013-2014, Linaro Ltd.
> + *		Author: Hanjun Guo <hanjun.guo at linaro.org>
> + *
> + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
> + *
> + * Called from acpi_numa_init while reading the SRAT and SLIT tables.
> + * Assumes all memory regions belonging to a single proximity domain
> + * are in one chunk. Holes between them will be included in the node.
> + */
> +
> +#define pr_fmt(fmt) "ACPI: NUMA: " fmt
> +
> +#include <linux/acpi.h>
> +#include <linux/bitmap.h>
> +#include <linux/bootmem.h>
> +#include <linux/kernel.h>
> +#include <linux/mm.h>
> +#include <linux/memblock.h>
> +#include <linux/mmzone.h>
> +#include <linux/module.h>
> +#include <linux/topology.h>
> +
> +#include <acpi/processor.h>
> +#include <asm/numa.h>
> +
> +int acpi_numa __initdata;
> +
> +static __init int setup_node(int pxm)
> +{
> +	return acpi_map_pxm_to_node(pxm);
> +}

This function is not that useful given how it is used in the patch.

> +
> +static __init void bad_srat(void)
> +{
> +	pr_err("SRAT not used.\n");
> +	acpi_numa = -1;
> +}
> +
> +static __init inline int srat_disabled(void)
> +{
> +	return acpi_numa < 0;
> +}
> +
> +/*
> + * Callback for SLIT parsing.
> + * It will get the distance information presented by SLIT
> + * and init the distance matrix of numa nodes
> + */
> +void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
> +{
> +	int i, j;
> +
> +	for (i = 0; i < slit->locality_count; i++) {
> +		const int from_node = pxm_to_node(i);
> +
> +		if (from_node == NUMA_NO_NODE)
> +			continue;
> +
> +		for (j = 0; j < slit->locality_count; j++) {
> +			const int to_node = pxm_to_node(j);
> +
> +			if (to_node == NUMA_NO_NODE)
> +				continue;
> +
> +			pr_info("SLIT: Distance[%d][%d] = %d\n",
> +					from_node, to_node,
> +					slit->entry[
> +					slit->locality_count * i + j]);
> +			numa_set_distance(from_node, to_node,
> +				slit->entry[slit->locality_count * i + j]);
> +		}
> +	}
> +}

This function is an x86 copy'n'paste. ia64 just requires this callback
to save a slit table pointer (that can be moved to generic code and it is
initdata anyway), so my question is, do we really need to duplicate it ?

> +static int __init get_mpidr_in_madt(int acpi_id, u64 *mpidr)
> +{

Looks familiar. I guess you can't reuse the code in drivers/acpi
(acpi_map_cpuid()) only because that implies permanent table mappings to be
in place and you need to call this function before acpi_gbl_permanent_mmap
is set ?

> +	unsigned long madt_end, entry;
> +	struct acpi_table_madt *madt;
> +	acpi_size tbl_size;
> +
> +	if (ACPI_FAILURE(acpi_get_table_with_size(ACPI_SIG_MADT, 0,
> +			(struct acpi_table_header **)&madt, &tbl_size)))
> +		return -ENODEV;
> +
> +	entry = (unsigned long)madt;
> +	madt_end = entry + madt->header.length;
> +
> +	/* Parse all entries looking for a match. */
> +	entry += sizeof(struct acpi_table_madt);
> +	while (entry + sizeof(struct acpi_subtable_header) < madt_end) {
> +		struct acpi_subtable_header *header =
> +			(struct acpi_subtable_header *)entry;
> +
> +		if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) {
> +			struct acpi_madt_generic_interrupt *gicc =
> +				container_of(header,
> +				struct acpi_madt_generic_interrupt, header);
> +
> +			if ((gicc->flags & ACPI_MADT_ENABLED) &&
> +			    (gicc->uid == acpi_id)) {
> +				*mpidr = gicc->arm_mpidr;
> +				early_acpi_os_unmap_memory(madt, tbl_size);
> +				return 0;
> +			}
> +		}
> +		entry += header->length;
> +	}
> +
> +	early_acpi_os_unmap_memory(madt, tbl_size);
> +	return -ENODEV;
> +}
> +
> +/* Callback for Proximity Domain -> ACPI processor UID mapping */
> +void __init acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa)
> +{
> +	int pxm, node;
> +	u64 mpidr;
> +	static int cpus_in_srat;
> +
> +	if (srat_disabled())
> +		return;
> +
> +	if (pa->header.length < sizeof(struct acpi_srat_gicc_affinity)) {
> +		bad_srat();
> +		return;
> +	}
> +
> +	if (!(pa->flags & ACPI_SRAT_GICC_ENABLED))
> +		return;
> +
> +	if (cpus_in_srat >= NR_CPUS) {
> +		pr_warn_once("SRAT: cpu_to_node_map[%ld] is too small, may not be able to use all cpus\n",
> +			     NR_CPUS);
> +		return;
> +	}
> +
> +	pxm = pa->proximity_domain;
> +	node = setup_node(pxm);
> +
> +	if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) {
> +		pr_err("SRAT: Too many proximity domains %d\n", pxm);
> +		bad_srat();
> +		return;
> +	}
> +
> +	if (get_mpidr_in_madt(pa->acpi_processor_uid, &mpidr)) {
> +		pr_warn("SRAT: PXM %d with ACPI ID %d has no valid MPIDR in MADT\n",
> +			pxm, pa->acpi_processor_uid);
> +		bad_srat();
> +		return;
> +	}
> +
> +	cpu_to_node_map[cpus_in_srat] = node;

This looks wrong. cpus_in_srat is a logical index, but I do not see why
it has to be sequential. You retrieve the mpidr for a given SRAT entry
and with that value you should retrieve the cpu_logical index that
corresponds to it (get_logical_index()), or maybe I am missing something
from the ACPI specs that enforce a SRAT entries ordering, on which we
should not rely upon anyway.

> +	node_set(node, numa_nodes_parsed);
> +	acpi_numa = 1;

What's the point if you are checking for < 0 in srat_disabled() ?

> +	cpus_in_srat++;
> +	pr_info("SRAT: PXM %d -> MPIDR 0x%Lx -> Node %d cpu %d\n",
> +			pxm, mpidr, node, cpus_in_srat);
> +}
> +
> +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
> +int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
> +{
> +	u64 start, end;
> +	int node, pxm;
> +
> +	if (srat_disabled())
> +		return -EINVAL;
> +
> +	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
> +		bad_srat();
> +		return -EINVAL;
> +	}
> +
> +	start = ma->base_address;
> +	end = start + ma->length;
> +	pxm = ma->proximity_domain;
> +
> +	node = setup_node(pxm);
> +
> +	if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) {
> +		pr_err("SRAT: Too many proximity domains.\n");
> +		bad_srat();
> +		return -EINVAL;
> +	}
> +
> +	pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
> +		node, pxm,
> +		(unsigned long long) start, (unsigned long long) end - 1);
> +
> +	if (numa_add_memblk(node, start, (end - start)) < 0) {
> +		bad_srat();
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}

I do not see again any major changes compared to x86, numa_add_memblk()
has a different interface (size vs end) but apart from that it would
be nice to avoid rewriting the same code time and again.

> +
> +void __init acpi_numa_arch_fixup(void) { }

Sigh. How many of these useless callbacks are we forced to define ?

> +
> +int __init arm64_acpi_numa_init(void)
> +{
> +	int ret;
> +
> +	ret = acpi_numa_init();
> +	if (ret < 0)
> +		return ret;
> +
> +	return srat_disabled() ? -EINVAL : 0;
> +}
> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
> index 209c7a9..c2950fc 100644
> --- a/arch/arm64/mm/numa.c
> +++ b/arch/arm64/mm/numa.c
> @@ -17,6 +17,7 @@
>   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
>   */
>  
> +#include <linux/acpi.h>
>  #include <linux/bootmem.h>
>  #include <linux/ctype.h>
>  #include <linux/init.h>
> @@ -383,9 +384,13 @@ void __init arm64_numa_init(void)
>  	int ret = -ENODEV;
>  
>  #ifdef CONFIG_OF_NUMA
> -	if (!numa_off)
> +	if (!numa_off && acpi_disabled)
>  		ret = numa_init(arm64_of_numa_init);
>  #endif
> +#ifdef CONFIG_ACPI_NUMA
> +	if (!numa_off && !acpi_disabled)
> +		ret = numa_init(arm64_acpi_numa_init);
> +#endif

See my comment above on DT/ACPI, this ifdeffery does not look great.

Thanks,
Lorenzo

>  
>  	if (ret)
>  		numa_init(dummy_numa_init);
> -- 
> 1.8.1.4
> 



More information about the linux-arm-kernel mailing list