[PATCH] percpu: fix chunk range calculation

Dave Young dyoung at redhat.com
Sun Nov 20 20:45:51 EST 2011


On 11/19/2011 02:55 AM, Tejun Heo wrote:

> Percpu allocator recorded the cpus which map to the first and last
> units in pcpu_first/last_unit_cpu respectively and used them to
> determine the address range of a chunk - e.g. it assumed that the
> first unit has the lowest address in a chunk while the last unit has
> the highest address.
> 
> This simply isn't true.  Groups in a chunk can have arbitrary positive
> or negative offsets from the previous one and there is no guarantee
> that the first unit occupies the lowest offset while the last one the
> highest.
> 
> Fix it by actually comparing unit offsets to determine cpus occupying
> the lowest and highest offsets.  Also, rename pcu_first/last_unit_cpu
> to pcpu_low/high_unit_cpu to avoid confusion.
> 
> The chunk address range is used to flush cache on vmalloc area
> map/unmap and decide whether a given address is in the first chunk by
> per_cpu_ptr_to_phys() and the bug was discovered by invalid
> per_cpu_ptr_to_phys() translation for crash_note.
> 
> Kudos to Dave Young for tracking down the problem.


Tejun, thanks

Now that if addr is not in first trunk it must be in vmalloc area, below
logic should be right:
	if (in_first_chunk) {
		if (!is_vmalloc_addr(addr))
			return __pa(addr);
		else
			return page_to_phys(vmalloc_to_page(addr));
	} else
		if (!is_vmalloc_addr(addr)) /* not possible */
			return __pa(addr);
		else
			return page_to_phys(pcpu_addr_to_page(addr));

So how about just simply remove in first chunk checking to simplify the
code as followging:

phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
	if (!is_vmalloc_addr(addr))
		return __pa(addr);
	else
		return page_to_phys(pcpu_addr_to_page(addr));
}

> 
> Signed-off-by: Tejun Heo <tj at kernel.org>
> Reported-by: WANG Cong <xiyou.wangcong at gmail.com>
> Reported-by: Dave Young <dyoung at redhat.com>
> LKML-Reference: <4EC21F67.10905 at redhat.com>
> Cc: stable @kernel.org
> ---
> Heh, that's a stupid bug.  Can you please verify this fixes the
> problem?


I can confirm that the per cpu translation works with this patch, but I
have not managed to finished kdump test because kexec tools refuse
 to load the crash kernel. I need more debugging on kexec tools.

Tim, could you help to test if this patch works for your kdump case?

> 
> Thank you very much.
> 
>  mm/percpu-vm.c |   12 ++++++------
>  mm/percpu.c    |   34 ++++++++++++++++++++--------------
>  2 files changed, 26 insertions(+), 20 deletions(-)
> 
> diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
> index ea53496..bfad724 100644
> --- a/mm/percpu-vm.c
> +++ b/mm/percpu-vm.c
> @@ -143,8 +143,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
>  				 int page_start, int page_end)
>  {
>  	flush_cache_vunmap(
> -		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
> -		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
> +		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
> +		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
>  }
>  
>  static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
> @@ -206,8 +206,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
>  				      int page_start, int page_end)
>  {
>  	flush_tlb_kernel_range(
> -		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
> -		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
> +		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
> +		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
>  }
>  
>  static int __pcpu_map_pages(unsigned long addr, struct page **pages,
> @@ -284,8 +284,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
>  				int page_start, int page_end)
>  {
>  	flush_cache_vmap(
> -		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
> -		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
> +		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
> +		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
>  }
>  
>  /**
> diff --git a/mm/percpu.c b/mm/percpu.c
> index bf80e55..93b5a7c 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly;
>  static int pcpu_nr_slots __read_mostly;
>  static size_t pcpu_chunk_struct_size __read_mostly;
>  
> -/* cpus with the lowest and highest unit numbers */
> -static unsigned int pcpu_first_unit_cpu __read_mostly;
> -static unsigned int pcpu_last_unit_cpu __read_mostly;
> +/* cpus with the lowest and highest unit addresses */
> +static unsigned int pcpu_low_unit_cpu __read_mostly;
> +static unsigned int pcpu_high_unit_cpu __read_mostly;
>  
>  /* the address of the first chunk which starts with the kernel static area */
>  void *pcpu_base_addr __read_mostly;
> @@ -984,19 +984,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
>  {
>  	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
>  	bool in_first_chunk = false;
> -	unsigned long first_start, first_end;
> +	unsigned long first_low, first_high;
>  	unsigned int cpu;
>  
>  	/*
> -	 * The following test on first_start/end isn't strictly
> +	 * The following test on unit_low/high isn't strictly
>  	 * necessary but will speed up lookups of addresses which
>  	 * aren't in the first chunk.
>  	 */
> -	first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0);
> -	first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu,
> -				    pcpu_unit_pages);
> -	if ((unsigned long)addr >= first_start &&
> -	    (unsigned long)addr < first_end) {
> +	first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
> +	first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
> +				     pcpu_unit_pages);
> +	if ((unsigned long)addr >= first_low &&
> +	    (unsigned long)addr < first_high) {
>  		for_each_possible_cpu(cpu) {
>  			void *start = per_cpu_ptr(base, cpu);
>  
> @@ -1233,7 +1233,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
>  
>  	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
>  		unit_map[cpu] = UINT_MAX;
> -	pcpu_first_unit_cpu = NR_CPUS;
> +
> +	pcpu_low_unit_cpu = NR_CPUS;
> +	pcpu_high_unit_cpu = NR_CPUS;
>  
>  	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
>  		const struct pcpu_group_info *gi = &ai->groups[group];
> @@ -1253,9 +1255,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
>  			unit_map[cpu] = unit + i;
>  			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
>  
> -			if (pcpu_first_unit_cpu == NR_CPUS)
> -				pcpu_first_unit_cpu = cpu;
> -			pcpu_last_unit_cpu = cpu;
> +			/* determine low/high unit_cpu */
> +			if (pcpu_low_unit_cpu == NR_CPUS ||
> +			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
> +				pcpu_low_unit_cpu = cpu;
> +			if (pcpu_high_unit_cpu == NR_CPUS ||
> +			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
> +				pcpu_high_unit_cpu = cpu;
>  		}
>  	}
>  	pcpu_nr_units = unit;
> 
> _______________________________________________
> kexec mailing list
> kexec at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec



-- 
Thanks
Dave



More information about the kexec mailing list