[PATCH] Revert "arm64: Enable perf events based hard lockup detector"

Sumit Garg sumit.garg at linaro.org
Wed Jan 13 01:30:01 EST 2021


Hi Will,

On Wed, 13 Jan 2021 at 03:49, Will Deacon <will at kernel.org> wrote:
>
> This reverts commit 367c820ef08082e68df8a3bc12e62393af21e4b5.
>
> lockup_detector_init() makes heavy use of per-cpu variables and must be
> called with preemption disabled. Usually, it's handled early during boot
> in kernel_init_freeable(), before SMP has been initialised.
>
> Since we do not know whether or not our PMU interrupt can be signalled
> as an NMI until considerably later in the boot process, the Arm PMU
> driver attempts to re-initialise the lockup detector off the back of a
> device_initcall(). Unfortunately, this is called from preemptible
> context and results in the following splat:
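
For context, the constraint behind that splat is roughly the following
(illustrative sketch only, not the actual watchdog code): anything that
ends up in smp_processor_id() or the per-cpu accessors has to run with
preemption disabled, while a device_initcall() runs in preemptible
context.

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/smp.h>

/* Illustrative sketch only -- not the real hardlockup detector code. */
static DEFINE_PER_CPU(int, demo_counter);

static void demo_percpu_touch(void)
{
        /*
         * smp_processor_id() is only meaningful while the task cannot
         * migrate; calling it with preemption enabled is what makes
         * debug_smp_processor_id() print the "using smp_processor_id()
         * in preemptible" splat quoted below.
         */
        preempt_disable();
        per_cpu(demo_counter, smp_processor_id())++;
        preempt_enable();
}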

Rather than reverting the hard lockup detector feature as a whole, can
we consider the following fix (compile-tested only), which binds the
call to lockup_detector_init() to a particular CPU?

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 38bb07e..dba5676 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1248,6 +1248,12 @@ static struct platform_driver armv8_pmu_driver = {
        .probe          = armv8_pmu_device_probe,
 };

+static int lockup_detector_init_fn(void *data)
+{
+       lockup_detector_init();
+       return 0;
+}
+
 static int __init armv8_pmu_driver_init(void)
 {
        int ret;
@@ -1261,8 +1267,11 @@ static int __init armv8_pmu_driver_init(void)
         * Try to re-initialize lockup detector after PMU init in
         * case PMU events are triggered via NMIs.
         */
-       if (ret == 0 && arm_pmu_irq_is_nmi())
-               lockup_detector_init();
+       if (ret == 0 && arm_pmu_irq_is_nmi()) {
+               smp_call_on_cpu(get_cpu(), lockup_detector_init_fn, NULL,
+                               false);
+               put_cpu();
+       }

        return ret;
 }
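
For reference, my understanding of smp_call_on_cpu() (from
include/linux/smp.h) is that it queues the callback as a work item bound
to the given CPU and waits for it to return, so lockup_detector_init()
would then run with a fixed CPU binding rather than from an arbitrary
preemptible context:

        /* Prototype as found in include/linux/smp.h: */
        int smp_call_on_cpu(unsigned int cpu, int (*func)(void *),
                            void *par, bool phys);

As far as I can tell, phys only matters for callers that additionally
need the vCPU pinned to a physical CPU, hence false here.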

-Sumit

>
>   | BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1
>   | caller is debug_smp_processor_id+0x20/0x2c
>   | CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.10.0+ #276
>   | Hardware name: linux,dummy-virt (DT)
>   | Call trace:
>   |   dump_backtrace+0x0/0x3c0
>   |   show_stack+0x20/0x6c
>   |   dump_stack+0x2f0/0x42c
>   |   check_preemption_disabled+0x1cc/0x1dc
>   |   debug_smp_processor_id+0x20/0x2c
>   |   hardlockup_detector_event_create+0x34/0x18c
>   |   hardlockup_detector_perf_init+0x2c/0x134
>   |   watchdog_nmi_probe+0x18/0x24
>   |   lockup_detector_init+0x44/0xa8
>   |   armv8_pmu_driver_init+0x54/0x78
>   |   do_one_initcall+0x184/0x43c
>   |   kernel_init_freeable+0x368/0x380
>   |   kernel_init+0x1c/0x1cc
>   |   ret_from_fork+0x10/0x30
>
> Rather than bodge this with raw_smp_processor_id() or randomly disabling
> preemption, simply revert the culprit for now until we figure out how to
> do this properly.
>
> Cc: Sumit Garg <sumit.garg at linaro.org>
> Cc: Mark Rutland <mark.rutland at arm.com>
> Cc: Catalin Marinas <catalin.marinas at arm.com>
> Cc: Alexandru Elisei <alexandru.elisei at arm.com>
> Reported-by: Lecopzer Chen <lecopzer.chen at mediatek.com>
> Link: https://lore.kernel.org/r/20201221162249.3119-1-lecopzer.chen@mediatek.com
> Signed-off-by: Will Deacon <will at kernel.org>
> ---
>  arch/arm64/Kconfig             |  2 --
>  arch/arm64/kernel/perf_event.c | 41 ++--------------------------------
>  drivers/perf/arm_pmu.c         |  5 -----
>  include/linux/perf/arm_pmu.h   |  2 --
>  4 files changed, 2 insertions(+), 48 deletions(-)
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 05e17351e4f3..f39568b28ec1 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -174,8 +174,6 @@ config ARM64
>         select HAVE_NMI
>         select HAVE_PATA_PLATFORM
>         select HAVE_PERF_EVENTS
> -       select HAVE_PERF_EVENTS_NMI if ARM64_PSEUDO_NMI && HW_PERF_EVENTS
> -       select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
>         select HAVE_PERF_REGS
>         select HAVE_PERF_USER_STACK_DUMP
>         select HAVE_REGS_AND_STACK_ACCESS_API
> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
> index 38bb07eff872..3605f77ad4df 100644
> --- a/arch/arm64/kernel/perf_event.c
> +++ b/arch/arm64/kernel/perf_event.c
> @@ -23,8 +23,6 @@
>  #include <linux/platform_device.h>
>  #include <linux/sched_clock.h>
>  #include <linux/smp.h>
> -#include <linux/nmi.h>
> -#include <linux/cpufreq.h>
>
>  /* ARMv8 Cortex-A53 specific event types. */
>  #define ARMV8_A53_PERFCTR_PREF_LINEFILL                                0xC2
> @@ -1250,21 +1248,10 @@ static struct platform_driver armv8_pmu_driver = {
>
>  static int __init armv8_pmu_driver_init(void)
>  {
> -       int ret;
> -
>         if (acpi_disabled)
> -               ret = platform_driver_register(&armv8_pmu_driver);
> +               return platform_driver_register(&armv8_pmu_driver);
>         else
> -               ret = arm_pmu_acpi_probe(armv8_pmuv3_init);
> -
> -       /*
> -        * Try to re-initialize lockup detector after PMU init in
> -        * case PMU events are triggered via NMIs.
> -        */
> -       if (ret == 0 && arm_pmu_irq_is_nmi())
> -               lockup_detector_init();
> -
> -       return ret;
> +               return arm_pmu_acpi_probe(armv8_pmuv3_init);
>  }
>  device_initcall(armv8_pmu_driver_init)
>
> @@ -1322,27 +1309,3 @@ void arch_perf_update_userpage(struct perf_event *event,
>         userpg->cap_user_time_zero = 1;
>         userpg->cap_user_time_short = 1;
>  }
> -
> -#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
> -/*
> - * Safe maximum CPU frequency in case a particular platform doesn't implement
> - * cpufreq driver. Although, architecture doesn't put any restrictions on
> - * maximum frequency but 5 GHz seems to be safe maximum given the available
> - * Arm CPUs in the market which are clocked much less than 5 GHz. On the other
> - * hand, we can't make it much higher as it would lead to a large hard-lockup
> - * detection timeout on parts which are running slower (eg. 1GHz on
> - * Developerbox) and doesn't possess a cpufreq driver.
> - */
> -#define SAFE_MAX_CPU_FREQ      5000000000UL // 5 GHz
> -u64 hw_nmi_get_sample_period(int watchdog_thresh)
> -{
> -       unsigned int cpu = smp_processor_id();
> -       unsigned long max_cpu_freq;
> -
> -       max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL;
> -       if (!max_cpu_freq)
> -               max_cpu_freq = SAFE_MAX_CPU_FREQ;
> -
> -       return (u64)max_cpu_freq * watchdog_thresh;
> -}
> -#endif
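
As an aside on the numbers in the block removed above:
hw_nmi_get_sample_period() returns the watchdog sample period in CPU
cycles, i.e. max_cpu_freq (in Hz) multiplied by watchdog_thresh (in
seconds). Assuming the default watchdog_thresh of 10, the 5 GHz fallback
gives

        5,000,000,000 Hz * 10 s = 50,000,000,000 cycles

so on a part actually running at 1 GHz without a cpufreq driver (e.g.
Developerbox) those cycles take roughly 50 seconds to elapse, which is
why the comment warns against raising the fallback any further.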
> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
> index 794a37d50853..cb2f55f450e4 100644
> --- a/drivers/perf/arm_pmu.c
> +++ b/drivers/perf/arm_pmu.c
> @@ -726,11 +726,6 @@ static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu)
>         return per_cpu(hw_events->irq, cpu);
>  }
>
> -bool arm_pmu_irq_is_nmi(void)
> -{
> -       return has_nmi;
> -}
> -
>  /*
>   * PMU hardware loses all context when a CPU goes offline.
>   * When a CPU is hotplugged back in, since some hardware registers are
> diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
> index bf7966776c55..505480217cf1 100644
> --- a/include/linux/perf/arm_pmu.h
> +++ b/include/linux/perf/arm_pmu.h
> @@ -163,8 +163,6 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn);
>  static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; }
>  #endif
>
> -bool arm_pmu_irq_is_nmi(void);
> -
>  /* Internal functions only for core arm_pmu code */
>  struct arm_pmu *armpmu_alloc(void);
>  struct arm_pmu *armpmu_alloc_atomic(void);
> --
> 2.30.0.284.gd98b1dd5eaa7-goog
>


