[PATCH v4 07/17] watchdog/hardlockup: Move perf hardlockup checking/panic to common watchdog.c
Nicholas Piggin
npiggin at gmail.com
Thu May 4 19:58:22 PDT 2023
On Fri May 5, 2023 at 8:13 AM AEST, Douglas Anderson wrote:
> The perf hardlockup detector works by looking at interrupt counts and
> seeing if they change from run to run. The interrupt counts are
> managed by the common watchdog code via its watchdog_timer_fn().
>
> Currently the API between the perf detector and the common code is a
> function: is_hardlockup(). When the hard lockup detector sees that
> function return true then it handles printing out debug info and
> inducing a panic if necessary.
>
> Let's change the API a little bit in preparation for the buddy
> hardlockup detector. The buddy hardlockup detector wants to print
I think the name change is gratuitous, especially since it's now
static. watchdog_hardlockup_ is a pretty long prefix too;
hardlockup_ should be enough?
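e.g., purely as a naming sketch (reusing the bodies from your patch,
untested):

  /* was watchdog_hardlockup_is_lockedup() */
  static bool hardlockup_is_lockedup(void)
  {
          unsigned long hrint = __this_cpu_read(hrtimer_interrupts);

          if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
                  return true;

          __this_cpu_write(hrtimer_interrupts_saved, hrint);
          return false;
  }

  /* was watchdog_hardlockup_interrupt_count() */
  static void hardlockup_interrupt_count(void)
  {
          __this_cpu_inc(hrtimer_interrupts);
  }

  /* was watchdog_hardlockup_check(); non-static, declared in nmi.h,
   * called from watchdog_perf.c */
  void hardlockup_check(struct pt_regs *regs);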
Seems okay otherwise though.
Thanks,
Nick
> nearly the same debug info and have nearly the same panic
> behavior. That means we want to move all that code to the common
> file. For now, the code in the common file will only be there if the
> perf hardlockup detector is enabled, but eventually it will be
> selected by a common config.
>
> Right now, this _just_ moves the code from the perf detector file to
> the common file and changes the names. It doesn't make the changes
> that the buddy hardlockup detector will need and doesn't do any style
> cleanups. A future patch will do cleanup to make it more obvious what
> changed.
>
> With the above, we no longer have any callers of is_hardlockup()
> outside of the "watchdog.c" file, so we can remove it from the header,
> make it static, move it to the same "#ifdef" block as our new
> watchdog_hardlockup_check(), and rename it to make it obvious it's
> just for hardlockup detectors. While doing this, note that the existing
> code counted/checked "hrtimer_interrupts" even when no hardlockup
> detector was configured. We didn't need to do that, so move all the
> "hrtimer_interrupts" counting under the same "#ifdef" so it's only
> there if the perf hardlockup detector is configured as well.
>
> This change is expected to be a no-op.
>
> Signed-off-by: Douglas Anderson <dianders at chromium.org>
> ---
>
> Changes in v4:
> - ("Move perf hardlockup checking/panic ...") new for v4.
>
> include/linux/nmi.h | 5 ++-
> kernel/watchdog.c | 92 +++++++++++++++++++++++++++++++++---------
> kernel/watchdog_perf.c | 42 +------------------
> 3 files changed, 78 insertions(+), 61 deletions(-)
>
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 35d09d70f394..c6cb9bc5dc80 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -15,7 +15,6 @@
> void lockup_detector_init(void);
> void lockup_detector_soft_poweroff(void);
> void lockup_detector_cleanup(void);
> -bool is_hardlockup(void);
>
> extern int watchdog_user_enabled;
> extern int nmi_watchdog_user_enabled;
> @@ -88,6 +87,10 @@ extern unsigned int hardlockup_panic;
> static inline void hardlockup_detector_disable(void) {}
> #endif
>
> +#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
> +void watchdog_hardlockup_check(struct pt_regs *regs);
> +#endif
> +
> #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
> # define NMI_WATCHDOG_SYSCTL_PERM 0644
> #else
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index c705a18b26bf..2d319cdf64b9 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -85,6 +85,78 @@ __setup("nmi_watchdog=", hardlockup_panic_setup);
>
> #endif /* CONFIG_HARDLOCKUP_DETECTOR */
>
> +#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
> +
> +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
> +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
> +static DEFINE_PER_CPU(bool, hard_watchdog_warn);
> +static unsigned long hardlockup_allcpu_dumped;
> +
> +static bool watchdog_hardlockup_is_lockedup(void)
> +{
> + unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
> +
> + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
> + return true;
> +
> + __this_cpu_write(hrtimer_interrupts_saved, hrint);
> + return false;
> +}
> +
> +static void watchdog_hardlockup_interrupt_count(void)
> +{
> + __this_cpu_inc(hrtimer_interrupts);
> +}
> +
> +void watchdog_hardlockup_check(struct pt_regs *regs)
> +{
> + /* check for a hardlockup
> + * This is done by making sure our timer interrupt
> + * is incrementing. The timer interrupt should have
> + * fired multiple times before we overflow'd. If it hasn't
> + * then this is a good indication the cpu is stuck
> + */
> + if (watchdog_hardlockup_is_lockedup()) {
> + int this_cpu = smp_processor_id();
> +
> + /* only print hardlockups once */
> + if (__this_cpu_read(hard_watchdog_warn) == true)
> + return;
> +
> + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
> + this_cpu);
> + print_modules();
> + print_irqtrace_events(current);
> + if (regs)
> + show_regs(regs);
> + else
> + dump_stack();
> +
> + /*
> + * Perform all-CPU dump only once to avoid multiple hardlockups
> + * generating interleaving traces
> + */
> + if (sysctl_hardlockup_all_cpu_backtrace &&
> + !test_and_set_bit(0, &hardlockup_allcpu_dumped))
> + trigger_allbutself_cpu_backtrace();
> +
> + if (hardlockup_panic)
> + nmi_panic(regs, "Hard LOCKUP");
> +
> + __this_cpu_write(hard_watchdog_warn, true);
> + return;
> + }
> +
> + __this_cpu_write(hard_watchdog_warn, false);
> + return;
> +}
> +
> +#else /* CONFIG_HARDLOCKUP_DETECTOR_PERF */
> +
> +static inline void watchdog_hardlockup_interrupt_count(void) { }
> +
> +#endif /* !CONFIG_HARDLOCKUP_DETECTOR_PERF */
> +
> /*
> * These functions can be overridden if an architecture implements its
> * own hardlockup detector.
> @@ -176,8 +248,6 @@ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
> static DEFINE_PER_CPU(unsigned long, watchdog_report_ts);
> static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
> static DEFINE_PER_CPU(bool, softlockup_touch_sync);
> -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
> -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
> static unsigned long soft_lockup_nmi_warn;
>
> static int __init nowatchdog_setup(char *str)
> @@ -312,22 +382,6 @@ static int is_softlockup(unsigned long touch_ts,
> }
>
> /* watchdog detector functions */
> -bool is_hardlockup(void)
> -{
> - unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
> -
> - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
> - return true;
> -
> - __this_cpu_write(hrtimer_interrupts_saved, hrint);
> - return false;
> -}
> -
> -static void watchdog_interrupt_count(void)
> -{
> - __this_cpu_inc(hrtimer_interrupts);
> -}
> -
> static DEFINE_PER_CPU(struct completion, softlockup_completion);
> static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
>
> @@ -359,7 +413,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
> return HRTIMER_NORESTART;
>
> /* kick the hardlockup detector */
> - watchdog_interrupt_count();
> + watchdog_hardlockup_interrupt_count();
>
> /* kick the softlockup detector */
> if (completion_done(this_cpu_ptr(&softlockup_completion))) {
> diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
> index c3d8ceb149da..5f3651b87ee7 100644
> --- a/kernel/watchdog_perf.c
> +++ b/kernel/watchdog_perf.c
> @@ -20,13 +20,11 @@
> #include <asm/irq_regs.h>
> #include <linux/perf_event.h>
>
> -static DEFINE_PER_CPU(bool, hard_watchdog_warn);
> static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
> static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
> static DEFINE_PER_CPU(struct perf_event *, dead_event);
> static struct cpumask dead_events_mask;
>
> -static unsigned long hardlockup_allcpu_dumped;
> static atomic_t watchdog_cpus = ATOMIC_INIT(0);
>
> notrace void arch_touch_nmi_watchdog(void)
> @@ -122,45 +120,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
> if (!watchdog_check_timestamp())
> return;
>
> - /* check for a hardlockup
> - * This is done by making sure our timer interrupt
> - * is incrementing. The timer interrupt should have
> - * fired multiple times before we overflow'd. If it hasn't
> - * then this is a good indication the cpu is stuck
> - */
> - if (is_hardlockup()) {
> - int this_cpu = smp_processor_id();
> -
> - /* only print hardlockups once */
> - if (__this_cpu_read(hard_watchdog_warn) == true)
> - return;
> -
> - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
> - this_cpu);
> - print_modules();
> - print_irqtrace_events(current);
> - if (regs)
> - show_regs(regs);
> - else
> - dump_stack();
> -
> - /*
> - * Perform all-CPU dump only once to avoid multiple hardlockups
> - * generating interleaving traces
> - */
> - if (sysctl_hardlockup_all_cpu_backtrace &&
> - !test_and_set_bit(0, &hardlockup_allcpu_dumped))
> - trigger_allbutself_cpu_backtrace();
> -
> - if (hardlockup_panic)
> - nmi_panic(regs, "Hard LOCKUP");
> -
> - __this_cpu_write(hard_watchdog_warn, true);
> - return;
> - }
> -
> - __this_cpu_write(hard_watchdog_warn, false);
> - return;
> + watchdog_hardlockup_check(regs);
> }
>
> static int hardlockup_detector_event_create(void)
> --
> 2.40.1.521.gf1e218fcd8-goog