[PATCH 0/3] warn and suppress irqflood

Pingfan Liu kernelfans at gmail.com
Fri Nov 6 00:53:59 EST 2020


On Wed, Oct 28, 2020 at 7:58 PM Thomas Gleixner <tglx at linutronix.de> wrote:
>
[...]
> ---
>  include/linux/irqdesc.h |    4 ++
>  kernel/irq/manage.c     |    3 +
>  kernel/irq/spurious.c   |   74 +++++++++++++++++++++++++++++++++++-------------
>  3 files changed, 61 insertions(+), 20 deletions(-)
>
> --- a/include/linux/irqdesc.h
> +++ b/include/linux/irqdesc.h
> @@ -30,6 +30,8 @@ struct pt_regs;
>   * @tot_count:         stats field for non-percpu irqs
>   * @irq_count:         stats field to detect stalled irqs
>   * @last_unhandled:    aging timer for unhandled count
> + * @storm_count:       Counter for irq storm detection
> + * @storm_checked:     Timestamp for irq storm detection
>   * @irqs_unhandled:    stats field for spurious unhandled interrupts
>   * @threads_handled:   stats field for deferred spurious detection of threaded handlers
>   * @threads_handled_last: comparator field for deferred spurious detection of theraded handlers
> @@ -65,6 +67,8 @@ struct irq_desc {
>         unsigned int            tot_count;
>         unsigned int            irq_count;      /* For detecting broken IRQs */
>         unsigned long           last_unhandled; /* Aging timer for unhandled count */
> +       unsigned long           storm_count;
> +       unsigned long           storm_checked;
>         unsigned int            irqs_unhandled;
>         atomic_t                threads_handled;
>         int                     threads_handled_last;
> --- a/kernel/irq/manage.c
> +++ b/kernel/irq/manage.c
> @@ -1581,6 +1581,9 @@ static int
>         if (!shared) {
>                 init_waitqueue_head(&desc->wait_for_threads);
>
> +               /* Take a timestamp for interrupt storm detection */
> +               desc->storm_checked = jiffies;
> +
>                 /* Setup the type (level, edge polarity) if configured: */
>                 if (new->flags & IRQF_TRIGGER_MASK) {
>                         ret = __irq_set_trigger(desc,
> --- a/kernel/irq/spurious.c
> +++ b/kernel/irq/spurious.c
> @@ -21,6 +21,7 @@ static void poll_spurious_irqs(struct ti
>  static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
>  static int irq_poll_cpu;
>  static atomic_t irq_poll_active;
> +static unsigned long irqstorm_limit __ro_after_init;
>
>  /*
>   * We wait here for a poller to finish.
> @@ -189,18 +190,21 @@ static inline int bad_action_ret(irqretu
>   * (The other 100-of-100,000 interrupts may have been a correctly
>   *  functioning device sharing an IRQ with the failing one)
>   */
> -static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
> +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret,
> +                            bool storm)
>  {
>         unsigned int irq = irq_desc_get_irq(desc);
>         struct irqaction *action;
>         unsigned long flags;
>
> -       if (bad_action_ret(action_ret)) {
> -               printk(KERN_ERR "irq event %d: bogus return value %x\n",
> -                               irq, action_ret);
> -       } else {
> -               printk(KERN_ERR "irq %d: nobody cared (try booting with "
> +       if (!storm) {
> +               if (bad_action_ret(action_ret)) {
> +                       pr_err("irq event %d: bogus return value %x\n",
> +                              irq, action_ret);
> +               } else {
> +                       pr_err("irq %d: nobody cared (try booting with "
>                                 "the \"irqpoll\" option)\n", irq);
> +               }
>         }
>         dump_stack();
>         printk(KERN_ERR "handlers:\n");
> @@ -228,7 +232,7 @@ static void report_bad_irq(struct irq_de
>
>         if (count > 0) {
>                 count--;
> -               __report_bad_irq(desc, action_ret);
> +               __report_bad_irq(desc, action_ret, false);
>         }
>  }
>
> @@ -267,6 +271,33 @@ try_misrouted_irq(unsigned int irq, stru
>         return action && (action->flags & IRQF_IRQPOLL);
>  }
>
> +static void disable_stuck_irq(struct irq_desc *desc, irqreturn_t action_ret,
> +                             const char *reason, bool storm)
> +{
> +       __report_bad_irq(desc, action_ret, storm);
> +       pr_emerg("Disabling %s IRQ #%d\n", reason, irq_desc_get_irq(desc));
> +       desc->istate |= IRQS_SPURIOUS_DISABLED;
> +       desc->depth++;
> +       irq_disable(desc);
> +}
> +
> +/* Interrupt storm detector for runaway interrupts (handled or not). */
> +static bool irqstorm_detected(struct irq_desc *desc)
> +{
> +       unsigned long now = jiffies;
> +
> +       if (++desc->storm_count < irqstorm_limit) {
> +               if (time_after(now, desc->storm_checked + HZ)) {
> +                       desc->storm_count = 0;
> +                       desc->storm_checked = now;
> +               }
> +               return false;
> +       }
> +
> +       disable_stuck_irq(desc, IRQ_NONE, "runaway", true);
> +       return true;
> +}
> +
>  #define SPURIOUS_DEFERRED      0x80000000
>
>  void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
> @@ -403,24 +434,16 @@ void note_interrupt(struct irq_desc *des
>                         desc->irqs_unhandled -= ok;
>         }
>
> +       if (unlikely(irqstorm_limit && irqstorm_detected(desc)))
> +               return;
> +
>         desc->irq_count++;
>         if (likely(desc->irq_count < 100000))
>                 return;
>
>         desc->irq_count = 0;
>         if (unlikely(desc->irqs_unhandled > 99900)) {
> -               /*
> -                * The interrupt is stuck
> -                */
> -               __report_bad_irq(desc, action_ret);
> -               /*
> -                * Now kill the IRQ
> -                */
> -               printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
> -               desc->istate |= IRQS_SPURIOUS_DISABLED;
> -               desc->depth++;
> -               irq_disable(desc);
> -
> +               disable_stuck_irq(desc, action_ret, "unhandled", false);
>                 mod_timer(&poll_spurious_irq_timer,
>                           jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
>         }
> @@ -462,5 +485,16 @@ static int __init irqpoll_setup(char *st
>                                 "performance\n");
>         return 1;
>  }
> -
>  __setup("irqpoll", irqpoll_setup);
> +
> +static int __init irqstorm_setup(char *arg)
> +{
> +       int res = kstrtoul(arg, 0, &irqstorm_limit);
> +
> +       if (!res) {
> +               pr_info("Interrupt storm detector enabled. Limit=%lu / s\n",
> +                       irqstorm_limit);
> +       }
> +       return !!res;
> +}
> +__setup("irqstorm_limit", irqstorm_setup);
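This should be the following, with a trailing '=' so that the parameter
is matched as "irqstorm_limit=<value>" and irqstorm_setup() receives only
the value:
__setup("irqstorm_limit=", irqstorm_setup);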

I have also tested this patch on a P9 machine, with the limit set to
70000. It works for the kdump kernel.

Thanks,
Pingfan



More information about the kexec mailing list