[PATCH 0/3] warn and suppress irqflood

Sai Prakash Ranjan saiprakash.ranjan at codeaurora.org
Fri Jun 4 19:32:13 PDT 2021


Hi Thomas,

On 2021-03-02 13:15, Sai Prakash Ranjan wrote:
> Hi Thomas,
> 
>> On Wed, Oct 28, 2020 at 12:58:41PM +0100, Thomas Gleixner wrote:
>> 
> 
> <snip>...
> 
>> Something like the completly untested below should work independent of
>> config options.
>> 
>> Thanks,
>> 
>>         tglx
>> ---
>>  include/linux/irqdesc.h |    4 ++
>>  kernel/irq/manage.c     |    3 +
>>  kernel/irq/spurious.c   |   74 
>> +++++++++++++++++++++++++++++++++++-------------
>>  3 files changed, 61 insertions(+), 20 deletions(-)
>> 
>> --- a/include/linux/irqdesc.h
>> +++ b/include/linux/irqdesc.h
>> @@ -30,6 +30,8 @@ struct pt_regs;
>>   * @tot_count:		stats field for non-percpu irqs
>>   * @irq_count:		stats field to detect stalled irqs
>>   * @last_unhandled:	aging timer for unhandled count
>> + * @storm_count:	Counter for irq storm detection
>> + * @storm_checked:	Timestamp for irq storm detection
>>   * @irqs_unhandled:	stats field for spurious unhandled interrupts
>>   * @threads_handled:	stats field for deferred spurious detection of 
>> threaded handlers
>>   * @threads_handled_last: comparator field for deferred spurious 
>> detection of theraded handlers
>> @@ -65,6 +67,8 @@ struct irq_desc {
>>  	unsigned int		tot_count;
>>  	unsigned int		irq_count;	/* For detecting broken IRQs */
>>  	unsigned long		last_unhandled;	/* Aging timer for unhandled count */
>> +	unsigned long		storm_count;
>> +	unsigned long		storm_checked;
>>  	unsigned int		irqs_unhandled;
>>  	atomic_t		threads_handled;
>>  	int			threads_handled_last;
>> --- a/kernel/irq/manage.c
>> +++ b/kernel/irq/manage.c
>> @@ -1581,6 +1581,9 @@ static int
>>  	if (!shared) {
>>  		init_waitqueue_head(&desc->wait_for_threads);
>> 
>> +		/* Take a timestamp for interrupt storm detection */
>> +		desc->storm_checked = jiffies;
>> +
>>  		/* Setup the type (level, edge polarity) if configured: */
>>  		if (new->flags & IRQF_TRIGGER_MASK) {
>>  			ret = __irq_set_trigger(desc,
>> --- a/kernel/irq/spurious.c
>> +++ b/kernel/irq/spurious.c
>> @@ -21,6 +21,7 @@ static void poll_spurious_irqs(struct ti
>>  static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
>>  static int irq_poll_cpu;
>>  static atomic_t irq_poll_active;
>> +static unsigned long irqstorm_limit __ro_after_init;
>> 
>>  /*
>>   * We wait here for a poller to finish.
>> @@ -189,18 +190,21 @@ static inline int bad_action_ret(irqretu
>>   * (The other 100-of-100,000 interrupts may have been a correctly
>>   *  functioning device sharing an IRQ with the failing one)
>>   */
>> -static void __report_bad_irq(struct irq_desc *desc, irqreturn_t 
>> action_ret)
>> +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t 
>> action_ret,
>> +			     bool storm)
>>  {
>>  	unsigned int irq = irq_desc_get_irq(desc);
>>  	struct irqaction *action;
>>  	unsigned long flags;
>> 
>> -	if (bad_action_ret(action_ret)) {
>> -		printk(KERN_ERR "irq event %d: bogus return value %x\n",
>> -				irq, action_ret);
>> -	} else {
>> -		printk(KERN_ERR "irq %d: nobody cared (try booting with "
>> +	if (!storm) {
>> +		if (bad_action_ret(action_ret)) {
>> +			pr_err("irq event %d: bogus return value %x\n",
>> +			       irq, action_ret);
>> +		} else {
>> +			pr_err("irq %d: nobody cared (try booting with "
>>  				"the \"irqpoll\" option)\n", irq);
>> +		}
>>  	}
>>  	dump_stack();
>>  	printk(KERN_ERR "handlers:\n");
>> @@ -228,7 +232,7 @@ static void report_bad_irq(struct irq_de
>> 
>>  	if (count > 0) {
>>  		count--;
>> -		__report_bad_irq(desc, action_ret);
>> +		__report_bad_irq(desc, action_ret, false);
>>  	}
>>  }
>> 
>> @@ -267,6 +271,33 @@ try_misrouted_irq(unsigned int irq, stru
>>  	return action && (action->flags & IRQF_IRQPOLL);
>>  }
>> 
>> +static void disable_stuck_irq(struct irq_desc *desc, irqreturn_t 
>> action_ret,
>> +			      const char *reason, bool storm)
>> +{
>> +	__report_bad_irq(desc, action_ret, storm);
>> +	pr_emerg("Disabling %s IRQ #%d\n", reason, irq_desc_get_irq(desc));
>> +	desc->istate |= IRQS_SPURIOUS_DISABLED;
>> +	desc->depth++;
>> +	irq_disable(desc);
>> +}
>> +
>> +/* Interrupt storm detector for runaway interrupts (handled or not). 
>> */
>> +static bool irqstorm_detected(struct irq_desc *desc)
>> +{
>> +	unsigned long now = jiffies;
>> +
>> +	if (++desc->storm_count < irqstorm_limit) {
>> +		if (time_after(now, desc->storm_checked + HZ)) {
>> +			desc->storm_count = 0;
>> +			desc->storm_checked = now;
>> +		}
>> +		return false;
>> +	}
>> +
>> +	disable_stuck_irq(desc, IRQ_NONE, "runaway", true);
>> +	return true;
>> +}
>> +
>>  #define SPURIOUS_DEFERRED	0x80000000
>> 
>>  void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
>> @@ -403,24 +434,16 @@ void note_interrupt(struct irq_desc *des
>>  			desc->irqs_unhandled -= ok;
>>  	}
>> 
>> +	if (unlikely(irqstorm_limit && irqstorm_detected(desc)))
>> +		return;
>> +
>>  	desc->irq_count++;
>>  	if (likely(desc->irq_count < 100000))
>>  		return;
>> 
>>  	desc->irq_count = 0;
>>  	if (unlikely(desc->irqs_unhandled > 99900)) {
>> -		/*
>> -		 * The interrupt is stuck
>> -		 */
>> -		__report_bad_irq(desc, action_ret);
>> -		/*
>> -		 * Now kill the IRQ
>> -		 */
>> -		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
>> -		desc->istate |= IRQS_SPURIOUS_DISABLED;
>> -		desc->depth++;
>> -		irq_disable(desc);
>> -
>> +		disable_stuck_irq(desc, action_ret, "unhandled", false);
>>  		mod_timer(&poll_spurious_irq_timer,
>>  			  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
>>  	}
>> @@ -462,5 +485,16 @@ static int __init irqpoll_setup(char *st
>>  				"performance\n");
>>  	return 1;
>>  }
>> -
>>  __setup("irqpoll", irqpoll_setup);
>> +
>> +static int __init irqstorm_setup(char *arg)
>> +{
>> +	int res = kstrtoul(arg, 0, &irqstorm_limit);
>> +
>> +	if (!res) {
>> +		pr_info("Interrupt storm detector enabled. Limit=%lu / s\n",
>> +			irqstorm_limit);
>> +	}
>> +	return !!res;
>> +}
>> +__setup("irqstorm_limit", irqstorm_setup);
> 
> This irq storm detection feature is very useful, any chance to get this 
> merged?
> We will be happy to test. People seem to be having their own copy of
> such feature
> out-of-tree [1].
> 
> [1]
> https://elinux.org/images/d/de/Oct28_InterruptStormDetectionFeature_KentoKobayashi.pdf
> 

Any chance of having this useful debug feature in upstream kernel?

Thanks,
Sai

-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a 
member
of Code Aurora Forum, hosted by The Linux Foundation



More information about the kexec mailing list