[PATCH ath-next] wifi: ath10k: shutdown driver when hardware is unreliable

Kang Yang kang.yang at oss.qualcomm.com
Tue Jun 17 19:40:23 PDT 2025



On 6/13/2025 8:59 PM, Loic Poulain wrote:
> Hi Kang,
> 
> 
> On Tue, Jun 10, 2025 at 11:55 AM Kang Yang <kang.yang at oss.qualcomm.com> wrote:
>>
>> In rare cases, ath10k may lose connection with the PCIe bus due to
>> some unknown reasons, which could further lead to system crashes during
>> resuming due to watchdog timeout:
>>
>> ath10k_pci 0000:01:00.0: wmi command 20486 timeout, restarting hardware
>> ath10k_pci 0000:01:00.0: already restarting
>> ath10k_pci 0000:01:00.0: failed to stop WMI vdev 0: -11
>> ath10k_pci 0000:01:00.0: failed to stop vdev 0: -11
>> ieee80211 phy0: PM: **** DPM device timeout ****
>> Call Trace:
>>   panic+0x125/0x315
>>   dpm_watchdog_set+0x54/0x54
>>   dpm_watchdog_handler+0x57/0x57
>>   call_timer_fn+0x31/0x13c
>>
>> At this point, all WMI commands will timeout and attempt to restart
>> device. So set a threshold for consecutive restart failures. If the
>> threshold is exceeded, consider the hardware is unreliable and all
>> ath10k operations should be skipped to avoid system crash.
>>
>> fail_cont_count and pending_recovery are atomic variables, and
>> do not involve complex conditional logic. Therefore, even if recovery
>> check and reconfig complete are executed concurrently, the recovery
>> mechanism will not be broken.
>>
>> Tested-on: QCA6174 hw3.2 PCI WLAN.RM.4.4.1-00288-QCARMSWPZ-1
>>
>> Signed-off-by: Kang Yang <kang.yang at oss.qualcomm.com>
>> ---
>>   drivers/net/wireless/ath/ath10k/core.c | 50 +++++++++++++++++++++++---
>>   drivers/net/wireless/ath/ath10k/core.h | 11 ++++--
>>   drivers/net/wireless/ath/ath10k/mac.c  |  7 +++-
>>   drivers/net/wireless/ath/ath10k/wmi.c  |  6 ++++
>>   4 files changed, 65 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
>> index fe3a8f4a1cc1..f925a3cf9ebd 100644
>> --- a/drivers/net/wireless/ath/ath10k/core.c
>> +++ b/drivers/net/wireless/ath/ath10k/core.c
>> @@ -4,6 +4,7 @@
>>    * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
>>    * Copyright (c) 2018-2019, The Linux Foundation. All rights reserved.
>>    * Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved.
>> + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
>>    */
>>
>>   #include <linux/module.h>
>> @@ -2491,12 +2492,51 @@ static int ath10k_init_hw_params(struct ath10k *ar)
>>          return 0;
>>   }
>>
>> +static bool ath10k_core_needs_recovery(struct ath10k *ar)
>> +{
>> +       long time_left;
>> +
>> +       /* Sometimes the recovery will fail and then the next all recovery fail,
>> +        * so avoid infinite recovery.
>> +        */
>> +       if (atomic_read(&ar->fail_cont_count) >= ATH10K_RECOVERY_MAX_FAIL_COUNT) {
>> +               ath10k_err(ar, "consecutive fail %d times, will shutdown driver!",
>> +                          atomic_read(&ar->fail_cont_count));
>> +               ar->state = ATH10K_STATE_WEDGED;
>> +               return false;
>> +       }
>> +
>> +       ath10k_dbg(ar, ATH10K_DBG_BOOT, "total recovery count: %d",
> 
> You don't need a newline.
> 
>> +                  ++ar->recovery_count);
>> +
>> +       if (atomic_read(&ar->pending_recovery)) {
>> +               /* Sometimes it happened another recovery work before the previous one
>> +                * completed, then the second recovery work will destroy the previous
>> +                * one, thus below is to avoid that.
>> +                */
>> +               time_left = wait_for_completion_timeout(&ar->driver_recovery,
>> +                                                       ATH10K_RECOVERY_TIMEOUT_HZ);
>> +               if (time_left) {
>> +                       ath10k_warn(ar, "previous recovery succeeded, skip this!\n");
>> +                       return false;
>> +               }
>> +
>> +               /* Record the continuous recovery fail count when recovery failed. */
>> +               atomic_inc(&ar->fail_cont_count);
>> +
>> +               /* Avoid having multiple recoveries at the same time. */
>> +               return false;
>> +       }
>> +
>> +       atomic_inc(&ar->pending_recovery);
>> +
>> +       return true;
>> +}
>> +
>>   void ath10k_core_start_recovery(struct ath10k *ar)
>>   {
>> -       if (test_and_set_bit(ATH10K_FLAG_RESTARTING, &ar->dev_flags)) {
>> -               ath10k_warn(ar, "already restarting\n");
>> +       if (!ath10k_core_needs_recovery(ar))
>>                  return;
>> -       }
>>
>>          queue_work(ar->workqueue, &ar->restart_work);
>>   }
>> @@ -2532,6 +2572,8 @@ static void ath10k_core_restart(struct work_struct *work)
>>          struct ath10k *ar = container_of(work, struct ath10k, restart_work);
>>          int ret;
>>
>> +       reinit_completion(&ar->driver_recovery);
>> +
>>          set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags);
>>
>>          /* Place a barrier to make sure the compiler doesn't reorder
>> @@ -2596,8 +2638,6 @@ static void ath10k_core_restart(struct work_struct *work)
>>          if (ret)
>>                  ath10k_warn(ar, "failed to send firmware crash dump via devcoredump: %d",
>>                              ret);
>> -
>> -       complete(&ar->driver_recovery);
>>   }
>>
>>   static void ath10k_core_set_coverage_class_work(struct work_struct *work)
>> diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
>> index 446dca74f06a..06ac95707531 100644
>> --- a/drivers/net/wireless/ath/ath10k/core.h
>> +++ b/drivers/net/wireless/ath/ath10k/core.h
>> @@ -4,6 +4,7 @@
>>    * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
>>    * Copyright (c) 2018-2019, The Linux Foundation. All rights reserved.
>>    * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved.
>> + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
>>    */
>>
>>   #ifndef _CORE_H_
>> @@ -87,6 +88,8 @@
>>                                    IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER)
>>   #define ATH10K_ITER_RESUME_FLAGS (IEEE80211_IFACE_ITER_RESUME_ALL |\
>>                                    IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER)
>> +#define ATH10K_RECOVERY_TIMEOUT_HZ                     (5 * HZ)
>> +#define ATH10K_RECOVERY_MAX_FAIL_COUNT                 4
>>
>>   struct ath10k;
>>
>> @@ -865,9 +868,6 @@ enum ath10k_dev_flags {
>>          /* Per Station statistics service */
>>          ATH10K_FLAG_PEER_STATS,
>>
>> -       /* Indicates that ath10k device is during recovery process and not complete */
>> -       ATH10K_FLAG_RESTARTING,
>> -
>>          /* protected by conf_mutex */
>>          ATH10K_FLAG_NAPI_ENABLED,
>>   };
>> @@ -1211,6 +1211,11 @@ struct ath10k {
>>          struct work_struct bundle_tx_work;
>>          struct work_struct tx_complete_work;
>>
>> +       atomic_t pending_recovery;
>> +       u8 recovery_count;
> 
> No reason to be a u8 IMO, use unsigned int.

OK, will change this.

> 
>> +       /* continuous recovery fail count */
>> +       atomic_t fail_cont_count;
>> +




More information about the ath10k mailing list