[PATCH ath-next v2] wifi: ath10k: shutdown driver when hardware is unreliable
Kang Yang
kang.yang at oss.qualcomm.com
Sun Jun 22 19:27:31 PDT 2025
In rare cases, ath10k may lose connection with the PCIe bus due to
some unknown reasons, which could further lead to system crashes during
resuming due to watchdog timeout:
ath10k_pci 0000:01:00.0: wmi command 20486 timeout, restarting hardware
ath10k_pci 0000:01:00.0: already restarting
ath10k_pci 0000:01:00.0: failed to stop WMI vdev 0: -11
ath10k_pci 0000:01:00.0: failed to stop vdev 0: -11
ieee80211 phy0: PM: **** DPM device timeout ****
Call Trace:
panic+0x125/0x315
dpm_watchdog_set+0x54/0x54
dpm_watchdog_handler+0x57/0x57
call_timer_fn+0x31/0x13c
At this point, all WMI commands will timeout and attempt to restart
device. So set a threshold for consecutive restart failures. If the
threshold is exceeded, consider the hardware is unreliable and all
ath10k operations should be skipped to avoid system crash.
fail_cont_count and pending_recovery are atomic variables, and
do not involve complex conditional logic. Therefore, even if recovery
check and reconfig complete are executed concurrently, the recovery
mechanism will not be broken.
Tested-on: QCA6174 hw3.2 PCI WLAN.RM.4.4.1-00288-QCARMSWPZ-1
Signed-off-by: Kang Yang <kang.yang at oss.qualcomm.com>
---
v2:
1. change u8 to u32 for recovery_count.
2. don't add a new line for debug log.
---
drivers/net/wireless/ath/ath10k/core.c | 49 +++++++++++++++++++++++---
drivers/net/wireless/ath/ath10k/core.h | 11 ++++--
drivers/net/wireless/ath/ath10k/mac.c | 7 +++-
drivers/net/wireless/ath/ath10k/wmi.c | 6 ++++
4 files changed, 64 insertions(+), 9 deletions(-)
diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index fe3a8f4a1cc1..7686cf4292e0 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -4,6 +4,7 @@
* Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
* Copyright (c) 2018-2019, The Linux Foundation. All rights reserved.
* Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
*/
#include <linux/module.h>
@@ -2491,12 +2492,50 @@ static int ath10k_init_hw_params(struct ath10k *ar)
return 0;
}
+static bool ath10k_core_needs_recovery(struct ath10k *ar)
+{
+ long time_left;
+
+ /* Sometimes the recovery will fail and then the next all recovery fail,
+ * so avoid infinite recovery.
+ */
+ if (atomic_read(&ar->fail_cont_count) >= ATH10K_RECOVERY_MAX_FAIL_COUNT) {
+ ath10k_err(ar, "consecutive fail %d times, will shutdown driver!",
+ atomic_read(&ar->fail_cont_count));
+ ar->state = ATH10K_STATE_WEDGED;
+ return false;
+ }
+
+ ath10k_dbg(ar, ATH10K_DBG_BOOT, "total recovery count: %d", ++ar->recovery_count);
+
+ if (atomic_read(&ar->pending_recovery)) {
+ /* Sometimes it happened another recovery work before the previous one
+ * completed, then the second recovery work will destroy the previous
+ * one, thus below is to avoid that.
+ */
+ time_left = wait_for_completion_timeout(&ar->driver_recovery,
+ ATH10K_RECOVERY_TIMEOUT_HZ);
+ if (time_left) {
+ ath10k_warn(ar, "previous recovery succeeded, skip this!\n");
+ return false;
+ }
+
+ /* Record the continuous recovery fail count when recovery failed. */
+ atomic_inc(&ar->fail_cont_count);
+
+ /* Avoid having multiple recoveries at the same time. */
+ return false;
+ }
+
+ atomic_inc(&ar->pending_recovery);
+
+ return true;
+}
+
void ath10k_core_start_recovery(struct ath10k *ar)
{
- if (test_and_set_bit(ATH10K_FLAG_RESTARTING, &ar->dev_flags)) {
- ath10k_warn(ar, "already restarting\n");
+ if (!ath10k_core_needs_recovery(ar))
return;
- }
queue_work(ar->workqueue, &ar->restart_work);
}
@@ -2532,6 +2571,8 @@ static void ath10k_core_restart(struct work_struct *work)
struct ath10k *ar = container_of(work, struct ath10k, restart_work);
int ret;
+ reinit_completion(&ar->driver_recovery);
+
set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags);
/* Place a barrier to make sure the compiler doesn't reorder
@@ -2596,8 +2637,6 @@ static void ath10k_core_restart(struct work_struct *work)
if (ret)
ath10k_warn(ar, "failed to send firmware crash dump via devcoredump: %d",
ret);
-
- complete(&ar->driver_recovery);
}
static void ath10k_core_set_coverage_class_work(struct work_struct *work)
diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
index 446dca74f06a..85e16c945b5c 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -4,6 +4,7 @@
* Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
* Copyright (c) 2018-2019, The Linux Foundation. All rights reserved.
* Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
*/
#ifndef _CORE_H_
@@ -87,6 +88,8 @@
IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER)
#define ATH10K_ITER_RESUME_FLAGS (IEEE80211_IFACE_ITER_RESUME_ALL |\
IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER)
+#define ATH10K_RECOVERY_TIMEOUT_HZ (5 * HZ)
+#define ATH10K_RECOVERY_MAX_FAIL_COUNT 4
struct ath10k;
@@ -865,9 +868,6 @@ enum ath10k_dev_flags {
/* Per Station statistics service */
ATH10K_FLAG_PEER_STATS,
- /* Indicates that ath10k device is during recovery process and not complete */
- ATH10K_FLAG_RESTARTING,
-
/* protected by conf_mutex */
ATH10K_FLAG_NAPI_ENABLED,
};
@@ -1211,6 +1211,11 @@ struct ath10k {
struct work_struct bundle_tx_work;
struct work_struct tx_complete_work;
+ atomic_t pending_recovery;
+ unsigned int recovery_count;
+ /* continuous recovery fail count */
+ atomic_t fail_cont_count;
+
/* cycle count is reported twice for each visited channel during scan.
* access protected by data_lock
*/
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 07fe05384cdf..9e6f294cd5b6 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -8156,7 +8156,12 @@ static void ath10k_reconfig_complete(struct ieee80211_hw *hw,
ath10k_info(ar, "device successfully recovered\n");
ar->state = ATH10K_STATE_ON;
ieee80211_wake_queues(ar->hw);
- clear_bit(ATH10K_FLAG_RESTARTING, &ar->dev_flags);
+
+ /* Clear recovery state. */
+ complete(&ar->driver_recovery);
+ atomic_set(&ar->fail_cont_count, 0);
+ atomic_set(&ar->pending_recovery, 0);
+
if (ar->hw_params.hw_restart_disconnect) {
list_for_each_entry(arvif, &ar->arvifs, list) {
if (arvif->is_up && arvif->vdev_type == WMI_VDEV_TYPE_STA)
diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index df6a24f8f8d5..cb8ae751eb31 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -4,6 +4,7 @@
* Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
* Copyright (c) 2018-2019, The Linux Foundation. All rights reserved.
* Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
*/
#include <linux/skbuff.h>
@@ -1941,6 +1942,11 @@ int ath10k_wmi_cmd_send(struct ath10k *ar, struct sk_buff *skb, u32 cmd_id)
}
wait_event_timeout(ar->wmi.tx_credits_wq, ({
+ if (ar->state == ATH10K_STATE_WEDGED) {
+ ret = -ESHUTDOWN;
+ ath10k_dbg(ar, ATH10K_DBG_WMI,
+ "drop wmi command %d, hardware is wedged\n", cmd_id);
+ }
/* try to send pending beacons first. they take priority */
ath10k_wmi_tx_beacons_nowait(ar);
--
2.34.1
More information about the ath10k
mailing list