[PATCH v4] PCI: rockchip: fix system hang up if activating CONFIG_DEBUG_SHIRQ

Bjorn Helgaas helgaas at kernel.org
Tue Aug 22 14:29:19 PDT 2017


Hi Shawn,

A few patch structuring comments below.

On Fri, Aug 11, 2017 at 03:19:11PM +0800, Shawn Lin wrote:
> With CONFIG_DEBUG_SHIRQ enabled, the irq tear down routine
> would still access the irq handler registed as a shard irq.
> Per the comment within the function of __free_irq, it says
> "It's a shared IRQ -- the driver ought to be prepared for
> an IRQ event to happen even now it's being freed". However
> when failing to probe the driver, it may disable the clock
> for accessing the register and the following check for shared
> irq state would call the irq handler which accesses the register
> w/o the clk enabled. That will hang the system forever.
> 
> With adding some dump_stack we could see how that happened.
> 
> calling  rockchip_pcie_driver_init+0x0/0x28 @ 1
> rockchip-pcie f8000000.pcie: no vpcie3v3 regulator found
> rockchip-pcie f8000000.pcie: no vpcie1v8 regulator found
> rockchip-pcie f8000000.pcie: no vpcie0v9 regulator found
> rockchip-pcie f8000000.pcie: PCIe link training gen1 timeout!
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-rc3-next-20170807-ARCH+ #189
> Hardware name: Firefly-RK3399 Board (DT)
> Call trace:
> [<ffff000008089bf0>] dump_backtrace+0x0/0x250
> [<ffff000008089eb0>] show_stack+0x20/0x28
> [<ffff000008c3313c>] dump_stack+0x90/0xb0
> [<ffff000008632ad4>] rockchip_pcie_read.isra.11+0x54/0x58
> [<ffff0000086334fc>] rockchip_pcie_client_irq_handler+0x30/0x1a0
> [<ffff00000813ce98>] __free_irq+0x1c8/0x2dc
> [<ffff00000813d044>] free_irq+0x44/0x74
> [<ffff0000081415fc>] devm_irq_release+0x24/0x2c
> [<ffff00000877429c>] release_nodes+0x1d8/0x30c
> [<ffff000008774838>] devres_release_all+0x3c/0x5c
> [<ffff00000876f19c>] driver_probe_device+0x244/0x494
> [<ffff00000876f50c>] __driver_attach+0x120/0x124
> [<ffff00000876cb80>] bus_for_each_dev+0x6c/0xac
> [<ffff00000876e984>] driver_attach+0x2c/0x34
> [<ffff00000876e3a4>] bus_add_driver+0x244/0x2b0
> [<ffff000008770264>] driver_register+0x70/0x110
> [<ffff0000087718b4>] platform_driver_register+0x60/0x6c
> [<ffff0000091eb108>] rockchip_pcie_driver_init+0x20/0x28
> [<ffff000008083a2c>] do_one_initcall+0xc8/0x130
> [<ffff0000091a0ea8>] kernel_init_freeable+0x1a0/0x238
> [<ffff000008c461cc>] kernel_init+0x18/0x108
> [<ffff0000080836c0>] ret_from_fork+0x10/0x50
> 
> In order to fix this, we remove all the clock-disabling from
> the error handle path and driver's remove function. And replying
> on the devm_add_action_or_reset to fire the clock-disabling at
> the appropriate time. Also split out rockchip_pcie_setup_irq
> and move requesting irq after enabling clks to avoid this kind
> of issues.
> 
> Signed-off-by: Shawn Lin <shawn.lin at rock-chips.com>
> 
> ---
> 
> Changes in v4:
> - split out rockchip_pcie_enable_clocks and reuse
>   rockchip_pcie_enable_clocks and rockchip_pcie_disable_clocks
>   for elsewhere suggested by Jeffy
> 
> Changes in v3:
> - check the return value of devm_add_action_or_reset and spilt out
>   rockchip_pcie_setup_irq in order to move requesting irq after
>   enabling clks.
> 
> Changes in v2:
> - use devm_add_action_or_reset to fix this ordering suggested by
>   Heiko and Jeffy. Thanks!
> 
>  drivers/pci/host/pcie-rockchip.c | 209 +++++++++++++++++++++------------------
>  1 file changed, 112 insertions(+), 97 deletions(-)
> 
> diff --git a/drivers/pci/host/pcie-rockchip.c b/drivers/pci/host/pcie-rockchip.c
> index 39aafe2..e8b90aa 100644
> --- a/drivers/pci/host/pcie-rockchip.c
> +++ b/drivers/pci/host/pcie-rockchip.c
> @@ -939,6 +939,51 @@ static int rockchip_pcie_get_phys(struct rockchip_pcie *rockchip)
>  	return 0;
>  }
>  

This patch is getting a little large and awkward.  Can you please
split it up into smaller pieces?

> +static int rockchip_pcie_setup_irq(struct rockchip_pcie *rockchip)

For example, this new rockchip_pcie_setup_irq() is extracted verbatim
from rockchip_pcie_parse_dt().  Can you make a trivial patch that
moves that code and calls it directly from rockchip_pcie_parse_dt()?
That won't fix anything, but it will be trivial to review and it will
make the subsequent patch that *does* fix something much easier to
understand because it won't be cluttered with the refactoring.

> +{
> +	int irq, err;
> +	struct device *dev = rockchip->dev;
> +	struct platform_device *pdev = to_platform_device(dev);
> +
> +	irq = platform_get_irq_byname(pdev, "sys");
> +	if (irq < 0) {
> +		dev_err(dev, "missing sys IRQ resource\n");
> +		return -EINVAL;
> +	}
> +
> +	err = devm_request_irq(dev, irq, rockchip_pcie_subsys_irq_handler,
> +			       IRQF_SHARED, "pcie-sys", rockchip);
> +	if (err) {
> +		dev_err(dev, "failed to request PCIe subsystem IRQ\n");
> +		return err;
> +	}
> +
> +	irq = platform_get_irq_byname(pdev, "legacy");
> +	if (irq < 0) {
> +		dev_err(dev, "missing legacy IRQ resource\n");
> +		return -EINVAL;
> +	}
> +
> +	irq_set_chained_handler_and_data(irq,
> +					 rockchip_pcie_legacy_int_handler,
> +					 rockchip);
> +
> +	irq = platform_get_irq_byname(pdev, "client");
> +	if (irq < 0) {
> +		dev_err(dev, "missing client IRQ resource\n");
> +		return -EINVAL;
> +	}
> +
> +	err = devm_request_irq(dev, irq, rockchip_pcie_client_irq_handler,
> +			       IRQF_SHARED, "pcie-client", rockchip);
> +	if (err) {
> +		dev_err(dev, "failed to request PCIe client IRQ\n");
> +		return err;
> +	}
> +
> +	return 0;
> +}
> +
>  /**
>   * rockchip_pcie_parse_dt - Parse Device Tree
>   * @rockchip: PCIe port information
> @@ -951,7 +996,6 @@ static int rockchip_pcie_parse_dt(struct rockchip_pcie *rockchip)
>  	struct platform_device *pdev = to_platform_device(dev);
>  	struct device_node *node = dev->of_node;
>  	struct resource *regs;
> -	int irq;
>  	int err;
>  
>  	regs = platform_get_resource_byname(pdev,
> @@ -1065,42 +1109,6 @@ static int rockchip_pcie_parse_dt(struct rockchip_pcie *rockchip)
>  		return PTR_ERR(rockchip->clk_pcie_pm);
>  	}
>  
> -	irq = platform_get_irq_byname(pdev, "sys");
> -	if (irq < 0) {
> -		dev_err(dev, "missing sys IRQ resource\n");
> -		return -EINVAL;
> -	}
> -
> -	err = devm_request_irq(dev, irq, rockchip_pcie_subsys_irq_handler,
> -			       IRQF_SHARED, "pcie-sys", rockchip);
> -	if (err) {
> -		dev_err(dev, "failed to request PCIe subsystem IRQ\n");
> -		return err;
> -	}
> -
> -	irq = platform_get_irq_byname(pdev, "legacy");
> -	if (irq < 0) {
> -		dev_err(dev, "missing legacy IRQ resource\n");
> -		return -EINVAL;
> -	}
> -
> -	irq_set_chained_handler_and_data(irq,
> -					 rockchip_pcie_legacy_int_handler,
> -					 rockchip);
> -
> -	irq = platform_get_irq_byname(pdev, "client");
> -	if (irq < 0) {
> -		dev_err(dev, "missing client IRQ resource\n");
> -		return -EINVAL;
> -	}
> -
> -	err = devm_request_irq(dev, irq, rockchip_pcie_client_irq_handler,
> -			       IRQF_SHARED, "pcie-client", rockchip);
> -	if (err) {
> -		dev_err(dev, "failed to request PCIe client IRQ\n");
> -		return err;
> -	}
> -
>  	rockchip->vpcie12v = devm_regulator_get_optional(dev, "vpcie12v");
>  	if (IS_ERR(rockchip->vpcie12v)) {
>  		if (PTR_ERR(rockchip->vpcie12v) == -EPROBE_DEFER)
> @@ -1371,6 +1379,57 @@ static int rockchip_pcie_wait_l2(struct rockchip_pcie *rockchip)
>  	return 0;
>  }
>  
> +static int rockchip_pcie_enable_clocks(
> +				struct rockchip_pcie *rockchip)

Similarly, this new rockchip_pcie_enable_clocks() factors out code
from rockchip_pcie_resume_noirq() and rockchip_pcie_probe().  That's
great, but by itself it doesn't fix a bug, and having it in this patch
makes it hard to identify the actual fix.  It could be moved to
another separate patch that *only* does the refactoring.

> +{
> +	struct device *dev = rockchip->dev;
> +	int err;
> +
> +	err = clk_prepare_enable(rockchip->aclk_pcie);
> +	if (err) {
> +		dev_err(dev, "unable to enable aclk_pcie clock\n");
> +		return err;
> +	}
> +
> +	err = clk_prepare_enable(rockchip->aclk_perf_pcie);
> +	if (err) {
> +		dev_err(dev, "unable to enable aclk_perf_pcie clock\n");
> +		goto err_aclk_perf_pcie;
> +	}
> +
> +	err = clk_prepare_enable(rockchip->hclk_pcie);
> +	if (err) {
> +		dev_err(dev, "unable to enable hclk_pcie clock\n");
> +		goto err_hclk_pcie;
> +	}
> +
> +	err = clk_prepare_enable(rockchip->clk_pcie_pm);
> +	if (err) {
> +		dev_err(dev, "unable to enable clk_pcie_pm clock\n");
> +		goto err_clk_pcie_pm;
> +	}
> +
> +	return 0;
> +
> +err_clk_pcie_pm:
> +	clk_disable_unprepare(rockchip->hclk_pcie);
> +err_hclk_pcie:
> +	clk_disable_unprepare(rockchip->aclk_perf_pcie);
> +err_aclk_perf_pcie:
> +	clk_disable_unprepare(rockchip->aclk_pcie);
> +	return err;
> +}
> +
> +static void rockchip_pcie_disable_clocks(void *data)

This could probably be a third small refactoring patch.  When you get
done, the actual bug-fixing patch will be trivial and it will be
obvious exactly what you're changing.

> +{
> +	struct rockchip_pcie *rockchip = data;
> +
> +	clk_disable_unprepare(rockchip->clk_pcie_pm);
> +	clk_disable_unprepare(rockchip->hclk_pcie);
> +	clk_disable_unprepare(rockchip->aclk_perf_pcie);
> +	clk_disable_unprepare(rockchip->aclk_pcie);
> +}
> +
>  static int __maybe_unused rockchip_pcie_suspend_noirq(struct device *dev)
>  {
>  	struct rockchip_pcie *rockchip = dev_get_drvdata(dev);
> @@ -1394,10 +1453,7 @@ static int __maybe_unused rockchip_pcie_suspend_noirq(struct device *dev)
>  		phy_exit(rockchip->phys[i]);
>  	}
>  
> -	clk_disable_unprepare(rockchip->clk_pcie_pm);
> -	clk_disable_unprepare(rockchip->hclk_pcie);
> -	clk_disable_unprepare(rockchip->aclk_perf_pcie);
> -	clk_disable_unprepare(rockchip->aclk_pcie);
> +	rockchip_pcie_disable_clocks(rockchip);
>  
>  	if (!IS_ERR(rockchip->vpcie0v9))
>  		regulator_disable(rockchip->vpcie0v9);
> @@ -1418,21 +1474,9 @@ static int __maybe_unused rockchip_pcie_resume_noirq(struct device *dev)
>  		}
>  	}
>  
> -	err = clk_prepare_enable(rockchip->clk_pcie_pm);
> -	if (err)
> -		goto err_pcie_pm;
> -
> -	err = clk_prepare_enable(rockchip->hclk_pcie);
> -	if (err)
> -		goto err_hclk_pcie;
> -
> -	err = clk_prepare_enable(rockchip->aclk_perf_pcie);
> -	if (err)
> -		goto err_aclk_perf_pcie;
> -
> -	err = clk_prepare_enable(rockchip->aclk_pcie);
> +	err = rockchip_pcie_enable_clocks(rockchip);
>  	if (err)
> -		goto err_aclk_pcie;
> +		return err;
>  
>  	err = rockchip_pcie_init_port(rockchip);
>  	if (err)
> @@ -1449,14 +1493,7 @@ static int __maybe_unused rockchip_pcie_resume_noirq(struct device *dev)
>  	return 0;
>  
>  err_pcie_resume:
> -	clk_disable_unprepare(rockchip->aclk_pcie);
> -err_aclk_pcie:
> -	clk_disable_unprepare(rockchip->aclk_perf_pcie);
> -err_aclk_perf_pcie:
> -	clk_disable_unprepare(rockchip->hclk_pcie);
> -err_hclk_pcie:
> -	clk_disable_unprepare(rockchip->clk_pcie_pm);
> -err_pcie_pm:
> +	rockchip_pcie_disable_clocks(rockchip);
>  	return err;
>  }
>  
> @@ -1490,34 +1527,26 @@ static int rockchip_pcie_probe(struct platform_device *pdev)
>  	if (err)
>  		return err;
>  
> -	err = clk_prepare_enable(rockchip->aclk_pcie);
> -	if (err) {
> -		dev_err(dev, "unable to enable aclk_pcie clock\n");
> -		goto err_aclk_pcie;
> -	}
> +	err = rockchip_pcie_enable_clocks(rockchip);
> +	if (err)
> +		return err;
>  
> -	err = clk_prepare_enable(rockchip->aclk_perf_pcie);
> +	err = devm_add_action_or_reset(dev,
> +				       rockchip_pcie_disable_clocks,
> +				       rockchip);
>  	if (err) {
> -		dev_err(dev, "unable to enable aclk_perf_pcie clock\n");
> -		goto err_aclk_perf_pcie;
> -	}
> -
> -	err = clk_prepare_enable(rockchip->hclk_pcie);
> -	if (err) {
> -		dev_err(dev, "unable to enable hclk_pcie clock\n");
> -		goto err_hclk_pcie;
> +		dev_err(dev, "unable to add action or reset\n");
> +		return err;
>  	}
>  
> -	err = clk_prepare_enable(rockchip->clk_pcie_pm);
> -	if (err) {
> -		dev_err(dev, "unable to enable hclk_pcie clock\n");
> -		goto err_pcie_pm;
> -	}
> +	err = rockchip_pcie_setup_irq(rockchip);
> +	if (err)
> +		return err;
>  
>  	err = rockchip_pcie_set_vpcie(rockchip);
>  	if (err) {
>  		dev_err(dev, "failed to set vpcie regulator\n");
> -		goto err_set_vpcie;
> +		return err;
>  	}
>  
>  	err = rockchip_pcie_init_port(rockchip);
> @@ -1614,15 +1643,6 @@ static int rockchip_pcie_probe(struct platform_device *pdev)
>  		regulator_disable(rockchip->vpcie1v8);
>  	if (!IS_ERR(rockchip->vpcie0v9))
>  		regulator_disable(rockchip->vpcie0v9);
> -err_set_vpcie:
> -	clk_disable_unprepare(rockchip->clk_pcie_pm);
> -err_pcie_pm:
> -	clk_disable_unprepare(rockchip->hclk_pcie);
> -err_hclk_pcie:
> -	clk_disable_unprepare(rockchip->aclk_perf_pcie);
> -err_aclk_perf_pcie:
> -	clk_disable_unprepare(rockchip->aclk_pcie);
> -err_aclk_pcie:
>  	return err;
>  }
>  
> @@ -1644,11 +1664,6 @@ static int rockchip_pcie_remove(struct platform_device *pdev)
>  		phy_exit(rockchip->phys[i]);
>  	}
>  
> -	clk_disable_unprepare(rockchip->clk_pcie_pm);
> -	clk_disable_unprepare(rockchip->hclk_pcie);
> -	clk_disable_unprepare(rockchip->aclk_perf_pcie);
> -	clk_disable_unprepare(rockchip->aclk_pcie);
> -
>  	if (!IS_ERR(rockchip->vpcie12v))
>  		regulator_disable(rockchip->vpcie12v);
>  	if (!IS_ERR(rockchip->vpcie3v3))
> -- 
> 1.9.1
> 
> 



More information about the Linux-rockchip mailing list