[PATCH]CFI fix of kernel deadlock caused by get_chip recursion

Thu Oct 18 10:34:53 EDT 2007

On Thu, 18 Oct 2007, Alexey Korolev wrote:

> Hi,
> 
> This patch solves kernel deadlock issue seen on JFFF2 simultaneous operations.
> Detailed investigation of the issue showed that the kernel deadlock is caused
> bytons of recursive get_chip calls.
> 
> MTD:CFI: This fix removes recursion in get_chip function to avoid possible
> deadlock situation.
> 
> Signed-off-by: Alexey Korolev <akorolev at infradead.org>

OK, now that you reversed the order for the two functions, the prototype 
for chip_ready() becomes redundent.  There is plenty of redundent 
prototypes in that file already, so whatever.

Acked-By: Nicolas Pitre <nico at cam.org>

> ---
> diff -aur a/drivers/mtd/chips/cfi_cmdset_0001.c
> b/drivers/mtd/chips/cfi_cmdset_0001.c
> --- a/drivers/mtd/chips/cfi_cmdset_0001.c	2007-09-25 14:51:06.000000000
> +0400
> +++ b/drivers/mtd/chips/cfi_cmdset_0001.c	2007-10-18 17:57:35.000000000
> +0400
> @@ -85,6 +85,7 @@
>  static void cfi_intelext_unpoint (struct mtd_info *mtd, u_char *addr, loff_t
> from,
>  			size_t len);
> 
> +static int chip_ready (struct map_info *map, struct flchip *chip, unsigned
> long adr, int mode);
>  static int get_chip(struct map_info *map, struct flchip *chip, unsigned long
> adr, int mode);
>  static void put_chip(struct map_info *map, struct flchip *chip, unsigned long
> adr);
>  #include "fwh_lock.h"
> @@ -641,73 +642,14 @@
>  /*
>   *  *********** CHIP ACCESS FUNCTIONS ***********
>   */
> -
> -static int get_chip(struct map_info *map, struct flchip *chip, unsigned long
> adr, int mode)
> +static int chip_ready (struct map_info *map, struct flchip *chip, unsigned
> long adr, int mode)
>  {
> +
>  	DECLARE_WAITQUEUE(wait, current);
>  	struct cfi_private *cfi = map->fldrv_priv;
>  	map_word status, status_OK = CMD(0x80), status_PWS = CMD(0x01);
> -	unsigned long timeo;
>  	struct cfi_pri_intelext *cfip = cfi->cmdset_priv;
> -
> - resettime:
> -	timeo = jiffies + HZ;
> - retry:
> -	if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING || mode ==
> FL_OTP_WRITE)) {
> -		/*
> -		 * OK. We have possibility for contension on the write/erase
> -		 * operations which are global to the real chip and not per
> -		 * partition.  So let's fight it over in the partition which
> -		 * currently has authority on the operation.
> -		 *
> -		 * The rules are as follows:
> -		 *
> -		 * - any write operation must own shared->writing.
> -		 *
> -		 * - any erase operation must own _both_ shared->writing and
> -		 *   shared->erasing.
> -		 *
> -		 * - contension arbitration is handled in the owner's context.
> -		 *
> -		 * The 'shared' struct can be read and/or written only when
> -		 * its lock is taken.
> -		 */
> -		struct flchip_shared *shared = chip->priv;
> -		struct flchip *contender;
> -		spin_lock(&shared->lock);
> -		contender = shared->writing;
> -		if (contender && contender != chip) {
> -			/*
> -			 * The engine to perform desired operation on this
> -			 * partition is already in use by someone else.
> -			 * Let's fight over it in the context of the chip
> -			 * currently using it.  If it is possible to suspend,
> -			 * that other partition will do just that, otherwise
> -			 * it'll happily send us to sleep.  In any case, when
> -			 * get_chip returns success we're clear to go ahead.
> -			 */
> -			int ret = spin_trylock(contender->mutex);
> -			spin_unlock(&shared->lock);
> -			if (!ret)
> -				goto retry;
> -			spin_unlock(chip->mutex);
> -			ret = get_chip(map, contender, contender->start,
> mode);
> -			spin_lock(chip->mutex);
> -			if (ret) {
> -				spin_unlock(contender->mutex);
> -				return ret;
> -			}
> -			timeo = jiffies + HZ;
> -			spin_lock(&shared->lock);
> -			spin_unlock(contender->mutex);
> -		}
> -
> -		/* We now own it */
> -		shared->writing = chip;
> -		if (mode == FL_ERASING)
> -			shared->erasing = chip;
> -		spin_unlock(&shared->lock);
> -	}
> +	unsigned long timeo = jiffies + HZ;
> 
>  	switch (chip->state) {
> 
> @@ -722,16 +664,11 @@
>  			if (chip->priv && map_word_andequal(map, status,
> status_PWS, status_PWS))
>  				break;
> 
> -			if (time_after(jiffies, timeo)) {
> -				printk(KERN_ERR "%s: Waiting for chip to be
> ready timed out. Status %lx\n",
> -				       map->name, status.x[0]);
> -				return -EIO;
> -			}
>  			spin_unlock(chip->mutex);
>  			cfi_udelay(1);
>  			spin_lock(chip->mutex);
>  			/* Someone else might have been playing with it. */
> -			goto retry;
> +			return -EAGAIN;
>  		}
> 
>  	case FL_READY:
> @@ -806,8 +743,81 @@
>  		schedule();
>  		remove_wait_queue(&chip->wq, &wait);
>  		spin_lock(chip->mutex);
> -		goto resettime;
> +		return -EAGAIN;
> +	}
> +}
> +
> +static int get_chip(struct map_info *map, struct flchip *chip, unsigned long
> adr, int mode)
> +{
> +	int ret;
> +	struct cfi_private *cfi = map->fldrv_priv;
> +
> + retry:
> +	if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING || mode ==
> FL_OTP_WRITE)) {
> +		/*
> +		 * OK. We have possibility for contension on the write/erase
> +		 * operations which are global to the real chip and not per
> +		 * partition.  So let's fight it over in the partition which
> +		 * currently has authority on the operation.
> +		 *
> +		 * The rules are as follows:
> +		 *
> +		 * - any write operation must own shared->writing.
> +		 *
> +		 * - any erase operation must own _both_ shared->writing and
> +		 *   shared->erasing.
> +		 *
> +		 * - contension arbitration is handled in the owner's context.
> +		 *
> +		 * The 'shared' struct can be read and/or written only when
> +		 * its lock is taken.
> +		 */
> +		struct flchip_shared *shared = chip->priv;
> +		struct flchip *contender;
> +		spin_lock(&shared->lock);
> +		contender = shared->writing;
> +		if (contender && contender != chip) {
> +			/*
> +			 * The engine to perform desired operation on this
> +			 * partition is already in use by someone else.
> +			 * Let's fight over it in the context of the chip
> +			 * currently using it.  If it is possible to suspend,
> +			 * that other partition will do just that, otherwise
> +			 * it'll happily send us to sleep.  In any case, when
> +			 * get_chip returns success we're clear to go ahead.
> +			 */
> +			ret = spin_trylock(contender->mutex);
> +			spin_unlock(&shared->lock);
> +			if (!ret)
> +				goto retry;
> +			spin_unlock(chip->mutex);
> +			ret = chip_ready(map, contender, contender->start,
> mode);
> +			spin_lock(chip->mutex);
> +
> +			if (ret == -EAGAIN) {
> +				spin_unlock(contender->mutex);
> +				goto retry;
> +			}
> +			if (ret) {
> +				spin_unlock(contender->mutex);
> +				return ret;
> +			}
> +			spin_lock(&shared->lock);
> +			spin_unlock(contender->mutex);
> +		}
> +
> +		/* We now own it */
> +		shared->writing = chip;
> +		if (mode == FL_ERASING)
> +			shared->erasing = chip;
> +		spin_unlock(&shared->lock);
>  	}
> +
> +	ret = chip_ready(map, chip, adr, mode);
> +	if (ret == -EAGAIN)
> +		goto retry;
> +
> +	return ret;
>  }
> 
>  static void put_chip(struct map_info *map, struct flchip *chip, unsigned long
> adr)
> 
> Thanks,
> Alexey
> 

Nicolas