[PATCH 07/10] mci: add support for discarding write blocks

Yann Sionneau ysionneau at kalrayinc.com
Tue Jul 30 02:23:04 PDT 2024


Hello Ahmad,

Le 7/30/24 à 09:19, Ahmad Fatoum a écrit :
> From: Ahmad Fatoum <ahmad at a3f.at>
>
> A common optimization for flashing is to skip gaps between partitions
> and only flash actual data. The problem with that is that data from
> the previous installation may persist and confuse later software, e.g.
> an existing barebox state partition not contained in the image may
> survive, when the expectation is that a new state is created.
>
> eMMCs can support three different commands for erasing data:
>
>    - Erase: This has the widest support, but works on only on the level
>      of erase groups that may span multiple write blocks.
>      The region reads as either '0' or '1' afterwards.
>
>    - Trim: New in eMMC v4.4. This erases write blocks.
>      The region reads as either '0' or '1' afterwards.
>
>    - Discard: New in eMMC v4.5. This discards write blocks. It's up to the
>      card what action if any is taken.
>      All or part of the data may remain accessible.
>
> All of these don't enforce a actual physical NAND erase operation.
> In case erasure does happen, it may happen later at a convenient time.
>
> All of them, except for discard guarantee that a fixed pattern (either
> all zeroes or all ones) will be read back after the erase operation
> concludes. Therefore let's use them in barebox to implement the erase
> command.
>
> The behavior of the erase command will be similar to the Linux
> blkdiscard -z command when the erase byte value is all-zeroes. In Linux
> blkdiscard -z is emulated with zero writes if the erased byte is 0xFF,
> but for barebox, the erase operation won't care whether it writes 0x00
> or 0xFF.
>
> Note that this is considerably slower than need be, because we don't
> erase multiple erase groups at once. Doing so could run into the
> send_cmd timeout of the host hardware or its drivers. The correct
> workaround is to calculate the duration of the erase operation and split
> up the erases, so we always stay below a specific erase operation
> duration. There's a comment that explains this and implementing it is
> left as future exercise.
>
> Signed-off-by: Ahmad Fatoum <a.fatoum at pengutronix.de>
> ---
>   drivers/mci/Kconfig    |   5 +
>   drivers/mci/mci-core.c | 393 ++++++++++++++++++++++++++++++++++++++---
>   include/mci.h          |  10 ++
>   3 files changed, 387 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/mci/Kconfig b/drivers/mci/Kconfig
> index 1e8c85643b9a..f760614725fa 100644
> --- a/drivers/mci/Kconfig
> +++ b/drivers/mci/Kconfig
> @@ -40,6 +40,11 @@ config MCI_WRITE
>   	default y
>   	select DISK_WRITE
>   
> +config MCI_ERASE
> +	bool "Support erasing MCI cards"
> +	depends on MCI_WRITE
> +	default y
> +
>   config MCI_MMC_BOOT_PARTITIONS
>   	bool "support MMC boot partitions"
>   	help
> diff --git a/drivers/mci/mci-core.c b/drivers/mci/mci-core.c
> index 3a5fb0330700..200b23ffb604 100644
> --- a/drivers/mci/mci-core.c
> +++ b/drivers/mci/mci-core.c
> @@ -20,6 +20,7 @@
>   #include <disks.h>
>   #include <of.h>
>   #include <linux/err.h>
> +#include <linux/log2.h>
>   #include <linux/sizes.h>
>   #include <dma.h>
>   
> @@ -168,6 +169,32 @@ static int mci_send_status(struct mci *mci, unsigned int *status)
>   	return ret;
>   }
>   
> +static int mci_app_sd_status(struct mci *mci, __be32 *ssr)
> +{
> +	int err;
> +	struct mci_cmd cmd;
> +	struct mci_data data;
> +
> +	cmd.cmdidx = MMC_CMD_APP_CMD;
> +	cmd.resp_type = MMC_RSP_R1;
> +	cmd.cmdarg = mci->rca << 16;
> +
> +	err = mci_send_cmd_retry(mci, &cmd, NULL, 4);
> +	if (err)
> +		return err;
> +
> +	cmd.cmdidx = SD_CMD_APP_SD_STATUS;
> +	cmd.resp_type = MMC_RSP_R1;
> +	cmd.cmdarg = 0;
> +
> +	data.dest = (u8 *)ssr;
> +	data.blocksize = 64;
> +	data.blocks = 1;
> +	data.flags = MMC_DATA_READ;
> +
> +	return mci_send_cmd_retry(mci, &cmd, &data, 3);
> +}
> +
>   static int mmc_switch_status_error(struct mci_host *host, u32 status)
>   {
>   	if (mmc_host_is_spi(host)) {
> @@ -286,6 +313,56 @@ static int mci_block_write(struct mci *mci, const void *src, int blocknum,
>   	return ret;
>   }
>   
> +/**
> + * Erase one or several blocks of data to the card
> + * @param mci_dev MCI instance
> + * @param from Block number to start erasing from
> + * @param to inclusive last block number to erase to
> + * @return Transaction status (0 on success)
> + */
> +static int mci_block_erase(struct mci *card, unsigned int from,
> +			   unsigned int to, unsigned int arg)
> +{
> +	struct mci_cmd cmd = {};
> +	int err;
> +
> +	if (!card->high_capacity) {
> +		from *= card->write_bl_len;
> +		to *= card->write_bl_len;
> +	}
> +
> +	cmd.cmdidx = IS_SD(card) ? SD_ERASE_WR_BLK_START : MMC_ERASE_GROUP_START;
> +	cmd.cmdarg = from;
> +	cmd.resp_type = MMC_RSP_R1;
> +	err = mci_send_cmd(card, &cmd, NULL);
> +	if (err)
> +		goto err_out;
> +
> +	memset(&cmd, 0, sizeof(struct mci_cmd));
> +	cmd.cmdidx = IS_SD(card) ? SD_ERASE_WR_BLK_END : MMC_ERASE_GROUP_END;
> +	cmd.cmdarg = to;
> +	cmd.resp_type = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
> +	err = mci_send_cmd(card, &cmd, NULL);
> +	if (err)
> +		goto err_out;
> +
> +	memset(&cmd, 0, sizeof(struct mci_cmd));
> +	cmd.cmdidx = MMC_ERASE;
> +	cmd.cmdarg = arg;
> +	cmd.resp_type = MMC_RSP_R1b;
> +
> +	err = mci_send_cmd(card, &cmd, NULL);
> +	if (err)
> +		goto err_out;
> +
> +	return 0;
> +
> +err_out:
> +	dev_err(&card->dev, "erase cmd %d error %d, status %#x\n",
> +		cmd.cmdidx, err, cmd.response[0]);
> +	return -EIO;
> +}
> +
>   /**
>    * Read one or several block(s) of data from the card
>    * @param mci MCI instance
> @@ -773,6 +850,49 @@ static int sd_switch(struct mci *mci, unsigned mode, unsigned group,
>   	return mci_send_cmd(mci, &cmd, &data);
>   }
>   
> +static int sd_read_ssr(struct mci *mci)
> +{
> +	static const unsigned int sd_au_size[] = {
> +		0,		SZ_16K / 512,		SZ_32K / 512,
> +		SZ_64K / 512,	SZ_128K / 512,		SZ_256K / 512,
> +		SZ_512K / 512,	SZ_1M / 512,		SZ_2M / 512,
> +		SZ_4M / 512,	SZ_8M / 512,		(SZ_8M + SZ_4M) / 512,
> +		SZ_16M / 512,	(SZ_16M + SZ_8M) / 512,	SZ_32M / 512,
> +		SZ_64M / 512,
> +	};
> +	__be32 *ssr;
> +	int err;
> +	unsigned int au, eo, et, es;
> +
> +	if (!IS_ENABLED(CONFIG_MCI_ERASE))
> +		return -ENOSYS;
> +
> +	ssr = dma_alloc(64);
> +
> +	err = mci_app_sd_status(mci, ssr);
> +	if (err)
> +		goto out;
> +
> +	au = (be32_to_cpu(ssr[2]) >> 12) & 0xF;
> +	if ((au <= 9) || (mci->version == SD_VERSION_3)) {
> +		mci->ssr.au = sd_au_size[au];
> +		es = (be32_to_cpu(ssr[3]) >> 24) & 0xFF;
> +		es |= (be32_to_cpu(ssr[2]) & 0xFF) << 8;
> +		et = (be32_to_cpu(ssr[3]) >> 18) & 0x3F;
> +		if (es && et) {
> +			eo = (be32_to_cpu(ssr[3]) >> 16) & 0x3;
> +			mci->ssr.erase_timeout = (et * 1000) / es;
> +			mci->ssr.erase_offset = eo * 1000;
> +		}
> +	} else {
> +		dev_dbg(&mci->dev, "invalid allocation unit size.\n");
> +	}
> +
> +out:
> +	dma_free(ssr);
> +	return err;
> +}
> +
>   /**
>    * Change transfer frequency for an SD card
>    * @param mci MCI instance
> @@ -845,6 +965,11 @@ static int sd_change_freq(struct mci *mci)
>   	if (mci->scr[0] & SD_DATA_4BIT)
>   		mci->card_caps |= MMC_CAP_4_BIT_DATA;
>   
> +	if (mci->scr[0] & SD_DATA_STAT_AFTER_ERASE)
> +		mci->erased_byte = 0xFF;
> +	else
> +		mci->erased_byte = 0x0;
> +
>   	/* Version 1.0 doesn't support switching */
>   	if (mci->version == SD_VERSION_1_0)
>   		return 0;
> @@ -1083,6 +1208,47 @@ static void mci_extract_block_lengths_from_csd(struct mci *mci)
>   		mci->write_bl_len, mci->read_bl_len);
>   }
>   
> +/**
> + * Extract erase group size
> + * @param mci MCI instance
> + */
> +static void mci_extract_erase_group_size(struct mci *mci)
> +{
> +	if (!IS_ENABLED(CONFIG_MCI_ERASE) ||
> +	    !(UNSTUFF_BITS(mci->csd, 84, 12) & CCC_ERASE))
> +		return;
> +
> +
> +	if (IS_SD(mci) && UNSTUFF_BITS(mci->csd, 126, 2) != 0) {
> +		/* For SD with csd_struct v1, erase group is always one sector */
> +		mci->erase_grp_size = 1;
> +	} else {
> +		if (mci->ext_csd[EXT_CSD_ERASE_GROUP_DEF] & 0x01) {
> +			/* Read out group size from ext_csd */
> +			mci->erase_grp_size = mci->ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE] * 1024;
> +		} else {
> +			/* Calculate the group size from the csd value. */
> +			int erase_gsz, erase_gmul;
> +
> +			erase_gsz = (mci->csd[2] & 0x00007c00) >> 10;
> +			erase_gmul = (mci->csd[2] & 0x000003e0) >> 5;
> +			mci->erase_grp_size = (erase_gsz + 1)
> +				* (erase_gmul + 1);
> +		}
> +
> +		if (mci->ext_csd[EXT_CSD_ERASED_MEM_CONT])
> +			mci->erased_byte = 0xFF;
> +		else
> +			mci->erased_byte = 0x0;
> +
> +		if (mci->ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT] & EXT_CSD_SEC_FEATURE_TRIM_EN)
> +			mci->can_trim = true;
> +	}
> +
> +	dev_dbg(&mci->dev, "Erase group is %u sector(s). Trim %ssupported\n",
> +		mci->erase_grp_size, mci->can_trim ? "" : "not ");
> +}
> +
>   /**
>    * Extract card's capacitiy from the CSD
>    * @param mci MCI instance
> @@ -1229,6 +1395,10 @@ static int mci_startup_sd(struct mci *mci)
>   
>   	mci_set_clock(mci, mci->tran_speed);
>   
> +	err = sd_read_ssr(mci);
> +	if (err)
> +		dev_dbg(&mci->dev, "unable to read ssr: %pe\n", ERR_PTR(err));
> +
>   	return 0;
>   }
>   
> @@ -1700,6 +1870,7 @@ static int mci_startup(struct mci *mci)
>   	dev_info(&mci->dev, "detected %s card version %s\n", IS_SD(mci) ? "SD" : "MMC",
>   		mci_version_string(mci));
>   	mci_extract_card_capacity_from_csd(mci);
> +	mci_extract_erase_group_size(mci);
>   
>   	if (IS_SD(mci))
>   		err = mci_startup_sd(mci);
> @@ -1791,6 +1962,189 @@ static int mci_blk_part_switch(struct mci_part *part)
>   
>   /* ------------------ attach to the blocklayer --------------------------- */
>   
> +static int mci_sd_check_write(struct mci *mci, const char *op,
> +			      sector_t block, blkcnt_t num_blocks)
> +{
> +	struct mci_host *host = mci->host;
> +
> +	if (!host->disable_wp &&
> +	    host->ops.card_write_protected && host->ops.card_write_protected(host)) {
> +		dev_err(&mci->dev, "card write protected\n");
> +		return -EPERM;
> +	}
> +
> +	dev_dbg(&mci->dev, "%s: %s %llu block(s), starting at %llu\n",
> +		__func__, op, num_blocks, block);
> +
> +	if (mci->write_bl_len != SECTOR_SIZE) {
> +		dev_dbg(&mci->dev, "MMC/SD block size is not %d bytes (its %u bytes instead)\n",
> +				SECTOR_SIZE, mci->read_bl_len);
> +		return -EINVAL;
> +	}
> +
> +	/* size of the block number field in the MMC/SD command is 32 bit only */
> +	if (block > MAX_BUFFER_NUMBER) {
> +		dev_dbg(&mci->dev, "Cannot handle block number %llu. Too large!\n", block);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static unsigned int mmc_align_erase_size(struct mci *card,
> +					 sector_t *from,
> +					 sector_t *to,
> +					 blkcnt_t nr)
> +{
> +	unsigned int from_new = *from, to_new, nr_new = nr, rem;
> +
> +	/*
> +	 * When the 'card->erase_size' is power of 2, we can use round_up/down()
> +	 * to align the erase size efficiently.
> +	 */
> +	if (is_power_of_2(card->erase_grp_size)) {
> +		unsigned int temp = from_new;
> +
> +		from_new = round_up(temp, card->erase_grp_size);
> +		rem = from_new - temp;
> +
> +		if (nr_new > rem)
> +			nr_new -= rem;
> +		else
> +			return 0;
> +
> +		nr_new = round_down(nr_new, card->erase_grp_size);
> +	} else {
> +		rem = from_new % card->erase_grp_size;
> +		if (rem) {
> +			rem = card->erase_grp_size - rem;
> +			from_new += rem;
> +			if (nr_new > rem)
> +				nr_new -= rem;
> +			else
> +				return 0;
> +		}
> +
> +		rem = nr_new % card->erase_grp_size;
> +		if (rem)
> +			nr_new -= rem;
> +	}
> +
> +	if (nr_new == 0)
> +		return 0;
> +
> +	to_new = from_new + nr_new;
> +
> +	if (*to != to_new || *from != from_new)
> +		dev_warn(&card->dev, "Erase range changed to [0x%x-0x%x] because of %u sector erase group\n",
> +			 from_new, to_new, card->erase_grp_size);
> +
> +	*to = to_new;
> +	*from = from_new;
> +
> +	return nr_new;
> +}
> +
> +/**
> + * Erase a memory region
> + * @param blk All info about the block device we need
> + * @param block first block to erase
> + * @param num_blocks Number of blocks to erase
> + * @return 0 on success, anything else on failure
> + *
> + */
> +static int mci_sd_erase(struct block_device *blk, sector_t from,
> +			blkcnt_t blkcnt)
> +{
> +	struct mci_part *part = container_of(blk, struct mci_part, blk);
> +	struct mci *mci = part->mci;
> +	sector_t i = 0;
> +	unsigned arg;
> +	sector_t to = from + blkcnt;
> +	int rc;
> +
> +	mci_blk_part_switch(part);
> +
> +	rc = mci_sd_check_write(mci, "Erase", from, blkcnt);
> +	if (rc)
> +		return rc;
> +
> +	if (!mci->erase_grp_size)
> +		return -EOPNOTSUPP;
> +
> +	if (mci->can_trim) {
> +		arg = MMC_TRIM_ARG;
> +	} else {
> +		/* We don't use discard, as it doesn't guarantee a fixed value */
> +		arg = MMC_ERASE_ARG;
> +		blkcnt = mmc_align_erase_size(mci, &from, &to, blkcnt);
> +	}
> +
> +	if (blkcnt == 0)
> +		return 0;
> +
> +	if (to <= from)
> +		return -EINVAL;
> +
> +	/* 'from' and 'to' are inclusive */
> +	to -= 1;
> +
> +	while (i < blkcnt) {
> +		sector_t blk_r;
> +
> +		/* TODO: While it's possible to clear many erase groups at once
> +		 * and it greatly improves throughput, drivers need adjustment:
> +		 *
> +		 * Many drivers hardcode a maximal wait time before aborting
> +		 * the wait for R1b and returning -ETIMEDOUT. With long
> +		 * erases/trims, we are bound to run into this timeout, so for now
> +		 * we just split into suifficiently small erases that are unlikely

suifficiently -> sufficiently

> +		 * to trigger the time.
time -> timeout?
> +		 *
> +		 * What Linux does and what we should be doing in barebox is:
> +		 *
> +		 *  - add a struct mci_cmd::busy_timeout member that drivers should
> +		 *    use instead of hardcoding their own timeout delay. The busy
> +		 *    timeout length can be calculated by the MCI core after
> +		 *    consulting the appropriate CSD/EXT_CSD/SSR registers.
> +		 *
> +		 *  - add a struct mci_host::max_busy_timeout member, where drivers
> +		 *    can indicate the maximum timeout they are able to support.
> +		 *    The MCI core will never set a busy_timeout that exceeds this
> +		 *    value.
> +		 *
> +		 *  Example Samsung eMMC 8GTF4:
> +		 *
> +		 *    time erase /dev/mmc2.part_of_512m # 1024 trims
> +		 *    time: 2849ms
> +		 *
> +		 *    time erase /dev/mmc2.part_of_512m # single trim
> +		 *    time: 56ms
> +		 */
> +
> +		if (IS_SD(mci) && mci->ssr.au) {
> +			blk_r = ((blkcnt - i) > mci->ssr.au) ?
> +				mci->ssr.au : (blkcnt - i);
> +		} else {
> +			blk_r = ((blkcnt - i) > mci->erase_grp_size) ?
> +				mci->erase_grp_size : (blkcnt - i);
> +		}
> +
> +		rc =  mci_block_erase(mci, from, to, arg);
> +		if (rc)
> +			break;
> +
> +		/* Waiting for the ready status */
> +		rc = mci_poll_until_ready(mci, 1000 /* ms */);
> +		if (rc)
> +			break;
> +
> +		i += blk_r;
> +	}
> +
> +	return i == blkcnt ? 0 : rc;
> +}
> +
>   /**
>    * Write a chunk of sectors to media
>    * @param blk All info about the block device we need
> @@ -1806,7 +2160,6 @@ static int mci_sd_write(struct block_device *blk,
>   {
>   	struct mci_part *part = container_of(blk, struct mci_part, blk);
>   	struct mci *mci = part->mci;
> -	struct mci_host *host = mci->host;
>   	int rc;
>   	blkcnt_t max_req_block = num_blocks;
>   	blkcnt_t write_block;
> @@ -1816,26 +2169,9 @@ static int mci_sd_write(struct block_device *blk,
>   
>   	mci_blk_part_switch(part);
>   
> -	if (!host->disable_wp &&
> -	    host->ops.card_write_protected && host->ops.card_write_protected(host)) {
> -		dev_err(&mci->dev, "card write protected\n");
> -		return -EPERM;
> -	}
> -
> -	dev_dbg(&mci->dev, "%s: Write %llu block(s), starting at %llu\n",
> -		__func__, num_blocks, block);
> -
> -	if (mci->write_bl_len != SECTOR_SIZE) {
> -		dev_dbg(&mci->dev, "MMC/SD block size is not %d bytes (its %u bytes instead)\n",
> -				SECTOR_SIZE, mci->read_bl_len);
> -		return -EINVAL;
> -	}
> -
> -	/* size of the block number field in the MMC/SD command is 32 bit only */
> -	if (block > MAX_BUFFER_NUMBER) {
> -		dev_dbg(&mci->dev, "Cannot handle block number %llu. Too large!\n", block);
> -		return -EINVAL;
> -	}
> +	rc = mci_sd_check_write(mci, "Write", block, num_blocks);
> +	if (rc)
> +		return rc;
>   
>   	while (num_blocks) {
>   		write_block = min(num_blocks, max_req_block);
> @@ -2123,6 +2459,20 @@ static void mci_info(struct device *dev)
>   		return;
>   	printf("  Version: %s\n", mci_version_string(mci));
>   	printf("  Capacity: %u MiB\n", (unsigned)(mci->capacity >> 20));
> +	if (CONFIG_MCI_ERASE) {
> +		printf("  Erase support:");
> +		if (mci->can_trim)
> +			printf(" trim");
> +		if (mci->erase_grp_size) {
> +			printf(" erase(%u sector%s%s)", mci->ssr.au ?: mci->erase_grp_size,
> +			       mci->erase_grp_size > 1 ? "s" : "",
> +			       mci->ssr.au ? " AU" : "");
> +		}
> +		if (mci->can_trim || mci->erase_grp_size)
> +			printf(", erase value: 0x%02x\n", mci->erased_byte);
> +		else
> +			printf(" none\n");
> +	}
>   
>   	if (mci->high_capacity)
>   		printf("  High capacity card\n");
> @@ -2180,6 +2530,7 @@ static int mci_check_if_already_initialized(struct mci *mci)
>   static struct block_device_ops mci_ops = {
>   	.read = mci_sd_read,
>   	.write = IS_ENABLED(CONFIG_MCI_WRITE) ? mci_sd_write : NULL,
> +	.erase = IS_ENABLED(CONFIG_MCI_ERASE) ? mci_sd_erase : NULL,
>   };
>   
>   static int mci_set_boot(struct param_d *param, void *priv)
> diff --git a/include/mci.h b/include/mci.h
> index 610040937ee5..3bf1455a401c 100644
> --- a/include/mci.h
> +++ b/include/mci.h
> @@ -616,6 +616,12 @@ struct mci_part {
>   #define MMC_BLK_DATA_AREA_RPMB	(1<<3)
>   };
>   
> +struct sd_ssr {
> +	unsigned int au;                /* In sectors */
> +	unsigned int erase_timeout;     /* In milliseconds */
> +	unsigned int erase_offset;      /* In milliseconds */
> +};
> +
>   /** MMC/SD and interface instance information */
>   struct mci {
>   	struct mci_host *host;		/**< the host for this card */
> @@ -629,16 +635,20 @@ struct mci {
>   	unsigned short rca;	/**< relative card address */
>   	u8 sdio:1;              /**< card is a SDIO card */
>   	u8 high_capacity:1;	/**< high capacity card is connected (OCR -> OCR_HCS) */
> +	u8 can_trim:1;		/**< high capacity card is connected (OCR -> OCR_HCS) */
> +	u8 erased_byte;
>   	unsigned tran_speed;	/**< Maximum transfer speed */
>   	/** currently used data block length for read accesses */
>   	unsigned read_bl_len;
>   	/** currently used data block length for write accesses */
>   	unsigned write_bl_len;
> +	unsigned erase_grp_size;
>   	uint64_t capacity;	/**< Card's data capacity in bytes */
>   	int ready_for_use;	/** true if already probed */
>   	int dsr_imp;		/**< DSR implementation state from CSD */
>   	u8 *ext_csd;
>   	int probe;
> +	struct sd_ssr ssr;
>   	int bootpart;
>   	int boot_ack_enable;
>   







More information about the barebox mailing list