[LEDE-DEV] [PATCH v2 3/8] apm821xx: dw_dmac: backport fixes and cleanups from 4.7

Wed Jul 13 12:23:16 PDT 2016

This patch fixes the dw_dmac dma engine which is used
by the SATA controllers in the MyBook Live Series and WNDR4700.

The code was backported from the upstream kernel.
It can be dropped completely on 4.7+.

Signed-off-by: Christian Lamparter <chunkeey at gmail.com>
---
 ...dd-transfer-termination-synchronization-s.patch |  143 ++
 ...ore-Introduce-new-universal-API-to-reques.patch |  345 +++++
 ...dd-transfer-termination-synchronization-s.patch |  293 ++++
 .../patches-4.4/015-dmaengine-dw-fixed.patch       | 1522 ++++++++++++++++++++
 4 files changed, 2303 insertions(+)
 create mode 100644 target/linux/apm821xx/patches-4.4/010-dmaengine-Add-transfer-termination-synchronization-s.patch
 create mode 100644 target/linux/apm821xx/patches-4.4/011-dmaengine-core-Introduce-new-universal-API-to-reques.patch
 create mode 100644 target/linux/apm821xx/patches-4.4/012-dmaengine-Add-transfer-termination-synchronization-s.patch
 create mode 100644 target/linux/apm821xx/patches-4.4/015-dmaengine-dw-fixed.patch

diff --git a/target/linux/apm821xx/patches-4.4/010-dmaengine-Add-transfer-termination-synchronization-s.patch b/target/linux/apm821xx/patches-4.4/010-dmaengine-Add-transfer-termination-synchronization-s.patch
new file mode 100644
index 0000000..924f797
--- /dev/null
+++ b/target/linux/apm821xx/patches-4.4/010-dmaengine-Add-transfer-termination-synchronization-s.patch
@@ -0,0 +1,143 @@
+From 7bd903c5ca47fde5ad52370a47776491813c772e Mon Sep 17 00:00:00 2001
+From: Peter Ujfalusi <peter.ujfalusi at ti.com>
+Date: Mon, 14 Dec 2015 22:47:39 +0200
+Subject: [PATCH 1/3] dmaengine: core: Move and merge the code paths using
+ private_candidate
+
+Channel matching with private_candidate() is used in two paths, the error
+checking is slightly different in them and they are duplicating code also.
+Move the code under find_candidate() to provide consistent execution and
+to allow us to reuse this mode of channel lookup later.
+
+Signed-off-by: Peter Ujfalusi <peter.ujfalusi at ti.com>
+Reviewed-by: Andy Shevchenko <andy.shevchenko at gmail.com>
+Reviewed-by: Arnd Bergmann <arnd at arndb.de>
+Signed-off-by: Vinod Koul <vinod.koul at intel.com>
+---
+ drivers/dma/dmaengine.c | 81 +++++++++++++++++++++++++------------------------
+ 1 file changed, 42 insertions(+), 39 deletions(-)
+
+diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
+index f2cbff9..81a36fc 100644
+--- a/drivers/dma/dmaengine.c
++++ b/drivers/dma/dmaengine.c
+@@ -542,6 +542,42 @@ static struct dma_chan *private_candidate(const dma_cap_mask_t *mask,
+ 	return NULL;
+ }
+ 
++static struct dma_chan *find_candidate(struct dma_device *device,
++				       const dma_cap_mask_t *mask,
++				       dma_filter_fn fn, void *fn_param)
++{
++	struct dma_chan *chan = private_candidate(mask, device, fn, fn_param);
++	int err;
++
++	if (chan) {
++		/* Found a suitable channel, try to grab, prep, and return it.
++		 * We first set DMA_PRIVATE to disable balance_ref_count as this
++		 * channel will not be published in the general-purpose
++		 * allocator
++		 */
++		dma_cap_set(DMA_PRIVATE, device->cap_mask);
++		device->privatecnt++;
++		err = dma_chan_get(chan);
++
++		if (err) {
++			if (err == -ENODEV) {
++				pr_debug("%s: %s module removed\n", __func__,
++					 dma_chan_name(chan));
++				list_del_rcu(&device->global_node);
++			} else
++				pr_debug("%s: failed to get %s: (%d)\n",
++					 __func__, dma_chan_name(chan), err);
++
++			if (--device->privatecnt == 0)
++				dma_cap_clear(DMA_PRIVATE, device->cap_mask);
++
++			chan = ERR_PTR(err);
++		}
++	}
++
++	return chan ? chan : ERR_PTR(-EPROBE_DEFER);
++}
++
+ /**
+  * dma_get_slave_channel - try to get specific channel exclusively
+  * @chan: target channel
+@@ -580,7 +616,6 @@ struct dma_chan *dma_get_any_slave_channel(struct dma_device *device)
+ {
+ 	dma_cap_mask_t mask;
+ 	struct dma_chan *chan;
+-	int err;
+ 
+ 	dma_cap_zero(mask);
+ 	dma_cap_set(DMA_SLAVE, mask);
+@@ -588,23 +623,11 @@ struct dma_chan *dma_get_any_slave_channel(struct dma_device *device)
+ 	/* lock against __dma_request_channel */
+ 	mutex_lock(&dma_list_mutex);
+ 
+-	chan = private_candidate(&mask, device, NULL, NULL);
+-	if (chan) {
+-		dma_cap_set(DMA_PRIVATE, device->cap_mask);
+-		device->privatecnt++;
+-		err = dma_chan_get(chan);
+-		if (err) {
+-			pr_debug("%s: failed to get %s: (%d)\n",
+-				__func__, dma_chan_name(chan), err);
+-			chan = NULL;
+-			if (--device->privatecnt == 0)
+-				dma_cap_clear(DMA_PRIVATE, device->cap_mask);
+-		}
+-	}
++	chan = find_candidate(device, &mask, NULL, NULL);
+ 
+ 	mutex_unlock(&dma_list_mutex);
+ 
+-	return chan;
++	return IS_ERR(chan) ? NULL : chan;
+ }
+ EXPORT_SYMBOL_GPL(dma_get_any_slave_channel);
+ 
+@@ -621,35 +644,15 @@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
+ {
+ 	struct dma_device *device, *_d;
+ 	struct dma_chan *chan = NULL;
+-	int err;
+ 
+ 	/* Find a channel */
+ 	mutex_lock(&dma_list_mutex);
+ 	list_for_each_entry_safe(device, _d, &dma_device_list, global_node) {
+-		chan = private_candidate(mask, device, fn, fn_param);
+-		if (chan) {
+-			/* Found a suitable channel, try to grab, prep, and
+-			 * return it.  We first set DMA_PRIVATE to disable
+-			 * balance_ref_count as this channel will not be
+-			 * published in the general-purpose allocator
+-			 */
+-			dma_cap_set(DMA_PRIVATE, device->cap_mask);
+-			device->privatecnt++;
+-			err = dma_chan_get(chan);
++		chan = find_candidate(device, mask, fn, fn_param);
++		if (!IS_ERR(chan))
++			break;
+ 
+-			if (err == -ENODEV) {
+-				pr_debug("%s: %s module removed\n",
+-					 __func__, dma_chan_name(chan));
+-				list_del_rcu(&device->global_node);
+-			} else if (err)
+-				pr_debug("%s: failed to get %s: (%d)\n",
+-					 __func__, dma_chan_name(chan), err);
+-			else
+-				break;
+-			if (--device->privatecnt == 0)
+-				dma_cap_clear(DMA_PRIVATE, device->cap_mask);
+-			chan = NULL;
+-		}
++		chan = NULL;
+ 	}
+ 	mutex_unlock(&dma_list_mutex);
+ 
+-- 
+2.8.1
+
diff --git a/target/linux/apm821xx/patches-4.4/011-dmaengine-core-Introduce-new-universal-API-to-reques.patch b/target/linux/apm821xx/patches-4.4/011-dmaengine-core-Introduce-new-universal-API-to-reques.patch
new file mode 100644
index 0000000..0296714
--- /dev/null
+++ b/target/linux/apm821xx/patches-4.4/011-dmaengine-core-Introduce-new-universal-API-to-reques.patch
@@ -0,0 +1,345 @@
+From a8135d0d79e9d0ad3a4ff494fceeaae838becf38 Mon Sep 17 00:00:00 2001
+From: Peter Ujfalusi <peter.ujfalusi at ti.com>
+Date: Mon, 14 Dec 2015 22:47:40 +0200
+Subject: [PATCH 2/3] dmaengine: core: Introduce new, universal API to request
+ a channel
+
+The two API functions can cover most, if not all, current APIs used to
+request a channel. With minimal effort dmaengine drivers, platforms and
+dmaengine user drivers can be converted to use the two functions.
+
+struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask);
+
+To request any channel matching with the requested capabilities, can be
+used to request channel for memcpy, memset, xor, etc where no hardware
+synchronization is needed.
+
+struct dma_chan *dma_request_chan(struct device *dev, const char *name);
+To request a slave channel. The dma_request_chan() will try to find the
+channel via DT, ACPI or in case if the kernel booted in non DT/ACPI mode
+it will use a filter lookup table and retrieves the needed information from
+the dma_slave_map provided by the DMA drivers.
+This legacy mode needs changes in platform code, in dmaengine drivers and
+finally the dmaengine user drivers can be converted:
+
+For each dmaengine driver an array of DMA device, slave and the parameter
+for the filter function needs to be added:
+
+static const struct dma_slave_map da830_edma_map[] = {
+	{ "davinci-mcasp.0", "rx", EDMA_FILTER_PARAM(0, 0) },
+	{ "davinci-mcasp.0", "tx", EDMA_FILTER_PARAM(0, 1) },
+	{ "davinci-mcasp.1", "rx", EDMA_FILTER_PARAM(0, 2) },
+	{ "davinci-mcasp.1", "tx", EDMA_FILTER_PARAM(0, 3) },
+	{ "davinci-mcasp.2", "rx", EDMA_FILTER_PARAM(0, 4) },
+	{ "davinci-mcasp.2", "tx", EDMA_FILTER_PARAM(0, 5) },
+	{ "spi_davinci.0", "rx", EDMA_FILTER_PARAM(0, 14) },
+	{ "spi_davinci.0", "tx", EDMA_FILTER_PARAM(0, 15) },
+	{ "da830-mmc.0", "rx", EDMA_FILTER_PARAM(0, 16) },
+	{ "da830-mmc.0", "tx", EDMA_FILTER_PARAM(0, 17) },
+	{ "spi_davinci.1", "rx", EDMA_FILTER_PARAM(0, 18) },
+	{ "spi_davinci.1", "tx", EDMA_FILTER_PARAM(0, 19) },
+};
+
+This information is going to be needed by the dmaengine driver, so
+modification to the platform_data is needed, and the driver map should be
+added to the pdata of the DMA driver:
+
+da8xx_edma0_pdata.slave_map = da830_edma_map;
+da8xx_edma0_pdata.slavecnt = ARRAY_SIZE(da830_edma_map);
+
+The DMA driver then needs to configure the needed device -> filter_fn
+mapping before it registers with dma_async_device_register() :
+
+ecc->dma_slave.filter_map.map = info->slave_map;
+ecc->dma_slave.filter_map.mapcnt = info->slavecnt;
+ecc->dma_slave.filter_map.fn = edma_filter_fn;
+
+When neither DT nor ACPI lookup is available, dma_request_chan() will
+try to match the requester's device name with the filter_map's list of
+device names; when a match is found it will use the information from the
+dma_slave_map to get the channel with the dma_get_channel() internal
+function.
+
+Signed-off-by: Peter Ujfalusi <peter.ujfalusi at ti.com>
+Reviewed-by: Arnd Bergmann <arnd at arndb.de>
+Signed-off-by: Vinod Koul <vinod.koul at intel.com>
+---
+ Documentation/dmaengine/client.txt | 23 +++-------
+ drivers/dma/dmaengine.c            | 89 +++++++++++++++++++++++++++++++++-----
+ include/linux/dmaengine.h          | 51 +++++++++++++++++++---
+ 3 files changed, 127 insertions(+), 36 deletions(-)
+
+diff --git a/Documentation/dmaengine/client.txt b/Documentation/dmaengine/client.txt
+index 11fb87f..4b04d89 100644
+--- a/Documentation/dmaengine/client.txt
++++ b/Documentation/dmaengine/client.txt
+@@ -22,25 +22,14 @@ The slave DMA usage consists of following steps:
+    Channel allocation is slightly different in the slave DMA context,
+    client drivers typically need a channel from a particular DMA
+    controller only and even in some cases a specific channel is desired.
+-   To request a channel dma_request_channel() API is used.
++   To request a channel dma_request_chan() API is used.
+ 
+    Interface:
+-	struct dma_chan *dma_request_channel(dma_cap_mask_t mask,
+-			dma_filter_fn filter_fn,
+-			void *filter_param);
+-   where dma_filter_fn is defined as:
+-	typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param);
+-
+-   The 'filter_fn' parameter is optional, but highly recommended for
+-   slave and cyclic channels as they typically need to obtain a specific
+-   DMA channel.
+-
+-   When the optional 'filter_fn' parameter is NULL, dma_request_channel()
+-   simply returns the first channel that satisfies the capability mask.
+-
+-   Otherwise, the 'filter_fn' routine will be called once for each free
+-   channel which has a capability in 'mask'.  'filter_fn' is expected to
+-   return 'true' when the desired DMA channel is found.
++	struct dma_chan *dma_request_chan(struct device *dev, const char *name);
++
++   Which will find and return the 'name' DMA channel associated with the 'dev'
++   device. The association is done via DT, ACPI or board file based
++   dma_slave_map matching table.
+ 
+    A channel allocated via this interface is exclusive to the caller,
+    until dma_release_channel() is called.
+diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
+index 81a36fc..a094dbb 100644
+--- a/drivers/dma/dmaengine.c
++++ b/drivers/dma/dmaengine.c
+@@ -43,6 +43,7 @@
+ 
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+ 
++#include <linux/platform_device.h>
+ #include <linux/dma-mapping.h>
+ #include <linux/init.h>
+ #include <linux/module.h>
+@@ -665,27 +666,73 @@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
+ }
+ EXPORT_SYMBOL_GPL(__dma_request_channel);
+ 
++static const struct dma_slave_map *dma_filter_match(struct dma_device *device,
++						    const char *name,
++						    struct device *dev)
++{
++	int i;
++
++	if (!device->filter.mapcnt)
++		return NULL;
++
++	for (i = 0; i < device->filter.mapcnt; i++) {
++		const struct dma_slave_map *map = &device->filter.map[i];
++
++		if (!strcmp(map->devname, dev_name(dev)) &&
++		    !strcmp(map->slave, name))
++			return map;
++	}
++
++	return NULL;
++}
++
+ /**
+- * dma_request_slave_channel_reason - try to allocate an exclusive slave channel
++ * dma_request_chan - try to allocate an exclusive slave channel
+  * @dev:	pointer to client device structure
+  * @name:	slave channel name
+  *
+  * Returns pointer to appropriate DMA channel on success or an error pointer.
+  */
+-struct dma_chan *dma_request_slave_channel_reason(struct device *dev,
+-						  const char *name)
++struct dma_chan *dma_request_chan(struct device *dev, const char *name)
+ {
++	struct dma_device *d, *_d;
++	struct dma_chan *chan = NULL;
++
+ 	/* If device-tree is present get slave info from here */
+ 	if (dev->of_node)
+-		return of_dma_request_slave_channel(dev->of_node, name);
++		chan = of_dma_request_slave_channel(dev->of_node, name);
+ 
+ 	/* If device was enumerated by ACPI get slave info from here */
+-	if (ACPI_HANDLE(dev))
+-		return acpi_dma_request_slave_chan_by_name(dev, name);
++	if (has_acpi_companion(dev) && !chan)
++		chan = acpi_dma_request_slave_chan_by_name(dev, name);
++
++	if (chan) {
++		/* Valid channel found or requester need to be deferred */
++		if (!IS_ERR(chan) || PTR_ERR(chan) == -EPROBE_DEFER)
++			return chan;
++	}
++
++	/* Try to find the channel via the DMA filter map(s) */
++	mutex_lock(&dma_list_mutex);
++	list_for_each_entry_safe(d, _d, &dma_device_list, global_node) {
++		dma_cap_mask_t mask;
++		const struct dma_slave_map *map = dma_filter_match(d, name, dev);
++
++		if (!map)
++			continue;
++
++		dma_cap_zero(mask);
++		dma_cap_set(DMA_SLAVE, mask);
+ 
+-	return ERR_PTR(-ENODEV);
++		chan = find_candidate(d, &mask, d->filter.fn, map->param);
++		if (!IS_ERR(chan))
++			break;
++	}
++	mutex_unlock(&dma_list_mutex);
++
++	return chan ? chan : ERR_PTR(-EPROBE_DEFER);
+ }
+-EXPORT_SYMBOL_GPL(dma_request_slave_channel_reason);
++EXPORT_SYMBOL_GPL(dma_request_chan);
+ 
+ /**
+  * dma_request_slave_channel - try to allocate an exclusive slave channel
+@@ -697,17 +744,35 @@ EXPORT_SYMBOL_GPL(dma_request_slave_channel_reason);
+ struct dma_chan *dma_request_slave_channel(struct device *dev,
+ 					   const char *name)
+ {
+-	struct dma_chan *ch = dma_request_slave_channel_reason(dev, name);
++	struct dma_chan *ch = dma_request_chan(dev, name);
+ 	if (IS_ERR(ch))
+ 		return NULL;
+ 
+-	dma_cap_set(DMA_PRIVATE, ch->device->cap_mask);
+-	ch->device->privatecnt++;
+-
+ 	return ch;
+ }
+ EXPORT_SYMBOL_GPL(dma_request_slave_channel);
+ 
++/**
++ * dma_request_chan_by_mask - allocate a channel satisfying certain capabilities
++ * @mask: capabilities that the channel must satisfy
++ *
++ * Returns pointer to appropriate DMA channel on success or an error pointer.
++ */
++struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask)
++{
++	struct dma_chan *chan;
++
++	if (!mask)
++		return ERR_PTR(-ENODEV);
++
++	chan = __dma_request_channel(mask, NULL, NULL);
++	if (!chan)
++		chan = ERR_PTR(-ENODEV);
++
++	return chan;
++}
++EXPORT_SYMBOL_GPL(dma_request_chan_by_mask);
++
+ void dma_release_channel(struct dma_chan *chan)
+ {
+ 	mutex_lock(&dma_list_mutex);
+diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
+index c47c68e..d50a6b51 100644
+--- a/include/linux/dmaengine.h
++++ b/include/linux/dmaengine.h
+@@ -607,11 +607,38 @@ enum dmaengine_alignment {
+ };
+ 
+ /**
++ * struct dma_slave_map - associates slave device and its slave channel with
++ * parameter to be used by a filter function
++ * @devname: name of the device
++ * @slave: slave channel name
++ * @param: opaque parameter to pass to struct dma_filter.fn
++ */
++struct dma_slave_map {
++	const char *devname;
++	const char *slave;
++	void *param;
++};
++
++/**
++ * struct dma_filter - information for slave device/channel to filter_fn/param
++ * mapping
++ * @fn: filter function callback
++ * @mapcnt: number of slave device/channel in the map
++ * @map: array of channel to filter mapping data
++ */
++struct dma_filter {
++	dma_filter_fn fn;
++	int mapcnt;
++	const struct dma_slave_map *map;
++};
++
++/**
+  * struct dma_device - info on the entity supplying DMA services
+  * @chancnt: how many DMA channels are supported
+  * @privatecnt: how many DMA channels are requested by dma_request_channel
+  * @channels: the list of struct dma_chan
+  * @global_node: list_head for global dma_device_list
++ * @filter: information for device/slave to filter function/param mapping
+  * @cap_mask: one or more dma_capability flags
+  * @max_xor: maximum number of xor sources, 0 if no capability
+  * @max_pq: maximum number of PQ sources and PQ-continue capability
+@@ -666,6 +693,7 @@ struct dma_device {
+ 	unsigned int privatecnt;
+ 	struct list_head channels;
+ 	struct list_head global_node;
++	struct dma_filter filter;
+ 	dma_cap_mask_t  cap_mask;
+ 	unsigned short max_xor;
+ 	unsigned short max_pq;
+@@ -1140,9 +1168,11 @@ enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx);
+ void dma_issue_pending_all(void);
+ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
+ 					dma_filter_fn fn, void *fn_param);
+-struct dma_chan *dma_request_slave_channel_reason(struct device *dev,
+-						  const char *name);
+ struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name);
++
++struct dma_chan *dma_request_chan(struct device *dev, const char *name);
++struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask);
++
+ void dma_release_channel(struct dma_chan *chan);
+ int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps);
+ #else
+@@ -1166,16 +1196,21 @@ static inline struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
+ {
+ 	return NULL;
+ }
+-static inline struct dma_chan *dma_request_slave_channel_reason(
+-					struct device *dev, const char *name)
+-{
+-	return ERR_PTR(-ENODEV);
+-}
+ static inline struct dma_chan *dma_request_slave_channel(struct device *dev,
+ 							 const char *name)
+ {
+ 	return NULL;
+ }
++static inline struct dma_chan *dma_request_chan(struct device *dev,
++						const char *name)
++{
++	return ERR_PTR(-ENODEV);
++}
++static inline struct dma_chan *dma_request_chan_by_mask(
++						const dma_cap_mask_t *mask)
++{
++	return ERR_PTR(-ENODEV);
++}
+ static inline void dma_release_channel(struct dma_chan *chan)
+ {
+ }
+@@ -1186,6 +1221,8 @@ static inline int dma_get_slave_caps(struct dma_chan *chan,
+ }
+ #endif
+ 
++#define dma_request_slave_channel_reason(dev, name) dma_request_chan(dev, name)
++
+ static inline int dmaengine_desc_set_reuse(struct dma_async_tx_descriptor *tx)
+ {
+ 	struct dma_slave_caps caps;
+-- 
+2.8.1
+
diff --git a/target/linux/apm821xx/patches-4.4/012-dmaengine-Add-transfer-termination-synchronization-s.patch b/target/linux/apm821xx/patches-4.4/012-dmaengine-Add-transfer-termination-synchronization-s.patch
new file mode 100644
index 0000000..8fcf8ca
--- /dev/null
+++ b/target/linux/apm821xx/patches-4.4/012-dmaengine-Add-transfer-termination-synchronization-s.patch
@@ -0,0 +1,293 @@
+From b36f09c3c441a6e59eab9315032e7d546571de3f Mon Sep 17 00:00:00 2001
+From: Lars-Peter Clausen <lars at metafoo.de>
+Date: Tue, 20 Oct 2015 11:46:28 +0200
+Subject: [PATCH] dmaengine: Add transfer termination synchronization support
+
+The DMAengine API has a long standing race condition that is inherent to
+the API itself. Calling dmaengine_terminate_all() is supposed to stop and
+abort any pending or active transfers that have previously been submitted.
+Unfortunately it is possible that this operation races against a currently
+running (or with some drivers also scheduled) completion callback.
+
+Since the API allows dmaengine_terminate_all() to be called from atomic
+context as well as from within a completion callback it is not possible to
+synchronize to the execution of the completion callback from within
+dmaengine_terminate_all() itself.
+
+This means that a user of the DMAengine API does not know when it is safe
+to free resources used in the completion callback, which can result in a
+use-after-free race condition.
+
+This patch addresses the issue by introducing an explicit synchronization
+primitive to the DMAengine API called dmaengine_synchronize().
+
+The existing dmaengine_terminate_all() is deprecated in favor of
+dmaengine_terminate_sync() and dmaengine_terminate_async(). The former
+aborts all pending and active transfers and synchronizes to the current
+context, meaning it will wait until all running completion callbacks have
+finished. This means it is only possible to call this function from
+non-atomic context. The latter function does not synchronize, but can still
+be used in atomic context or from within a complete callback. It has to be
+followed up by dmaengine_synchronize() before a client can free the
+resources used in a completion callback.
+
+In addition to this the semantics of the device_terminate_all() callback
+are slightly relaxed by this patch. It is now OK for a driver to only
+schedule the termination of the active transfer, but does not necessarily
+have to wait until the DMA controller has completely stopped. The driver
+must ensure though that the controller has stopped and no longer accesses
+any memory when the device_synchronize() callback returns.
+
+This was in part done since most drivers do not pay attention to this
+anyway at the moment and to emphasize that this needs to be done when the
+device_synchronize() callback is implemented. But it also helps with
+implementing support for devices where stopping the controller can require
+operations that may sleep.
+
+Signed-off-by: Lars-Peter Clausen <lars at metafoo.de>
+Signed-off-by: Vinod Koul <vinod.koul at intel.com>
+---
+ Documentation/dmaengine/client.txt   | 38 ++++++++++++++-
+ Documentation/dmaengine/provider.txt | 20 +++++++-
+ drivers/dma/dmaengine.c              |  5 +-
+ include/linux/dmaengine.h            | 90 ++++++++++++++++++++++++++++++++++++
+ 4 files changed, 148 insertions(+), 5 deletions(-)
+
+diff --git a/Documentation/dmaengine/client.txt b/Documentation/dmaengine/client.txt
+index 11fb87f..d9f9f46 100644
+--- a/Documentation/dmaengine/client.txt
++++ b/Documentation/dmaengine/client.txt
+@@ -128,7 +128,7 @@ The slave DMA usage consists of following steps:
+ 	transaction.
+ 
+ 	For cyclic DMA, a callback function may wish to terminate the
+-	DMA via dmaengine_terminate_all().
++	DMA via dmaengine_terminate_async().
+ 
+ 	Therefore, it is important that DMA engine drivers drop any
+ 	locks before calling the callback function which may cause a
+@@ -166,12 +166,29 @@ The slave DMA usage consists of following steps:
+ 
+ Further APIs:
+ 
+-1. int dmaengine_terminate_all(struct dma_chan *chan)
++1. int dmaengine_terminate_sync(struct dma_chan *chan)
++   int dmaengine_terminate_async(struct dma_chan *chan)
++   int dmaengine_terminate_all(struct dma_chan *chan) /* DEPRECATED */
+ 
+    This causes all activity for the DMA channel to be stopped, and may
+    discard data in the DMA FIFO which hasn't been fully transferred.
+    No callback functions will be called for any incomplete transfers.
+ 
++   Two variants of this function are available.
++
++   dmaengine_terminate_async() might not wait until the DMA has been fully
++   stopped or until any running complete callbacks have finished. But it is
++   possible to call dmaengine_terminate_async() from atomic context or from
++   within a complete callback. dmaengine_synchronize() must be called before it
++   is safe to free the memory accessed by the DMA transfer or free resources
++   accessed from within the complete callback.
++
++   dmaengine_terminate_sync() will wait for the transfer and any running
++   complete callbacks to finish before it returns. But the function must not be
++   called from atomic context or from within a complete callback.
++
++   dmaengine_terminate_all() is deprecated and should not be used in new code.
++
+ 2. int dmaengine_pause(struct dma_chan *chan)
+ 
+    This pauses activity on the DMA channel without data loss.
+@@ -197,3 +214,20 @@ Further APIs:
+ 	a running DMA channel.  It is recommended that DMA engine users
+ 	pause or stop (via dmaengine_terminate_all()) the channel before
+ 	using this API.
++
++5. void dmaengine_synchronize(struct dma_chan *chan)
++
++  Synchronize the termination of the DMA channel to the current context.
++
++  This function should be used after dmaengine_terminate_async() to synchronize
++  the termination of the DMA channel to the current context. The function will
++  wait for the transfer and any running complete callbacks to finish before it
++  returns.
++
++  If dmaengine_terminate_async() is used to stop the DMA channel this function
++  must be called before it is safe to free memory accessed by previously
++  submitted descriptors or to free any resources accessed within the complete
++  callback of previously submitted descriptors.
++
++  The behavior of this function is undefined if dma_async_issue_pending() has
++  been called between dmaengine_terminate_async() and this function.
+diff --git a/Documentation/dmaengine/provider.txt b/Documentation/dmaengine/provider.txt
+index 67d4ce4..122b7f4 100644
+--- a/Documentation/dmaengine/provider.txt
++++ b/Documentation/dmaengine/provider.txt
+@@ -327,8 +327,24 @@ supported.
+ 
+    * device_terminate_all
+      - Aborts all the pending and ongoing transfers on the channel
+-     - This command should operate synchronously on the channel,
+-       terminating right away all the channels
++     - For aborted transfers the complete callback should not be called
++     - Can be called from atomic context or from within a complete
++       callback of a descriptor. Must not sleep. Drivers must be able
++       to handle this correctly.
++     - Termination may be asynchronous. The driver does not have to
++       wait until the currently active transfer has completely stopped.
++       See device_synchronize.
++
++   * device_synchronize
++     - Must synchronize the termination of a channel to the current
++       context.
++     - Must make sure that memory for previously submitted
++       descriptors is no longer accessed by the DMA controller.
++     - Must make sure that all complete callbacks for previously
++       submitted descriptors have finished running and none are
++       scheduled to run.
++     - May sleep.
++
+ 
+ Misc notes (stuff that should be documented, but don't really know
+ where to put them)
+diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
+index 3ecec14..d6fc82e 100644
+--- a/drivers/dma/dmaengine.c
++++ b/drivers/dma/dmaengine.c
+@@ -265,8 +265,11 @@ static void dma_chan_put(struct dma_chan *chan)
+ 	module_put(dma_chan_to_owner(chan));
+ 
+ 	/* This channel is not in use anymore, free it */
+-	if (!chan->client_count && chan->device->device_free_chan_resources)
++	if (!chan->client_count && chan->device->device_free_chan_resources) {
++		/* Make sure all operations have completed */
++		dmaengine_synchronize(chan);
+ 		chan->device->device_free_chan_resources(chan);
++	}
+ 
+ 	/* If the channel is used via a DMA request router, free the mapping */
+ 	if (chan->router && chan->router->route_free) {
+diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
+index c47c68e..4662d9a 100644
+--- a/include/linux/dmaengine.h
++++ b/include/linux/dmaengine.h
+@@ -654,6 +654,8 @@ enum dmaengine_alignment {
+  *	paused. Returns 0 or an error code
+  * @device_terminate_all: Aborts all transfers on a channel. Returns 0
+  *	or an error code
++ * @device_synchronize: Synchronizes the termination of transfers to the
++ *  current context.
+  * @device_tx_status: poll for transaction completion, the optional
+  *	txstate parameter can be supplied with a pointer to get a
+  *	struct with auxiliary transfer status information, otherwise the call
+@@ -737,6 +739,7 @@ struct dma_device {
+ 	int (*device_pause)(struct dma_chan *chan);
+ 	int (*device_resume)(struct dma_chan *chan);
+ 	int (*device_terminate_all)(struct dma_chan *chan);
++	void (*device_synchronize)(struct dma_chan *chan);
+ 
+ 	enum dma_status (*device_tx_status)(struct dma_chan *chan,
+ 					    dma_cookie_t cookie,
+@@ -828,6 +831,13 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg(
+ 			src_sg, src_nents, flags);
+ }
+ 
++/**
++ * dmaengine_terminate_all() - Terminate all active DMA transfers
++ * @chan: The channel for which to terminate the transfers
++ *
++ * This function is DEPRECATED use either dmaengine_terminate_sync() or
++ * dmaengine_terminate_async() instead.
++ */
+ static inline int dmaengine_terminate_all(struct dma_chan *chan)
+ {
+ 	if (chan->device->device_terminate_all)
+@@ -836,6 +846,86 @@ static inline int dmaengine_terminate_all(struct dma_chan *chan)
+ 	return -ENOSYS;
+ }
+ 
++/**
++ * dmaengine_terminate_async() - Terminate all active DMA transfers
++ * @chan: The channel for which to terminate the transfers
++ *
++ * Calling this function will terminate all active and pending descriptors
++ * that have previously been submitted to the channel. It is not guaranteed
++ * though that the transfer for the active descriptor has stopped when the
++ * function returns. Furthermore it is possible the complete callback of a
++ * submitted transfer is still running when this function returns.
++ *
++ * dmaengine_synchronize() needs to be called before it is safe to free
++ * any memory that is accessed by previously submitted descriptors or before
++ * freeing any resources accessed from within the completion callback of any
++ * previously submitted descriptors.
++ *
++ * This function can be called from atomic context as well as from within a
++ * complete callback of a descriptor submitted on the same channel.
++ *
++ * If none of the two conditions above apply consider using
++ * dmaengine_terminate_sync() instead.
++ */
++static inline int dmaengine_terminate_async(struct dma_chan *chan)
++{
++	if (chan->device->device_terminate_all)
++		return chan->device->device_terminate_all(chan);
++
++	return -EINVAL;
++}
++
++/**
++ * dmaengine_synchronize() - Synchronize DMA channel termination
++ * @chan: The channel to synchronize
++ *
++ * Synchronizes the DMA channel termination to the current context. When this
++ * function returns it is guaranteed that all transfers for previously issued
++ * descriptors have stopped and it is safe to free the memory associated
++ * with them. Furthermore it is guaranteed that all complete callback functions
++ * for a previously submitted descriptor have finished running and it is safe to
++ * free resources accessed from within the complete callbacks.
++ *
++ * The behavior of this function is undefined if dma_async_issue_pending() has
++ * been called between dmaengine_terminate_async() and this function.
++ *
++ * This function must only be called from non-atomic context and must not be
++ * called from within a complete callback of a descriptor submitted on the same
++ * channel.
++ */
++static inline void dmaengine_synchronize(struct dma_chan *chan)
++{
++	if (chan->device->device_synchronize)
++		chan->device->device_synchronize(chan);
++}
++
++/**
++ * dmaengine_terminate_sync() - Terminate all active DMA transfers
++ * @chan: The channel for which to terminate the transfers
++ *
++ * Calling this function will terminate all active and pending transfers
++ * that have previously been submitted to the channel. It is similar to
++ * dmaengine_terminate_async() but guarantees that the DMA transfer has actually
++ * stopped and that all complete callbacks have finished running when the
++ * function returns.
++ *
++ * This function must only be called from non-atomic context and must not be
++ * called from within a complete callback of a descriptor submitted on the same
++ * channel.
++ */
++static inline int dmaengine_terminate_sync(struct dma_chan *chan)
++{
++	int ret;
++
++	ret = dmaengine_terminate_async(chan);
++	if (ret)
++		return ret;
++
++	dmaengine_synchronize(chan);
++
++	return 0;
++}
++
+ static inline int dmaengine_pause(struct dma_chan *chan)
+ {
+ 	if (chan->device->device_pause)
+-- 
+2.8.1
+
diff --git a/target/linux/apm821xx/patches-4.4/015-dmaengine-dw-fixed.patch b/target/linux/apm821xx/patches-4.4/015-dmaengine-dw-fixed.patch
new file mode 100644
index 0000000..96b11a8
--- /dev/null
+++ b/target/linux/apm821xx/patches-4.4/015-dmaengine-dw-fixed.patch
@@ -0,0 +1,1522 @@
+From: Andy Shevchenko <andriy.shevchenko at linux.intel.com>
+Subject: [PATCH v6 0/4] Fixes / cleanups in dw_dmac (affects on few subsystems)
+Date: Mon, 25 Apr 2016 15:35:05 +0300
+
+This patch series (v3: http://www.spinics.net/lists/kernel/msg2215303.html)
+contains a number of mostly minor fixes and cleanups for the DW DMA driver. A
+couple of them affect the DT binding so these may need to be updated to
+maintain compatibility (old format is still supported though). The rest should
+be relatively straight-forward.
+
+This version has been tested on the following bare metal platforms:
+- ATNGW100 (avr32 based platform) with dmatest
+- Sam460ex (powerpc 44x based platform) with SATA
+- Intel Braswell with UART
+- Intel Galileo (Intel Quark based platform) with UART
+
+(SATA driver and Intel Galileo UART support are based on this series and were
+ just published recently for review)
+
+Vinod, there are a few patch sets developed on top of this one, so the idea is
+to keep this in an immutable branch / tag.
+
+Changes since v5:
+- fixed an issue found by kbuildbot
+
+Changes since v4:
+- send proper set of patches
+- add changelog
+
+Changes since v3:
+- add patch 1 to check value of dma-masters property
+- drop the upstreamed patches
+- update patch 2 to keep an array for data-width property as well
+
+Changes since v2:
+- add patch 1 to fix master selection which was broken for long time
+- remove "use field-by-field initialization" patch since, as Mans mentioned, it
+  has mostly no value and might even be more error prone
+- rebase on top of recent linux-next
+- wide testing on several platforms
+
+Changes since v1:
+- zeroing struct dw_dma_slave before use
+- fall back to old data_width property if data-width is not found
+- append tags for few patches
+- correct title of cover letter
+- rebase on top of recent linux-next
+
+Andy Shevchenko (4):
+  dmaengine: dw: platform: check nr_masters to be non-zero
+  dmaengine: dw: revisit data_width property
+  dmaengine: dw: keep entire platform data in struct dw_dma
+  dmaengine: dw: pass platform data via struct dw_dma_chip
+
+ Documentation/devicetree/bindings/dma/snps-dma.txt |  6 +-
+ arch/arc/boot/dts/abilis_tb10x.dtsi                |  2 +-
+ arch/arm/boot/dts/spear13xx.dtsi                   |  4 +-
+ drivers/ata/sata_dwc_460ex.c                       |  2 +-
+ drivers/dma/dw/core.c                              | 75 ++++++++--------------
+ drivers/dma/dw/pci.c                               |  5 +-
+ drivers/dma/dw/platform.c                          | 32 +++++----
+ drivers/dma/dw/regs.h                              |  5 +-
+ include/linux/dma/dw.h                             |  5 +-
+ include/linux/platform_data/dma-dw.h               |  4 +-
+ sound/soc/intel/common/sst-firmware.c              |  2 +-
+ 11 files changed, 64 insertions(+), 78 deletions(-)
+
+--- a/drivers/dma/dw/core.c	2016-05-21 23:13:19.964478443 +0200
++++ b/drivers/dma/dw/core.c	2016-05-21 22:47:08.665465180 +0200
+@@ -45,22 +45,19 @@
+ 			DW_DMA_MSIZE_16;			\
+ 		u8 _dmsize = _is_slave ? _sconfig->dst_maxburst :	\
+ 			DW_DMA_MSIZE_16;			\
++		u8 _dms = (_dwc->direction == DMA_MEM_TO_DEV) ?		\
++			_dwc->p_master : _dwc->m_master;		\
++		u8 _sms = (_dwc->direction == DMA_DEV_TO_MEM) ?		\
++			_dwc->p_master : _dwc->m_master;		\
+ 								\
+ 		(DWC_CTLL_DST_MSIZE(_dmsize)			\
+ 		 | DWC_CTLL_SRC_MSIZE(_smsize)			\
+ 		 | DWC_CTLL_LLP_D_EN				\
+ 		 | DWC_CTLL_LLP_S_EN				\
+-		 | DWC_CTLL_DMS(_dwc->dst_master)		\
+-		 | DWC_CTLL_SMS(_dwc->src_master));		\
++		 | DWC_CTLL_DMS(_dms)				\
++		 | DWC_CTLL_SMS(_sms));				\
+ 	})
+ 
+-/*
+- * Number of descriptors to allocate for each channel. This should be
+- * made configurable somehow; preferably, the clients (at least the
+- * ones using slave transfers) should be able to give us a hint.
+- */
+-#define NR_DESCS_PER_CHANNEL	64
+-
+ /* The set of bus widths supported by the DMA controller */
+ #define DW_DMA_BUSWIDTHS			  \
+ 	BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED)	| \
+@@ -80,51 +77,65 @@ static struct dw_desc *dwc_first_active(
+ 	return to_dw_desc(dwc->active_list.next);
+ }
+ 
+-static struct dw_desc *dwc_desc_get(struct dw_dma_chan *dwc)
++static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx)
+ {
+-	struct dw_desc *desc, *_desc;
+-	struct dw_desc *ret = NULL;
+-	unsigned int i = 0;
+-	unsigned long flags;
++	struct dw_desc		*desc = txd_to_dw_desc(tx);
++	struct dw_dma_chan	*dwc = to_dw_dma_chan(tx->chan);
++	dma_cookie_t		cookie;
++	unsigned long		flags;
+ 
+ 	spin_lock_irqsave(&dwc->lock, flags);
+-	list_for_each_entry_safe(desc, _desc, &dwc->free_list, desc_node) {
+-		i++;
+-		if (async_tx_test_ack(&desc->txd)) {
+-			list_del(&desc->desc_node);
+-			ret = desc;
+-			break;
+-		}
+-		dev_dbg(chan2dev(&dwc->chan), "desc %p not ACKed\n", desc);
+-	}
++	cookie = dma_cookie_assign(tx);
++
++	/*
++	 * REVISIT: We should attempt to chain as many descriptors as
++	 * possible, perhaps even appending to those already submitted
++	 * for DMA. But this is hard to do in a race-free manner.
++	 */
++
++	list_add_tail(&desc->desc_node, &dwc->queue);
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
++	dev_vdbg(chan2dev(tx->chan), "%s: queued %u\n",
++		 __func__, desc->txd.cookie);
+ 
+-	dev_vdbg(chan2dev(&dwc->chan), "scanned %u descriptors on freelist\n", i);
++	return cookie;
++}
+ 
+-	return ret;
++static struct dw_desc *dwc_desc_get(struct dw_dma_chan *dwc)
++{
++	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
++	struct dw_desc *desc;
++	dma_addr_t phys;
++
++	desc = dma_pool_zalloc(dw->desc_pool, GFP_ATOMIC, &phys);
++	if (!desc)
++		return NULL;
++
++	dwc->descs_allocated++;
++	INIT_LIST_HEAD(&desc->tx_list);
++	dma_async_tx_descriptor_init(&desc->txd, &dwc->chan);
++	desc->txd.tx_submit = dwc_tx_submit;
++	desc->txd.flags = DMA_CTRL_ACK;
++	desc->txd.phys = phys;
++	return desc;
+ }
+ 
+-/*
+- * Move a descriptor, including any children, to the free list.
+- * `desc' must not be on any lists.
+- */
+ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc)
+ {
+-	unsigned long flags;
++	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
++	struct dw_desc *child, *_next;
+ 
+-	if (desc) {
+-		struct dw_desc *child;
++	if (unlikely(!desc))
++		return;
+ 
+-		spin_lock_irqsave(&dwc->lock, flags);
+-		list_for_each_entry(child, &desc->tx_list, desc_node)
+-			dev_vdbg(chan2dev(&dwc->chan),
+-					"moving child desc %p to freelist\n",
+-					child);
+-		list_splice_init(&desc->tx_list, &dwc->free_list);
+-		dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc);
+-		list_add(&desc->desc_node, &dwc->free_list);
+-		spin_unlock_irqrestore(&dwc->lock, flags);
++	list_for_each_entry_safe(child, _next, &desc->tx_list, desc_node) {
++		list_del(&child->desc_node);
++		dma_pool_free(dw->desc_pool, child, child->txd.phys);
++		dwc->descs_allocated--;
+ 	}
++
++	dma_pool_free(dw->desc_pool, desc, desc->txd.phys);
++	dwc->descs_allocated--;
+ }
+ 
+ static void dwc_initialize(struct dw_dma_chan *dwc)
+@@ -133,7 +144,7 @@ static void dwc_initialize(struct dw_dma
+ 	u32 cfghi = DWC_CFGH_FIFO_MODE;
+ 	u32 cfglo = DWC_CFGL_CH_PRIOR(dwc->priority);
+ 
+-	if (dwc->initialized == true)
++	if (test_bit(DW_DMA_IS_INITIALIZED, &dwc->flags))
+ 		return;
+ 
+ 	cfghi |= DWC_CFGH_DST_PER(dwc->dst_id);
+@@ -146,26 +157,11 @@ static void dwc_initialize(struct dw_dma
+ 	channel_set_bit(dw, MASK.XFER, dwc->mask);
+ 	channel_set_bit(dw, MASK.ERROR, dwc->mask);
+ 
+-	dwc->initialized = true;
++	set_bit(DW_DMA_IS_INITIALIZED, &dwc->flags);
+ }
+ 
+ /*----------------------------------------------------------------------*/
+ 
+-static inline unsigned int dwc_fast_ffs(unsigned long long v)
+-{
+-	/*
+-	 * We can be a lot more clever here, but this should take care
+-	 * of the most common optimization.
+-	 */
+-	if (!(v & 7))
+-		return 3;
+-	else if (!(v & 3))
+-		return 2;
+-	else if (!(v & 1))
+-		return 1;
+-	return 0;
+-}
+-
+ static inline void dwc_dump_chan_regs(struct dw_dma_chan *dwc)
+ {
+ 	dev_err(chan2dev(&dwc->chan),
+@@ -197,12 +193,12 @@ static inline void dwc_do_single_block(s
+ 	 * Software emulation of LLP mode relies on interrupts to continue
+ 	 * multi block transfer.
+ 	 */
+-	ctllo = desc->lli.ctllo | DWC_CTLL_INT_EN;
++	ctllo = lli_read(desc, ctllo) | DWC_CTLL_INT_EN;
+ 
+-	channel_writel(dwc, SAR, desc->lli.sar);
+-	channel_writel(dwc, DAR, desc->lli.dar);
++	channel_writel(dwc, SAR, lli_read(desc, sar));
++	channel_writel(dwc, DAR, lli_read(desc, dar));
+ 	channel_writel(dwc, CTL_LO, ctllo);
+-	channel_writel(dwc, CTL_HI, desc->lli.ctlhi);
++	channel_writel(dwc, CTL_HI, lli_read(desc, ctlhi));
+ 	channel_set_bit(dw, CH_EN, dwc->mask);
+ 
+ 	/* Move pointer to next descriptor */
+@@ -213,6 +209,7 @@ static inline void dwc_do_single_block(s
+ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first)
+ {
+ 	struct dw_dma	*dw = to_dw_dma(dwc->chan.device);
++	u8		lms = DWC_LLP_LMS(dwc->m_master);
+ 	unsigned long	was_soft_llp;
+ 
+ 	/* ASSERT:  channel is idle */
+@@ -237,7 +234,7 @@ static void dwc_dostart(struct dw_dma_ch
+ 
+ 		dwc_initialize(dwc);
+ 
+-		dwc->residue = first->total_len;
++		first->residue = first->total_len;
+ 		dwc->tx_node_active = &first->tx_list;
+ 
+ 		/* Submit first block */
+@@ -248,9 +245,8 @@ static void dwc_dostart(struct dw_dma_ch
+ 
+ 	dwc_initialize(dwc);
+ 
+-	channel_writel(dwc, LLP, first->txd.phys);
+-	channel_writel(dwc, CTL_LO,
+-			DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
++	channel_writel(dwc, LLP, first->txd.phys | lms);
++	channel_writel(dwc, CTL_LO, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
+ 	channel_writel(dwc, CTL_HI, 0);
+ 	channel_set_bit(dw, CH_EN, dwc->mask);
+ }
+@@ -293,11 +289,7 @@ dwc_descriptor_complete(struct dw_dma_ch
+ 	list_for_each_entry(child, &desc->tx_list, desc_node)
+ 		async_tx_ack(&child->txd);
+ 	async_tx_ack(&desc->txd);
+-
+-	list_splice_init(&desc->tx_list, &dwc->free_list);
+-	list_move(&desc->desc_node, &dwc->free_list);
+-
+-	dma_descriptor_unmap(txd);
++	dwc_desc_put(dwc, desc);
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ 
+ 	if (callback)
+@@ -368,11 +360,11 @@ static void dwc_scan_descriptors(struct
+ 
+ 			head = &desc->tx_list;
+ 			if (active != head) {
+-				/* Update desc to reflect last sent one */
+-				if (active != head->next)
+-					desc = to_dw_desc(active->prev);
+-
+-				dwc->residue -= desc->len;
++				/* Update residue to reflect last sent descriptor */
++				if (active == head->next)
++					desc->residue -= desc->len;
++				else
++					desc->residue -= to_dw_desc(active->prev)->len;
+ 
+ 				child = to_dw_desc(active);
+ 
+@@ -387,8 +379,6 @@ static void dwc_scan_descriptors(struct
+ 			clear_bit(DW_DMA_IS_SOFT_LLP, &dwc->flags);
+ 		}
+ 
+-		dwc->residue = 0;
+-
+ 		spin_unlock_irqrestore(&dwc->lock, flags);
+ 
+ 		dwc_complete_all(dw, dwc);
+@@ -396,7 +386,6 @@ static void dwc_scan_descriptors(struct
+ 	}
+ 
+ 	if (list_empty(&dwc->active_list)) {
+-		dwc->residue = 0;
+ 		spin_unlock_irqrestore(&dwc->lock, flags);
+ 		return;
+ 	}
+@@ -411,31 +400,31 @@ static void dwc_scan_descriptors(struct
+ 
+ 	list_for_each_entry_safe(desc, _desc, &dwc->active_list, desc_node) {
+ 		/* Initial residue value */
+-		dwc->residue = desc->total_len;
++		desc->residue = desc->total_len;
+ 
+ 		/* Check first descriptors addr */
+-		if (desc->txd.phys == llp) {
++		if (desc->txd.phys == DWC_LLP_LOC(llp)) {
+ 			spin_unlock_irqrestore(&dwc->lock, flags);
+ 			return;
+ 		}
+ 
+ 		/* Check first descriptors llp */
+-		if (desc->lli.llp == llp) {
++		if (lli_read(desc, llp) == llp) {
+ 			/* This one is currently in progress */
+-			dwc->residue -= dwc_get_sent(dwc);
++			desc->residue -= dwc_get_sent(dwc);
+ 			spin_unlock_irqrestore(&dwc->lock, flags);
+ 			return;
+ 		}
+ 
+-		dwc->residue -= desc->len;
++		desc->residue -= desc->len;
+ 		list_for_each_entry(child, &desc->tx_list, desc_node) {
+-			if (child->lli.llp == llp) {
++			if (lli_read(child, llp) == llp) {
+ 				/* Currently in progress */
+-				dwc->residue -= dwc_get_sent(dwc);
++				desc->residue -= dwc_get_sent(dwc);
+ 				spin_unlock_irqrestore(&dwc->lock, flags);
+ 				return;
+ 			}
+-			dwc->residue -= child->len;
++			desc->residue -= child->len;
+ 		}
+ 
+ 		/*
+@@ -457,10 +446,14 @@ static void dwc_scan_descriptors(struct
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ }
+ 
+-static inline void dwc_dump_lli(struct dw_dma_chan *dwc, struct dw_lli *lli)
++static inline void dwc_dump_lli(struct dw_dma_chan *dwc, struct dw_desc *desc)
+ {
+ 	dev_crit(chan2dev(&dwc->chan), "  desc: s0x%x d0x%x l0x%x c0x%x:%x\n",
+-		 lli->sar, lli->dar, lli->llp, lli->ctlhi, lli->ctllo);
++		 lli_read(desc, sar),
++		 lli_read(desc, dar),
++		 lli_read(desc, llp),
++		 lli_read(desc, ctlhi),
++		 lli_read(desc, ctllo));
+ }
+ 
+ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc)
+@@ -496,9 +489,9 @@ static void dwc_handle_error(struct dw_d
+ 	 */
+ 	dev_WARN(chan2dev(&dwc->chan), "Bad descriptor submitted for DMA!\n"
+ 				       "  cookie: %d\n", bad_desc->txd.cookie);
+-	dwc_dump_lli(dwc, &bad_desc->lli);
++	dwc_dump_lli(dwc, bad_desc);
+ 	list_for_each_entry(child, &bad_desc->tx_list, desc_node)
+-		dwc_dump_lli(dwc, &child->lli);
++		dwc_dump_lli(dwc, child);
+ 
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ 
+@@ -549,7 +542,7 @@ static void dwc_handle_cyclic(struct dw_
+ 	 */
+ 	if (unlikely(status_err & dwc->mask) ||
+ 			unlikely(status_xfer & dwc->mask)) {
+-		int i;
++		unsigned int i;
+ 
+ 		dev_err(chan2dev(&dwc->chan),
+ 			"cyclic DMA unexpected %s interrupt, stopping DMA transfer\n",
+@@ -571,7 +564,7 @@ static void dwc_handle_cyclic(struct dw_
+ 		dma_writel(dw, CLEAR.XFER, dwc->mask);
+ 
+ 		for (i = 0; i < dwc->cdesc->periods; i++)
+-			dwc_dump_lli(dwc, &dwc->cdesc->desc[i]->lli);
++			dwc_dump_lli(dwc, dwc->cdesc->desc[i]);
+ 
+ 		spin_unlock_irqrestore(&dwc->lock, flags);
+ 	}
+@@ -589,7 +582,7 @@ static void dw_dma_tasklet(unsigned long
+ 	u32 status_block;
+ 	u32 status_xfer;
+ 	u32 status_err;
+-	int i;
++	unsigned int i;
+ 
+ 	status_block = dma_readl(dw, RAW.BLOCK);
+ 	status_xfer = dma_readl(dw, RAW.XFER);
+@@ -616,12 +609,17 @@ static void dw_dma_tasklet(unsigned long
+ static irqreturn_t dw_dma_interrupt(int irq, void *dev_id)
+ {
+ 	struct dw_dma *dw = dev_id;
+-	u32 status = dma_readl(dw, STATUS_INT);
++	u32 status;
++
++	/* Check if we have any interrupt from the DMAC which is not in use */
++	if (!dw->in_use)
++		return IRQ_NONE;
+ 
++	status = dma_readl(dw, STATUS_INT);
+ 	dev_vdbg(dw->dma.dev, "%s: status=0x%x\n", __func__, status);
+ 
+ 	/* Check if we have any interrupt from the DMAC */
+-	if (!status || !dw->in_use)
++	if (!status)
+ 		return IRQ_NONE;
+ 
+ 	/*
+@@ -653,30 +651,6 @@ static irqreturn_t dw_dma_interrupt(int
+ 
+ /*----------------------------------------------------------------------*/
+ 
+-static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx)
+-{
+-	struct dw_desc		*desc = txd_to_dw_desc(tx);
+-	struct dw_dma_chan	*dwc = to_dw_dma_chan(tx->chan);
+-	dma_cookie_t		cookie;
+-	unsigned long		flags;
+-
+-	spin_lock_irqsave(&dwc->lock, flags);
+-	cookie = dma_cookie_assign(tx);
+-
+-	/*
+-	 * REVISIT: We should attempt to chain as many descriptors as
+-	 * possible, perhaps even appending to those already submitted
+-	 * for DMA. But this is hard to do in a race-free manner.
+-	 */
+-
+-	dev_vdbg(chan2dev(tx->chan), "%s: queued %u\n", __func__, desc->txd.cookie);
+-	list_add_tail(&desc->desc_node, &dwc->queue);
+-
+-	spin_unlock_irqrestore(&dwc->lock, flags);
+-
+-	return cookie;
+-}
+-
+ static struct dma_async_tx_descriptor *
+ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
+ 		size_t len, unsigned long flags)
+@@ -688,10 +662,12 @@ dwc_prep_dma_memcpy(struct dma_chan *cha
+ 	struct dw_desc		*prev;
+ 	size_t			xfer_count;
+ 	size_t			offset;
++	u8			m_master = dwc->m_master;
+ 	unsigned int		src_width;
+ 	unsigned int		dst_width;
+-	unsigned int		data_width;
++	unsigned int		data_width = dw->pdata->data_width[m_master];
+ 	u32			ctllo;
++	u8			lms = DWC_LLP_LMS(m_master);
+ 
+ 	dev_vdbg(chan2dev(chan),
+ 			"%s: d%pad s%pad l0x%zx f0x%lx\n", __func__,
+@@ -704,11 +680,7 @@ dwc_prep_dma_memcpy(struct dma_chan *cha
+ 
+ 	dwc->direction = DMA_MEM_TO_MEM;
+ 
+-	data_width = min_t(unsigned int, dw->data_width[dwc->src_master],
+-			   dw->data_width[dwc->dst_master]);
+-
+-	src_width = dst_width = min_t(unsigned int, data_width,
+-				      dwc_fast_ffs(src | dest | len));
++	src_width = dst_width = __ffs(data_width | src | dest | len);
+ 
+ 	ctllo = DWC_DEFAULT_CTLLO(chan)
+ 			| DWC_CTLL_DST_WIDTH(dst_width)
+@@ -726,27 +698,27 @@ dwc_prep_dma_memcpy(struct dma_chan *cha
+ 		if (!desc)
+ 			goto err_desc_get;
+ 
+-		desc->lli.sar = src + offset;
+-		desc->lli.dar = dest + offset;
+-		desc->lli.ctllo = ctllo;
+-		desc->lli.ctlhi = xfer_count;
++		lli_write(desc, sar, src + offset);
++		lli_write(desc, dar, dest + offset);
++		lli_write(desc, ctllo, ctllo);
++		lli_write(desc, ctlhi, xfer_count);
+ 		desc->len = xfer_count << src_width;
+ 
+ 		if (!first) {
+ 			first = desc;
+ 		} else {
+-			prev->lli.llp = desc->txd.phys;
+-			list_add_tail(&desc->desc_node,
+-					&first->tx_list);
++			lli_write(prev, llp, desc->txd.phys | lms);
++			list_add_tail(&desc->desc_node, &first->tx_list);
+ 		}
+ 		prev = desc;
+ 	}
+ 
+ 	if (flags & DMA_PREP_INTERRUPT)
+ 		/* Trigger interrupt after last block */
+-		prev->lli.ctllo |= DWC_CTLL_INT_EN;
++		lli_set(prev, ctllo, DWC_CTLL_INT_EN);
+ 
+ 	prev->lli.llp = 0;
++	lli_clear(prev, ctllo, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
+ 	first->txd.flags = flags;
+ 	first->total_len = len;
+ 
+@@ -768,10 +740,12 @@ dwc_prep_slave_sg(struct dma_chan *chan,
+ 	struct dw_desc		*prev;
+ 	struct dw_desc		*first;
+ 	u32			ctllo;
++	u8			m_master = dwc->m_master;
++	u8			lms = DWC_LLP_LMS(m_master);
+ 	dma_addr_t		reg;
+ 	unsigned int		reg_width;
+ 	unsigned int		mem_width;
+-	unsigned int		data_width;
++	unsigned int		data_width = dw->pdata->data_width[m_master];
+ 	unsigned int		i;
+ 	struct scatterlist	*sg;
+ 	size_t			total_len = 0;
+@@ -797,8 +771,6 @@ dwc_prep_slave_sg(struct dma_chan *chan,
+ 		ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_M2P) :
+ 			DWC_CTLL_FC(DW_DMA_FC_D_M2P);
+ 
+-		data_width = dw->data_width[dwc->src_master];
+-
+ 		for_each_sg(sgl, sg, sg_len, i) {
+ 			struct dw_desc	*desc;
+ 			u32		len, dlen, mem;
+@@ -806,17 +778,16 @@ dwc_prep_slave_sg(struct dma_chan *chan,
+ 			mem = sg_dma_address(sg);
+ 			len = sg_dma_len(sg);
+ 
+-			mem_width = min_t(unsigned int,
+-					  data_width, dwc_fast_ffs(mem | len));
++			mem_width = __ffs(data_width | mem | len);
+ 
+ slave_sg_todev_fill_desc:
+ 			desc = dwc_desc_get(dwc);
+ 			if (!desc)
+ 				goto err_desc_get;
+ 
+-			desc->lli.sar = mem;
+-			desc->lli.dar = reg;
+-			desc->lli.ctllo = ctllo | DWC_CTLL_SRC_WIDTH(mem_width);
++			lli_write(desc, sar, mem);
++			lli_write(desc, dar, reg);
++			lli_write(desc, ctllo, ctllo | DWC_CTLL_SRC_WIDTH(mem_width));
+ 			if ((len >> mem_width) > dwc->block_size) {
+ 				dlen = dwc->block_size << mem_width;
+ 				mem += dlen;
+@@ -826,15 +797,14 @@ slave_sg_todev_fill_desc:
+ 				len = 0;
+ 			}
+ 
+-			desc->lli.ctlhi = dlen >> mem_width;
++			lli_write(desc, ctlhi, dlen >> mem_width);
+ 			desc->len = dlen;
+ 
+ 			if (!first) {
+ 				first = desc;
+ 			} else {
+-				prev->lli.llp = desc->txd.phys;
+-				list_add_tail(&desc->desc_node,
+-						&first->tx_list);
++				lli_write(prev, llp, desc->txd.phys | lms);
++				list_add_tail(&desc->desc_node, &first->tx_list);
+ 			}
+ 			prev = desc;
+ 			total_len += dlen;
+@@ -854,8 +824,6 @@ slave_sg_todev_fill_desc:
+ 		ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_P2M) :
+ 			DWC_CTLL_FC(DW_DMA_FC_D_P2M);
+ 
+-		data_width = dw->data_width[dwc->dst_master];
+-
+ 		for_each_sg(sgl, sg, sg_len, i) {
+ 			struct dw_desc	*desc;
+ 			u32		len, dlen, mem;
+@@ -863,17 +831,16 @@ slave_sg_todev_fill_desc:
+ 			mem = sg_dma_address(sg);
+ 			len = sg_dma_len(sg);
+ 
+-			mem_width = min_t(unsigned int,
+-					  data_width, dwc_fast_ffs(mem | len));
++			mem_width = __ffs(data_width | mem | len);
+ 
+ slave_sg_fromdev_fill_desc:
+ 			desc = dwc_desc_get(dwc);
+ 			if (!desc)
+ 				goto err_desc_get;
+ 
+-			desc->lli.sar = reg;
+-			desc->lli.dar = mem;
+-			desc->lli.ctllo = ctllo | DWC_CTLL_DST_WIDTH(mem_width);
++			lli_write(desc, sar, reg);
++			lli_write(desc, dar, mem);
++			lli_write(desc, ctllo, ctllo | DWC_CTLL_DST_WIDTH(mem_width));
+ 			if ((len >> reg_width) > dwc->block_size) {
+ 				dlen = dwc->block_size << reg_width;
+ 				mem += dlen;
+@@ -882,15 +849,14 @@ slave_sg_fromdev_fill_desc:
+ 				dlen = len;
+ 				len = 0;
+ 			}
+-			desc->lli.ctlhi = dlen >> reg_width;
++			lli_write(desc, ctlhi, dlen >> reg_width);
+ 			desc->len = dlen;
+ 
+ 			if (!first) {
+ 				first = desc;
+ 			} else {
+-				prev->lli.llp = desc->txd.phys;
+-				list_add_tail(&desc->desc_node,
+-						&first->tx_list);
++				lli_write(prev, llp, desc->txd.phys | lms);
++				list_add_tail(&desc->desc_node, &first->tx_list);
+ 			}
+ 			prev = desc;
+ 			total_len += dlen;
+@@ -905,9 +871,10 @@ slave_sg_fromdev_fill_desc:
+ 
+ 	if (flags & DMA_PREP_INTERRUPT)
+ 		/* Trigger interrupt after last block */
+-		prev->lli.ctllo |= DWC_CTLL_INT_EN;
++		lli_set(prev, ctllo, DWC_CTLL_INT_EN);
+ 
+ 	prev->lli.llp = 0;
++	lli_clear(prev, ctllo, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
+ 	first->total_len = total_len;
+ 
+ 	return &first->txd;
+@@ -932,8 +899,8 @@ bool dw_dma_filter(struct dma_chan *chan
+ 	dwc->src_id = dws->src_id;
+ 	dwc->dst_id = dws->dst_id;
+ 
+-	dwc->src_master = dws->src_master;
+-	dwc->dst_master = dws->dst_master;
++	dwc->m_master = dws->m_master;
++	dwc->p_master = dws->p_master;
+ 
+ 	return true;
+ }
+@@ -986,7 +953,7 @@ static int dwc_pause(struct dma_chan *ch
+ 	while (!(channel_readl(dwc, CFG_LO) & DWC_CFGL_FIFO_EMPTY) && count--)
+ 		udelay(2);
+ 
+-	dwc->paused = true;
++	set_bit(DW_DMA_IS_PAUSED, &dwc->flags);
+ 
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ 
+@@ -999,7 +966,7 @@ static inline void dwc_chan_resume(struc
+ 
+ 	channel_writel(dwc, CFG_LO, cfglo & ~DWC_CFGL_CH_SUSP);
+ 
+-	dwc->paused = false;
++	clear_bit(DW_DMA_IS_PAUSED, &dwc->flags);
+ }
+ 
+ static int dwc_resume(struct dma_chan *chan)
+@@ -1007,12 +974,10 @@ static int dwc_resume(struct dma_chan *c
+ 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+ 	unsigned long		flags;
+ 
+-	if (!dwc->paused)
+-		return 0;
+-
+ 	spin_lock_irqsave(&dwc->lock, flags);
+ 
+-	dwc_chan_resume(dwc);
++	if (test_bit(DW_DMA_IS_PAUSED, &dwc->flags))
++		dwc_chan_resume(dwc);
+ 
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ 
+@@ -1048,16 +1013,37 @@ static int dwc_terminate_all(struct dma_
+ 	return 0;
+ }
+ 
+-static inline u32 dwc_get_residue(struct dw_dma_chan *dwc)
++static struct dw_desc *dwc_find_desc(struct dw_dma_chan *dwc, dma_cookie_t c)
++{
++	struct dw_desc *desc;
++
++	list_for_each_entry(desc, &dwc->active_list, desc_node)
++		if (desc->txd.cookie == c)
++			return desc;
++
++	return NULL;
++}
++
++static u32 dwc_get_residue(struct dw_dma_chan *dwc, dma_cookie_t cookie)
+ {
++	struct dw_desc *desc;
+ 	unsigned long flags;
+ 	u32 residue;
+ 
+ 	spin_lock_irqsave(&dwc->lock, flags);
+ 
+-	residue = dwc->residue;
+-	if (test_bit(DW_DMA_IS_SOFT_LLP, &dwc->flags) && residue)
+-		residue -= dwc_get_sent(dwc);
++	desc = dwc_find_desc(dwc, cookie);
++	if (desc) {
++		if (desc == dwc_first_active(dwc)) {
++			residue = desc->residue;
++			if (test_bit(DW_DMA_IS_SOFT_LLP, &dwc->flags) && residue)
++				residue -= dwc_get_sent(dwc);
++		} else {
++			residue = desc->total_len;
++		}
++	} else {
++		residue = 0;
++	}
+ 
+ 	spin_unlock_irqrestore(&dwc->lock, flags);
+ 	return residue;
+@@ -1078,10 +1064,12 @@ dwc_tx_status(struct dma_chan *chan,
+ 	dwc_scan_descriptors(to_dw_dma(chan->device), dwc);
+ 
+ 	ret = dma_cookie_status(chan, cookie, txstate);
+-	if (ret != DMA_COMPLETE)
+-		dma_set_residue(txstate, dwc_get_residue(dwc));
++	if (ret == DMA_COMPLETE)
++		return ret;
++
++	dma_set_residue(txstate, dwc_get_residue(dwc, cookie));
+ 
+-	if (dwc->paused && ret == DMA_IN_PROGRESS)
++	if (test_bit(DW_DMA_IS_PAUSED, &dwc->flags) && ret == DMA_IN_PROGRESS)
+ 		return DMA_PAUSED;
+ 
+ 	return ret;
+@@ -1102,7 +1090,7 @@ static void dwc_issue_pending(struct dma
+ 
+ static void dw_dma_off(struct dw_dma *dw)
+ {
+-	int i;
++	unsigned int i;
+ 
+ 	dma_writel(dw, CFG, 0);
+ 
+@@ -1116,7 +1104,7 @@ static void dw_dma_off(struct dw_dma *dw
+ 		cpu_relax();
+ 
+ 	for (i = 0; i < dw->dma.chancnt; i++)
+-		dw->chan[i].initialized = false;
++		clear_bit(DW_DMA_IS_INITIALIZED, &dw->chan[i].flags);
+ }
+ 
+ static void dw_dma_on(struct dw_dma *dw)
+@@ -1128,9 +1116,6 @@ static int dwc_alloc_chan_resources(stru
+ {
+ 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+ 	struct dw_dma		*dw = to_dw_dma(chan->device);
+-	struct dw_desc		*desc;
+-	int			i;
+-	unsigned long		flags;
+ 
+ 	dev_vdbg(chan2dev(chan), "%s\n", __func__);
+ 
+@@ -1161,48 +1146,13 @@ static int dwc_alloc_chan_resources(stru
+ 		dw_dma_on(dw);
+ 	dw->in_use |= dwc->mask;
+ 
+-	spin_lock_irqsave(&dwc->lock, flags);
+-	i = dwc->descs_allocated;
+-	while (dwc->descs_allocated < NR_DESCS_PER_CHANNEL) {
+-		dma_addr_t phys;
+-
+-		spin_unlock_irqrestore(&dwc->lock, flags);
+-
+-		desc = dma_pool_alloc(dw->desc_pool, GFP_ATOMIC, &phys);
+-		if (!desc)
+-			goto err_desc_alloc;
+-
+-		memset(desc, 0, sizeof(struct dw_desc));
+-
+-		INIT_LIST_HEAD(&desc->tx_list);
+-		dma_async_tx_descriptor_init(&desc->txd, chan);
+-		desc->txd.tx_submit = dwc_tx_submit;
+-		desc->txd.flags = DMA_CTRL_ACK;
+-		desc->txd.phys = phys;
+-
+-		dwc_desc_put(dwc, desc);
+-
+-		spin_lock_irqsave(&dwc->lock, flags);
+-		i = ++dwc->descs_allocated;
+-	}
+-
+-	spin_unlock_irqrestore(&dwc->lock, flags);
+-
+-	dev_dbg(chan2dev(chan), "%s: allocated %d descriptors\n", __func__, i);
+-
+-	return i;
+-
+-err_desc_alloc:
+-	dev_info(chan2dev(chan), "only allocated %d descriptors\n", i);
+-
+-	return i;
++	return 0;
+ }
+ 
+ static void dwc_free_chan_resources(struct dma_chan *chan)
+ {
+ 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+ 	struct dw_dma		*dw = to_dw_dma(chan->device);
+-	struct dw_desc		*desc, *_desc;
+ 	unsigned long		flags;
+ 	LIST_HEAD(list);
+ 
+@@ -1215,17 +1165,15 @@ static void dwc_free_chan_resources(stru
+ 	BUG_ON(dma_readl(to_dw_dma(chan->device), CH_EN) & dwc->mask);
+ 
+ 	spin_lock_irqsave(&dwc->lock, flags);
+-	list_splice_init(&dwc->free_list, &list);
+-	dwc->descs_allocated = 0;
+ 
+ 	/* Clear custom channel configuration */
+ 	dwc->src_id = 0;
+ 	dwc->dst_id = 0;
+ 
+-	dwc->src_master = 0;
+-	dwc->dst_master = 0;
++	dwc->m_master = 0;
++	dwc->p_master = 0;
+ 
+-	dwc->initialized = false;
++	clear_bit(DW_DMA_IS_INITIALIZED, &dwc->flags);
+ 
+ 	/* Disable interrupts */
+ 	channel_clear_bit(dw, MASK.XFER, dwc->mask);
+@@ -1239,11 +1187,6 @@ static void dwc_free_chan_resources(stru
+ 	if (!dw->in_use)
+ 		dw_dma_off(dw);
+ 
+-	list_for_each_entry_safe(desc, _desc, &list, desc_node) {
+-		dev_vdbg(chan2dev(chan), "  freeing descriptor %p\n", desc);
+-		dma_pool_free(dw->desc_pool, desc, desc->txd.phys);
+-	}
+-
+ 	dev_vdbg(chan2dev(chan), "%s: done\n", __func__);
+ }
+ 
+@@ -1321,6 +1264,7 @@ struct dw_cyclic_desc *dw_dma_cyclic_pre
+ 	struct dw_cyclic_desc		*retval = NULL;
+ 	struct dw_desc			*desc;
+ 	struct dw_desc			*last = NULL;
++	u8				lms = DWC_LLP_LMS(dwc->m_master);
+ 	unsigned long			was_cyclic;
+ 	unsigned int			reg_width;
+ 	unsigned int			periods;
+@@ -1374,9 +1318,6 @@ struct dw_cyclic_desc *dw_dma_cyclic_pre
+ 
+ 	retval = ERR_PTR(-ENOMEM);
+ 
+-	if (periods > NR_DESCS_PER_CHANNEL)
+-		goto out_err;
+-
+ 	cdesc = kzalloc(sizeof(struct dw_cyclic_desc), GFP_KERNEL);
+ 	if (!cdesc)
+ 		goto out_err;
+@@ -1392,50 +1333,50 @@ struct dw_cyclic_desc *dw_dma_cyclic_pre
+ 
+ 		switch (direction) {
+ 		case DMA_MEM_TO_DEV:
+-			desc->lli.dar = sconfig->dst_addr;
+-			desc->lli.sar = buf_addr + (period_len * i);
+-			desc->lli.ctllo = (DWC_DEFAULT_CTLLO(chan)
+-					| DWC_CTLL_DST_WIDTH(reg_width)
+-					| DWC_CTLL_SRC_WIDTH(reg_width)
+-					| DWC_CTLL_DST_FIX
+-					| DWC_CTLL_SRC_INC
+-					| DWC_CTLL_INT_EN);
+-
+-			desc->lli.ctllo |= sconfig->device_fc ?
+-				DWC_CTLL_FC(DW_DMA_FC_P_M2P) :
+-				DWC_CTLL_FC(DW_DMA_FC_D_M2P);
++			lli_write(desc, dar, sconfig->dst_addr);
++			lli_write(desc, sar, buf_addr + period_len * i);
++			lli_write(desc, ctllo, (DWC_DEFAULT_CTLLO(chan)
++				| DWC_CTLL_DST_WIDTH(reg_width)
++				| DWC_CTLL_SRC_WIDTH(reg_width)
++				| DWC_CTLL_DST_FIX
++				| DWC_CTLL_SRC_INC
++				| DWC_CTLL_INT_EN));
++
++			lli_set(desc, ctllo, sconfig->device_fc ?
++					DWC_CTLL_FC(DW_DMA_FC_P_M2P) :
++					DWC_CTLL_FC(DW_DMA_FC_D_M2P));
+ 
+ 			break;
+ 		case DMA_DEV_TO_MEM:
+-			desc->lli.dar = buf_addr + (period_len * i);
+-			desc->lli.sar = sconfig->src_addr;
+-			desc->lli.ctllo = (DWC_DEFAULT_CTLLO(chan)
+-					| DWC_CTLL_SRC_WIDTH(reg_width)
+-					| DWC_CTLL_DST_WIDTH(reg_width)
+-					| DWC_CTLL_DST_INC
+-					| DWC_CTLL_SRC_FIX
+-					| DWC_CTLL_INT_EN);
+-
+-			desc->lli.ctllo |= sconfig->device_fc ?
+-				DWC_CTLL_FC(DW_DMA_FC_P_P2M) :
+-				DWC_CTLL_FC(DW_DMA_FC_D_P2M);
++			lli_write(desc, dar, buf_addr + period_len * i);
++			lli_write(desc, sar, sconfig->src_addr);
++			lli_write(desc, ctllo, (DWC_DEFAULT_CTLLO(chan)
++				| DWC_CTLL_SRC_WIDTH(reg_width)
++				| DWC_CTLL_DST_WIDTH(reg_width)
++				| DWC_CTLL_DST_INC
++				| DWC_CTLL_SRC_FIX
++				| DWC_CTLL_INT_EN));
++
++			lli_set(desc, ctllo, sconfig->device_fc ?
++					DWC_CTLL_FC(DW_DMA_FC_P_P2M) :
++					DWC_CTLL_FC(DW_DMA_FC_D_P2M));
+ 
+ 			break;
+ 		default:
+ 			break;
+ 		}
+ 
+-		desc->lli.ctlhi = (period_len >> reg_width);
++		lli_write(desc, ctlhi, period_len >> reg_width);
+ 		cdesc->desc[i] = desc;
+ 
+ 		if (last)
+-			last->lli.llp = desc->txd.phys;
++			lli_write(last, llp, desc->txd.phys | lms);
+ 
+ 		last = desc;
+ 	}
+ 
+ 	/* Let's make a cyclic list */
+-	last->lli.llp = cdesc->desc[0]->txd.phys;
++	lli_write(last, llp, cdesc->desc[0]->txd.phys | lms);
+ 
+ 	dev_dbg(chan2dev(&dwc->chan),
+ 			"cyclic prepared buf %pad len %zu period %zu periods %d\n",
+@@ -1466,7 +1407,7 @@ void dw_dma_cyclic_free(struct dma_chan
+ 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+ 	struct dw_dma		*dw = to_dw_dma(dwc->chan.device);
+ 	struct dw_cyclic_desc	*cdesc = dwc->cdesc;
+-	int			i;
++	unsigned int		i;
+ 	unsigned long		flags;
+ 
+ 	dev_dbg(chan2dev(&dwc->chan), "%s\n", __func__);
+@@ -1490,32 +1431,38 @@ void dw_dma_cyclic_free(struct dma_chan
+ 	kfree(cdesc->desc);
+ 	kfree(cdesc);
+ 
++	dwc->cdesc = NULL;
++
+ 	clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags);
+ }
+ EXPORT_SYMBOL(dw_dma_cyclic_free);
+ 
+ /*----------------------------------------------------------------------*/
+ 
+-int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata)
++int dw_dma_probe(struct dw_dma_chip *chip)
+ {
++	struct dw_dma_platform_data *pdata;
+ 	struct dw_dma		*dw;
+ 	bool			autocfg = false;
+ 	unsigned int		dw_params;
+-	unsigned int		max_blk_size = 0;
++	unsigned int		i;
+ 	int			err;
+-	int			i;
+ 
+ 	dw = devm_kzalloc(chip->dev, sizeof(*dw), GFP_KERNEL);
+ 	if (!dw)
+ 		return -ENOMEM;
+ 
++	dw->pdata = devm_kzalloc(chip->dev, sizeof(*dw->pdata), GFP_KERNEL);
++	if (!dw->pdata)
++		return -ENOMEM;
++
+ 	dw->regs = chip->regs;
+ 	chip->dw = dw;
+ 
+ 	pm_runtime_get_sync(chip->dev);
+ 
+-	if (!pdata) {
+-		dw_params = dma_read_byaddr(chip->regs, DW_PARAMS);
++	if (!chip->pdata) {
++		dw_params = dma_readl(dw, DW_PARAMS);
+ 		dev_dbg(chip->dev, "DW_PARAMS: 0x%08x\n", dw_params);
+ 
+ 		autocfg = dw_params >> DW_PARAMS_EN & 1;
+@@ -1524,29 +1471,31 @@ int dw_dma_probe(struct dw_dma_chip *chi
+ 			goto err_pdata;
+ 		}
+ 
+-		pdata = devm_kzalloc(chip->dev, sizeof(*pdata), GFP_KERNEL);
+-		if (!pdata) {
+-			err = -ENOMEM;
+-			goto err_pdata;
+-		}
++		/* Reassign the platform data pointer */
++		pdata = dw->pdata;
+ 
+ 		/* Get hardware configuration parameters */
+ 		pdata->nr_channels = (dw_params >> DW_PARAMS_NR_CHAN & 7) + 1;
+ 		pdata->nr_masters = (dw_params >> DW_PARAMS_NR_MASTER & 3) + 1;
+ 		for (i = 0; i < pdata->nr_masters; i++) {
+ 			pdata->data_width[i] =
+-				(dw_params >> DW_PARAMS_DATA_WIDTH(i) & 3) + 2;
++				4 << (dw_params >> DW_PARAMS_DATA_WIDTH(i) & 3);
+ 		}
+-		max_blk_size = dma_readl(dw, MAX_BLK_SIZE);
++		pdata->block_size = dma_readl(dw, MAX_BLK_SIZE);
+ 
+ 		/* Fill platform data with the default values */
+ 		pdata->is_private = true;
+ 		pdata->is_memcpy = true;
+ 		pdata->chan_allocation_order = CHAN_ALLOCATION_ASCENDING;
+ 		pdata->chan_priority = CHAN_PRIORITY_ASCENDING;
+-	} else if (pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) {
++	} else if (chip->pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) {
+ 		err = -EINVAL;
+ 		goto err_pdata;
++	} else {
++		memcpy(dw->pdata, chip->pdata, sizeof(*dw->pdata));
++
++		/* Reassign the platform data pointer */
++		pdata = dw->pdata;
+ 	}
+ 
+ 	dw->chan = devm_kcalloc(chip->dev, pdata->nr_channels, sizeof(*dw->chan),
+@@ -1556,11 +1505,6 @@ int dw_dma_probe(struct dw_dma_chip *chi
+ 		goto err_pdata;
+ 	}
+ 
+-	/* Get hardware configuration parameters */
+-	dw->nr_masters = pdata->nr_masters;
+-	for (i = 0; i < dw->nr_masters; i++)
+-		dw->data_width[i] = pdata->data_width[i];
+-
+ 	/* Calculate all channel mask before DMA setup */
+ 	dw->all_chan_mask = (1 << pdata->nr_channels) - 1;
+ 
+@@ -1607,7 +1551,6 @@ int dw_dma_probe(struct dw_dma_chip *chi
+ 
+ 		INIT_LIST_HEAD(&dwc->active_list);
+ 		INIT_LIST_HEAD(&dwc->queue);
+-		INIT_LIST_HEAD(&dwc->free_list);
+ 
+ 		channel_clear_bit(dw, CH_EN, dwc->mask);
+ 
+@@ -1615,11 +1558,9 @@ int dw_dma_probe(struct dw_dma_chip *chi
+ 
+ 		/* Hardware configuration */
+ 		if (autocfg) {
+-			unsigned int dwc_params;
+ 			unsigned int r = DW_DMA_MAX_NR_CHANNELS - i - 1;
+-			void __iomem *addr = chip->regs + r * sizeof(u32);
+-
+-			dwc_params = dma_read_byaddr(addr, DWC_PARAMS);
++			void __iomem *addr = &__dw_regs(dw)->DWC_PARAMS[r];
++			unsigned int dwc_params = dma_readl_native(addr);
+ 
+ 			dev_dbg(chip->dev, "DWC_PARAMS[%d]: 0x%08x\n", i,
+ 					   dwc_params);
+@@ -1630,16 +1571,15 @@ int dw_dma_probe(struct dw_dma_chip *chi
+ 			 * up to 0x0a for 4095.
+ 			 */
+ 			dwc->block_size =
+-				(4 << ((max_blk_size >> 4 * i) & 0xf)) - 1;
++				(4 << ((pdata->block_size >> 4 * i) & 0xf)) - 1;
+ 			dwc->nollp =
+ 				(dwc_params >> DWC_PARAMS_MBLK_EN & 0x1) == 0;
+ 		} else {
+ 			dwc->block_size = pdata->block_size;
+ 
+ 			/* Check if channel supports multi block transfer */
+-			channel_writel(dwc, LLP, 0xfffffffc);
+-			dwc->nollp =
+-				(channel_readl(dwc, LLP) & 0xfffffffc) == 0;
++			channel_writel(dwc, LLP, DWC_LLP_LOC(0xffffffff));
++			dwc->nollp = DWC_LLP_LOC(channel_readl(dwc, LLP)) == 0;
+ 			channel_writel(dwc, LLP, 0);
+ 		}
+ 	}
+--- a/drivers/dma/dw/pci.c	2016-05-21 23:13:19.964478443 +0200
++++ b/drivers/dma/dw/pci.c	2016-05-21 22:47:08.665465180 +0200
+@@ -17,8 +17,8 @@
+ 
+ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid)
+ {
++	const struct dw_dma_platform_data *pdata = (void *)pid->driver_data;
+ 	struct dw_dma_chip *chip;
+-	struct dw_dma_platform_data *pdata = (void *)pid->driver_data;
+ 	int ret;
+ 
+ 	ret = pcim_enable_device(pdev);
+@@ -49,8 +49,9 @@ static int dw_pci_probe(struct pci_dev *
+ 	chip->dev = &pdev->dev;
+ 	chip->regs = pcim_iomap_table(pdev)[0];
+ 	chip->irq = pdev->irq;
++	chip->pdata = pdata;
+ 
+-	ret = dw_dma_probe(chip, pdata);
++	ret = dw_dma_probe(chip);
+ 	if (ret)
+ 		return ret;
+ 
+@@ -108,6 +109,10 @@ static const struct pci_device_id dw_pci
+ 
+ 	/* Haswell */
+ 	{ PCI_VDEVICE(INTEL, 0x9c60) },
++
++	/* Broadwell */
++	{ PCI_VDEVICE(INTEL, 0x9ce0) },
++
+ 	{ }
+ };
+ MODULE_DEVICE_TABLE(pci, dw_pci_id_table);
+--- a/drivers/dma/dw/platform.c	2016-05-21 23:13:19.964478443 +0200
++++ b/drivers/dma/dw/platform.c	2016-05-21 22:47:08.665465180 +0200
+@@ -42,13 +42,13 @@ static struct dma_chan *dw_dma_of_xlate(
+ 
+ 	slave.src_id = dma_spec->args[0];
+ 	slave.dst_id = dma_spec->args[0];
+-	slave.src_master = dma_spec->args[1];
+-	slave.dst_master = dma_spec->args[2];
++	slave.m_master = dma_spec->args[1];
++	slave.p_master = dma_spec->args[2];
+ 
+ 	if (WARN_ON(slave.src_id >= DW_DMA_MAX_NR_REQUESTS ||
+ 		    slave.dst_id >= DW_DMA_MAX_NR_REQUESTS ||
+-		    slave.src_master >= dw->nr_masters ||
+-		    slave.dst_master >= dw->nr_masters))
++		    slave.m_master >= dw->pdata->nr_masters ||
++		    slave.p_master >= dw->pdata->nr_masters))
+ 		return NULL;
+ 
+ 	dma_cap_zero(cap);
+@@ -66,8 +66,8 @@ static bool dw_dma_acpi_filter(struct dm
+ 		.dma_dev = dma_spec->dev,
+ 		.src_id = dma_spec->slave_id,
+ 		.dst_id = dma_spec->slave_id,
+-		.src_master = 1,
+-		.dst_master = 0,
++		.m_master = 0,
++		.p_master = 1,
+ 	};
+ 
+ 	return dw_dma_filter(chan, &slave);
+@@ -103,18 +103,28 @@ dw_dma_parse_dt(struct platform_device *
+ 	struct device_node *np = pdev->dev.of_node;
+ 	struct dw_dma_platform_data *pdata;
+ 	u32 tmp, arr[DW_DMA_MAX_NR_MASTERS];
++	u32 nr_masters;
++	u32 nr_channels;
+ 
+ 	if (!np) {
+ 		dev_err(&pdev->dev, "Missing DT data\n");
+ 		return NULL;
+ 	}
+ 
++	if (of_property_read_u32(np, "dma-masters", &nr_masters))
++		return NULL;
++	if (nr_masters < 1 || nr_masters > DW_DMA_MAX_NR_MASTERS)
++		return NULL;
++
++	if (of_property_read_u32(np, "dma-channels", &nr_channels))
++		return NULL;
++
+ 	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+ 	if (!pdata)
+ 		return NULL;
+ 
+-	if (of_property_read_u32(np, "dma-channels", &pdata->nr_channels))
+-		return NULL;
++	pdata->nr_masters = nr_masters;
++	pdata->nr_channels = nr_channels;
+ 
+ 	if (of_property_read_bool(np, "is_private"))
+ 		pdata->is_private = true;
+@@ -128,17 +138,13 @@ dw_dma_parse_dt(struct platform_device *
+ 	if (!of_property_read_u32(np, "block_size", &tmp))
+ 		pdata->block_size = tmp;
+ 
+-	if (!of_property_read_u32(np, "dma-masters", &tmp)) {
+-		if (tmp > DW_DMA_MAX_NR_MASTERS)
+-			return NULL;
+-
+-		pdata->nr_masters = tmp;
+-	}
+-
+-	if (!of_property_read_u32_array(np, "data_width", arr,
+-				pdata->nr_masters))
+-		for (tmp = 0; tmp < pdata->nr_masters; tmp++)
++	if (!of_property_read_u32_array(np, "data-width", arr, nr_masters)) {
++		for (tmp = 0; tmp < nr_masters; tmp++)
+ 			pdata->data_width[tmp] = arr[tmp];
++	} else if (!of_property_read_u32_array(np, "data_width", arr, nr_masters)) {
++		for (tmp = 0; tmp < nr_masters; tmp++)
++			pdata->data_width[tmp] = BIT(arr[tmp] & 0x07);
++	}
+ 
+ 	return pdata;
+ }
+@@ -155,8 +161,7 @@ static int dw_probe(struct platform_devi
+ 	struct dw_dma_chip *chip;
+ 	struct device *dev = &pdev->dev;
+ 	struct resource *mem;
+-	const struct acpi_device_id *id;
+-	struct dw_dma_platform_data *pdata;
++	const struct dw_dma_platform_data *pdata;
+ 	int err;
+ 
+ 	chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL);
+@@ -179,13 +184,9 @@ static int dw_probe(struct platform_devi
+ 	pdata = dev_get_platdata(dev);
+ 	if (!pdata)
+ 		pdata = dw_dma_parse_dt(pdev);
+-	if (!pdata && has_acpi_companion(dev)) {
+-		id = acpi_match_device(dev->driver->acpi_match_table, dev);
+-		if (id)
+-			pdata = (struct dw_dma_platform_data *)id->driver_data;
+-	}
+ 
+ 	chip->dev = dev;
++	chip->pdata = pdata;
+ 
+ 	chip->clk = devm_clk_get(chip->dev, "hclk");
+ 	if (IS_ERR(chip->clk))
+@@ -196,7 +197,7 @@ static int dw_probe(struct platform_devi
+ 
+ 	pm_runtime_enable(&pdev->dev);
+ 
+-	err = dw_dma_probe(chip, pdata);
++	err = dw_dma_probe(chip);
+ 	if (err)
+ 		goto err_dw_dma_probe;
+ 
+@@ -239,7 +240,19 @@ static void dw_shutdown(struct platform_
+ {
+ 	struct dw_dma_chip *chip = platform_get_drvdata(pdev);
+ 
++	/*
++	 * We have to call dw_dma_disable() to stop any ongoing transfer. On
++	 * some platforms we can't do that since the DMA device is powered off.
++	 * Moreover, we have no way to check whether the platform is affected
++	 * or not. That's why we call pm_runtime_get_sync() /
++	 * pm_runtime_put_sync_suspend() unconditionally. On the other hand, we
++	 * can't use pm_runtime_suspended() because the runtime PM framework is
++	 * not fully used by the driver.
++	 */
++	pm_runtime_get_sync(chip->dev);
+ 	dw_dma_disable(chip);
++	pm_runtime_put_sync_suspend(chip->dev);
++
+ 	clk_disable_unprepare(chip->clk);
+ }
+ 
+@@ -252,17 +265,8 @@ MODULE_DEVICE_TABLE(of, dw_dma_of_id_tab
+ #endif
+ 
+ #ifdef CONFIG_ACPI
+-static struct dw_dma_platform_data dw_dma_acpi_pdata = {
+-	.nr_channels = 8,
+-	.is_private = true,
+-	.chan_allocation_order = CHAN_ALLOCATION_ASCENDING,
+-	.chan_priority = CHAN_PRIORITY_ASCENDING,
+-	.block_size = 4095,
+-	.nr_masters = 2,
+-};
+-
+ static const struct acpi_device_id dw_dma_acpi_id_table[] = {
+-	{ "INTL9C60", (kernel_ulong_t)&dw_dma_acpi_pdata },
++	{ "INTL9C60", 0 },
+ 	{ }
+ };
+ MODULE_DEVICE_TABLE(acpi, dw_dma_acpi_id_table);
+--- a/drivers/dma/dw/regs.h	2016-05-21 23:13:19.964478443 +0200
++++ b/drivers/dma/dw/regs.h	2016-05-21 22:47:08.665465180 +0200
+@@ -114,10 +114,6 @@ struct dw_dma_regs {
+ #define dma_writel_native writel
+ #endif
+ 
+-/* To access the registers in early stage of probe */
+-#define dma_read_byaddr(addr, name) \
+-	dma_readl_native((addr) + offsetof(struct dw_dma_regs, name))
+-
+ /* Bitfields in DW_PARAMS */
+ #define DW_PARAMS_NR_CHAN	8		/* number of channels */
+ #define DW_PARAMS_NR_MASTER	11		/* number of AHB masters */
+@@ -143,6 +139,10 @@ enum dw_dma_msize {
+ 	DW_DMA_MSIZE_256,
+ };
+ 
++/* Bitfields in LLP */
++#define DWC_LLP_LMS(x)		((x) & 3)	/* list master select */
++#define DWC_LLP_LOC(x)		((x) & ~3)	/* next lli */
++
+ /* Bitfields in CTL_LO */
+ #define DWC_CTLL_INT_EN		(1 << 0)	/* irqs enabled? */
+ #define DWC_CTLL_DST_WIDTH(n)	((n)<<1)	/* bytes per element */
+@@ -150,7 +150,7 @@ enum dw_dma_msize {
+ #define DWC_CTLL_DST_INC	(0<<7)		/* DAR update/not */
+ #define DWC_CTLL_DST_DEC	(1<<7)
+ #define DWC_CTLL_DST_FIX	(2<<7)
+-#define DWC_CTLL_SRC_INC	(0<<7)		/* SAR update/not */
++#define DWC_CTLL_SRC_INC	(0<<9)		/* SAR update/not */
+ #define DWC_CTLL_SRC_DEC	(1<<9)
+ #define DWC_CTLL_SRC_FIX	(2<<9)
+ #define DWC_CTLL_DST_MSIZE(n)	((n)<<11)	/* burst, #elements */
+@@ -216,6 +216,8 @@ enum dw_dma_msize {
+ enum dw_dmac_flags {
+ 	DW_DMA_IS_CYCLIC = 0,
+ 	DW_DMA_IS_SOFT_LLP = 1,
++	DW_DMA_IS_PAUSED = 2,
++	DW_DMA_IS_INITIALIZED = 3,
+ };
+ 
+ struct dw_dma_chan {
+@@ -224,8 +226,6 @@ struct dw_dma_chan {
+ 	u8				mask;
+ 	u8				priority;
+ 	enum dma_transfer_direction	direction;
+-	bool				paused;
+-	bool				initialized;
+ 
+ 	/* software emulation of the LLP transfers */
+ 	struct list_head	*tx_node_active;
+@@ -236,8 +236,6 @@ struct dw_dma_chan {
+ 	unsigned long		flags;
+ 	struct list_head	active_list;
+ 	struct list_head	queue;
+-	struct list_head	free_list;
+-	u32			residue;
+ 	struct dw_cyclic_desc	*cdesc;
+ 
+ 	unsigned int		descs_allocated;
+@@ -249,8 +247,8 @@ struct dw_dma_chan {
+ 	/* custom slave configuration */
+ 	u8			src_id;
+ 	u8			dst_id;
+-	u8			src_master;
+-	u8			dst_master;
++	u8			m_master;
++	u8			p_master;
+ 
+ 	/* configuration passed via .device_config */
+ 	struct dma_slave_config dma_sconfig;
+@@ -283,9 +281,8 @@ struct dw_dma {
+ 	u8			all_chan_mask;
+ 	u8			in_use;
+ 
+-	/* hardware configuration */
+-	unsigned char		nr_masters;
+-	unsigned char		data_width[DW_DMA_MAX_NR_MASTERS];
++	/* platform data */
++	struct dw_dma_platform_data	*pdata;
+ };
+ 
+ static inline struct dw_dma_regs __iomem *__dw_regs(struct dw_dma *dw)
+@@ -308,32 +305,51 @@ static inline struct dw_dma *to_dw_dma(s
+ 	return container_of(ddev, struct dw_dma, dma);
+ }
+ 
++#ifdef CONFIG_DW_DMAC_BIG_ENDIAN_IO
++typedef __be32 __dw32;
++#else
++typedef __le32 __dw32;
++#endif
++
+ /* LLI == Linked List Item; a.k.a. DMA block descriptor */
+ struct dw_lli {
+ 	/* values that are not changed by hardware */
+-	u32		sar;
+-	u32		dar;
+-	u32		llp;		/* chain to next lli */
+-	u32		ctllo;
++	__dw32		sar;
++	__dw32		dar;
++	__dw32		llp;		/* chain to next lli */
++	__dw32		ctllo;
+ 	/* values that may get written back: */
+-	u32		ctlhi;
++	__dw32		ctlhi;
+ 	/* sstat and dstat can snapshot peripheral register state.
+ 	 * silicon config may discard either or both...
+ 	 */
+-	u32		sstat;
+-	u32		dstat;
++	__dw32		sstat;
++	__dw32		dstat;
+ };
+ 
+ struct dw_desc {
+ 	/* FIRST values the hardware uses */
+ 	struct dw_lli			lli;
+ 
++#ifdef CONFIG_DW_DMAC_BIG_ENDIAN_IO
++#define lli_set(d, reg, v)		((d)->lli.reg |= cpu_to_be32(v))
++#define lli_clear(d, reg, v)		((d)->lli.reg &= ~cpu_to_be32(v))
++#define lli_read(d, reg)		be32_to_cpu((d)->lli.reg)
++#define lli_write(d, reg, v)		((d)->lli.reg = cpu_to_be32(v))
++#else
++#define lli_set(d, reg, v)		((d)->lli.reg |= cpu_to_le32(v))
++#define lli_clear(d, reg, v)		((d)->lli.reg &= ~cpu_to_le32(v))
++#define lli_read(d, reg)		le32_to_cpu((d)->lli.reg)
++#define lli_write(d, reg, v)		((d)->lli.reg = cpu_to_le32(v))
++#endif
++
+ 	/* THEN values for driver housekeeping */
+ 	struct list_head		desc_node;
+ 	struct list_head		tx_list;
+ 	struct dma_async_tx_descriptor	txd;
+ 	size_t				len;
+ 	size_t				total_len;
++	u32				residue;
+ };
+ 
+ #define to_dw_desc(h)	list_entry(h, struct dw_desc, desc_node)
+--- a/include/linux/dma/dw.h
++++ b/include/linux/dma/dw.h
+@@ -27,6 +27,7 @@ struct dw_dma;
+  * @regs:		memory mapped I/O space
+  * @clk:		hclk clock
+  * @dw:			struct dw_dma that is filed by dw_dma_probe()
++ * @pdata:		pointer to platform data
+  */
+ struct dw_dma_chip {
+ 	struct device	*dev;
+@@ -34,10 +35,12 @@ struct dw_dma_chip {
+ 	void __iomem	*regs;
+ 	struct clk	*clk;
+ 	struct dw_dma	*dw;
++
++	const struct dw_dma_platform_data	*pdata;
+ };
+ 
+ /* Export to the platform drivers */
+-int dw_dma_probe(struct dw_dma_chip *chip, struct dw_dma_platform_data *pdata);
++int dw_dma_probe(struct dw_dma_chip *chip);
+ int dw_dma_remove(struct dw_dma_chip *chip);
+ 
+ /* DMA API extensions */
+diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h
+index 03b6095..d15d8ba 100644
+--- a/include/linux/platform_data/dma-dw.h
++++ b/include/linux/platform_data/dma-dw.h
+@@ -21,15 +21,15 @@
+  * @dma_dev:	required DMA master device
+  * @src_id:	src request line
+  * @dst_id:	dst request line
+- * @src_master: src master for transfers on allocated channel.
+- * @dst_master: dest master for transfers on allocated channel.
++ * @m_master:	memory master for transfers on allocated channel
++ * @p_master:	peripheral master for transfers on allocated channel
+  */
+ struct dw_dma_slave {
+ 	struct device		*dma_dev;
+ 	u8			src_id;
+ 	u8			dst_id;
+-	u8			src_master;
+-	u8			dst_master;
++	u8			m_master;
++	u8			p_master;
+ };
+ 
+ /**
+@@ -43,7 +43,7 @@ struct dw_dma_slave {
+  * @block_size: Maximum block size supported by the controller
+  * @nr_masters: Number of AHB masters supported by the controller
+  * @data_width: Maximum data width supported by hardware per AHB master
+- *		(0 - 8bits, 1 - 16bits, ..., 5 - 256bits)
++ *		(in bytes, power of 2)
+  */
+ struct dw_dma_platform_data {
+ 	unsigned int	nr_channels;
+@@ -55,7 +55,7 @@ struct dw_dma_platform_data {
+ #define CHAN_PRIORITY_ASCENDING		0	/* chan0 highest */
+ #define CHAN_PRIORITY_DESCENDING	1	/* chan7 highest */
+ 	unsigned char	chan_priority;
+-	unsigned short	block_size;
++	unsigned int	block_size;
+ 	unsigned char	nr_masters;
+ 	unsigned char	data_width[DW_DMA_MAX_NR_MASTERS];
+ };
+-- 
+2.8.1
+
-- 
2.8.1