Merge tag 'dmaengine-fix-4.2-rc5' of git://git.infradead.org/users/vkoul/slave-dma

Pull dmaengine fixes from Vinod Koul:
 "We had a regression due to reuse of descriptor so we have reverted
  that.

  The rest are driver fixes:

   - at_hdmac and at_xdmac for residue, trannfer width, and channel config
   - pl330 final fix for dma fails and overflow issue
   - xgene resouce map fix
   - mv_xor big endian op fix"

* tag 'dmaengine-fix-4.2-rc5' of git://git.infradead.org/users/vkoul/slave-dma:
  Revert "dmaengine: virt-dma: don't always free descriptor upon completion"
  dmaengine: mv_xor: fix big endian operation in register mode
  dmaengine: xgene-dma: Fix the resource map to handle overlapping
  dmaengine: at_xdmac: fix transfer data width in at_xdmac_prep_slave_sg()
  dmaengine: at_hdmac: fix residue computation
  dmaengine: at_xdmac: fix bug about channel configuration
  dmaengine: pl330: Really fix choppy sound because of wrong residue calculation
  dmaengine: pl330: Fix overflow when reporting residue in memcpy
diff --git a/Documentation/devicetree/bindings/dma/apm-xgene-dma.txt b/Documentation/devicetree/bindings/dma/apm-xgene-dma.txt
index d305876..c53e0b0 100644
--- a/Documentation/devicetree/bindings/dma/apm-xgene-dma.txt
+++ b/Documentation/devicetree/bindings/dma/apm-xgene-dma.txt
@@ -35,7 +35,7 @@
 			device_type = "dma";
 			reg = <0x0 0x1f270000 0x0 0x10000>,
 			      <0x0 0x1f200000 0x0 0x10000>,
-			      <0x0 0x1b008000 0x0 0x2000>,
+			      <0x0 0x1b000000 0x0 0x400000>,
 			      <0x0 0x1054a000 0x0 0x100>;
 			interrupts = <0x0 0x82 0x4>,
 				     <0x0 0xb8 0x4>,
diff --git a/arch/arm64/boot/dts/apm/apm-storm.dtsi b/arch/arm64/boot/dts/apm/apm-storm.dtsi
index 0689c3f..58093ed 100644
--- a/arch/arm64/boot/dts/apm/apm-storm.dtsi
+++ b/arch/arm64/boot/dts/apm/apm-storm.dtsi
@@ -823,7 +823,7 @@
 			device_type = "dma";
 			reg = <0x0 0x1f270000 0x0 0x10000>,
 			      <0x0 0x1f200000 0x0 0x10000>,
-			      <0x0 0x1b008000 0x0 0x2000>,
+			      <0x0 0x1b000000 0x0 0x400000>,
 			      <0x0 0x1054a000 0x0 0x100>;
 			interrupts = <0x0 0x82 0x4>,
 				     <0x0 0xb8 0x4>,
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 59892126..d3629b7 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -48,6 +48,8 @@
 	BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) |\
 	BIT(DMA_SLAVE_BUSWIDTH_4_BYTES))
 
+#define ATC_MAX_DSCR_TRIALS	10
+
 /*
  * Initial number of descriptors to allocate for each channel. This could
  * be increased during dma usage.
@@ -285,28 +287,19 @@
  *
  * @current_len: the number of bytes left before reading CTRLA
  * @ctrla: the value of CTRLA
- * @desc: the descriptor containing the transfer width
  */
-static inline int atc_calc_bytes_left(int current_len, u32 ctrla,
-					struct at_desc *desc)
+static inline int atc_calc_bytes_left(int current_len, u32 ctrla)
 {
-	return current_len - ((ctrla & ATC_BTSIZE_MAX) << desc->tx_width);
-}
+	u32 btsize = (ctrla & ATC_BTSIZE_MAX);
+	u32 src_width = ATC_REG_TO_SRC_WIDTH(ctrla);
 
-/**
- * atc_calc_bytes_left_from_reg - calculates the number of bytes left according
- * to the current value of CTRLA.
- *
- * @current_len: the number of bytes left before reading CTRLA
- * @atchan: the channel to read CTRLA for
- * @desc: the descriptor containing the transfer width
- */
-static inline int atc_calc_bytes_left_from_reg(int current_len,
-			struct at_dma_chan *atchan, struct at_desc *desc)
-{
-	u32 ctrla = channel_readl(atchan, CTRLA);
-
-	return atc_calc_bytes_left(current_len, ctrla, desc);
+	/*
+	 * According to the datasheet, when reading the Control A Register
+	 * (ctrla), the Buffer Transfer Size (btsize) bitfield refers to the
+	 * number of transfers completed on the Source Interface.
+	 * So btsize is always a number of source width transfers.
+	 */
+	return current_len - (btsize << src_width);
 }
 
 /**
@@ -320,7 +313,7 @@
 	struct at_desc *desc_first = atc_first_active(atchan);
 	struct at_desc *desc;
 	int ret;
-	u32 ctrla, dscr;
+	u32 ctrla, dscr, trials;
 
 	/*
 	 * If the cookie doesn't match to the currently running transfer then
@@ -346,15 +339,82 @@
 		 * the channel's DSCR register and compare it against the value
 		 * of the hardware linked list structure of each child
 		 * descriptor.
+		 *
+		 * The CTRLA register provides us with the amount of data
+		 * already read from the source for the current child
+		 * descriptor. So we can compute a more accurate residue by also
+		 * removing the number of bytes corresponding to this amount of
+		 * data.
+		 *
+		 * However, the DSCR and CTRLA registers cannot be read both
+		 * atomically. Hence a race condition may occur: the first read
+		 * register may refer to one child descriptor whereas the second
+		 * read may refer to a later child descriptor in the list
+		 * because of the DMA transfer progression inbetween the two
+		 * reads.
+		 *
+		 * One solution could have been to pause the DMA transfer, read
+		 * the DSCR and CTRLA then resume the DMA transfer. Nonetheless,
+		 * this approach presents some drawbacks:
+		 * - If the DMA transfer is paused, RX overruns or TX underruns
+		 *   are more likey to occur depending on the system latency.
+		 *   Taking the USART driver as an example, it uses a cyclic DMA
+		 *   transfer to read data from the Receive Holding Register
+		 *   (RHR) to avoid RX overruns since the RHR is not protected
+		 *   by any FIFO on most Atmel SoCs. So pausing the DMA transfer
+		 *   to compute the residue would break the USART driver design.
+		 * - The atc_pause() function masks interrupts but we'd rather
+		 *   avoid to do so for system latency purpose.
+		 *
+		 * Then we'd rather use another solution: the DSCR is read a
+		 * first time, the CTRLA is read in turn, next the DSCR is read
+		 * a second time. If the two consecutive read values of the DSCR
+		 * are the same then we assume both refers to the very same
+		 * child descriptor as well as the CTRLA value read inbetween
+		 * does. For cyclic tranfers, the assumption is that a full loop
+		 * is "not so fast".
+		 * If the two DSCR values are different, we read again the CTRLA
+		 * then the DSCR till two consecutive read values from DSCR are
+		 * equal or till the maxium trials is reach.
+		 * This algorithm is very unlikely not to find a stable value for
+		 * DSCR.
 		 */
 
-		ctrla = channel_readl(atchan, CTRLA);
-		rmb(); /* ensure CTRLA is read before DSCR */
 		dscr = channel_readl(atchan, DSCR);
+		rmb(); /* ensure DSCR is read before CTRLA */
+		ctrla = channel_readl(atchan, CTRLA);
+		for (trials = 0; trials < ATC_MAX_DSCR_TRIALS; ++trials) {
+			u32 new_dscr;
+
+			rmb(); /* ensure DSCR is read after CTRLA */
+			new_dscr = channel_readl(atchan, DSCR);
+
+			/*
+			 * If the DSCR register value has not changed inside the
+			 * DMA controller since the previous read, we assume
+			 * that both the dscr and ctrla values refers to the
+			 * very same descriptor.
+			 */
+			if (likely(new_dscr == dscr))
+				break;
+
+			/*
+			 * DSCR has changed inside the DMA controller, so the
+			 * previouly read value of CTRLA may refer to an already
+			 * processed descriptor hence could be outdated.
+			 * We need to update ctrla to match the current
+			 * descriptor.
+			 */
+			dscr = new_dscr;
+			rmb(); /* ensure DSCR is read before CTRLA */
+			ctrla = channel_readl(atchan, CTRLA);
+		}
+		if (unlikely(trials >= ATC_MAX_DSCR_TRIALS))
+			return -ETIMEDOUT;
 
 		/* for the first descriptor we can be more accurate */
 		if (desc_first->lli.dscr == dscr)
-			return atc_calc_bytes_left(ret, ctrla, desc_first);
+			return atc_calc_bytes_left(ret, ctrla);
 
 		ret -= desc_first->len;
 		list_for_each_entry(desc, &desc_first->tx_list, desc_node) {
@@ -365,16 +425,14 @@
 		}
 
 		/*
-		 * For the last descriptor in the chain we can calculate
+		 * For the current descriptor in the chain we can calculate
 		 * the remaining bytes using the channel's register.
-		 * Note that the transfer width of the first and last
-		 * descriptor may differ.
 		 */
-		if (!desc->lli.dscr)
-			ret = atc_calc_bytes_left_from_reg(ret, atchan, desc);
+		ret = atc_calc_bytes_left(ret, ctrla);
 	} else {
 		/* single transfer */
-		ret = atc_calc_bytes_left_from_reg(ret, atchan, desc_first);
+		ctrla = channel_readl(atchan, CTRLA);
+		ret = atc_calc_bytes_left(ret, ctrla);
 	}
 
 	return ret;
@@ -726,7 +784,6 @@
 
 	desc->txd.cookie = -EBUSY;
 	desc->total_len = desc->len = len;
-	desc->tx_width = dwidth;
 
 	/* set end-of-link to the last link descriptor of list*/
 	set_desc_eol(desc);
@@ -804,10 +861,6 @@
 	first->txd.cookie = -EBUSY;
 	first->total_len = len;
 
-	/* set transfer width for the calculation of the residue */
-	first->tx_width = src_width;
-	prev->tx_width = src_width;
-
 	/* set end-of-link to the last link descriptor of list*/
 	set_desc_eol(desc);
 
@@ -956,10 +1009,6 @@
 	first->txd.cookie = -EBUSY;
 	first->total_len = total_len;
 
-	/* set transfer width for the calculation of the residue */
-	first->tx_width = reg_width;
-	prev->tx_width = reg_width;
-
 	/* first link descriptor of list is responsible of flags */
 	first->txd.flags = flags; /* client is in control of this ack */
 
@@ -1077,12 +1126,6 @@
 		desc->txd.cookie = 0;
 		desc->len = len;
 
-		/*
-		 * Although we only need the transfer width for the first and
-		 * the last descriptor, its easier to set it to all descriptors.
-		 */
-		desc->tx_width = src_width;
-
 		atc_desc_chain(&first, &prev, desc);
 
 		/* update the lengths and addresses for the next loop cycle */
@@ -1256,7 +1299,6 @@
 	/* First descriptor of the chain embedds additional information */
 	first->txd.cookie = -EBUSY;
 	first->total_len = buf_len;
-	first->tx_width = reg_width;
 
 	return &first->txd;
 
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
index bc8d5eb..7f5a082 100644
--- a/drivers/dma/at_hdmac_regs.h
+++ b/drivers/dma/at_hdmac_regs.h
@@ -112,6 +112,7 @@
 #define		ATC_SRC_WIDTH_BYTE	(0x0 << 24)
 #define		ATC_SRC_WIDTH_HALFWORD	(0x1 << 24)
 #define		ATC_SRC_WIDTH_WORD	(0x2 << 24)
+#define		ATC_REG_TO_SRC_WIDTH(r)	(((r) >> 24) & 0x3)
 #define	ATC_DST_WIDTH_MASK	(0x3 << 28)	/* Destination Single Transfer Size */
 #define		ATC_DST_WIDTH(x)	((x) << 28)
 #define		ATC_DST_WIDTH_BYTE	(0x0 << 28)
@@ -182,7 +183,6 @@
  * @txd: support for the async_tx api
  * @desc_node: node on the channed descriptors list
  * @len: descriptor byte count
- * @tx_width: transfer width
  * @total_len: total transaction byte count
  */
 struct at_desc {
@@ -194,7 +194,6 @@
 	struct dma_async_tx_descriptor	txd;
 	struct list_head		desc_node;
 	size_t				len;
-	u32				tx_width;
 	size_t				total_len;
 
 	/* Interleaved data */
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index cf1213d..40afa2a 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -359,18 +359,19 @@
 	 * descriptor view 2 since some fields of the configuration register
 	 * depend on transfer size and src/dest addresses.
 	 */
-	if (at_xdmac_chan_is_cyclic(atchan)) {
+	if (at_xdmac_chan_is_cyclic(atchan))
 		reg = AT_XDMAC_CNDC_NDVIEW_NDV1;
-		at_xdmac_chan_write(atchan, AT_XDMAC_CC, first->lld.mbr_cfg);
-	} else if (first->lld.mbr_ubc & AT_XDMAC_MBR_UBC_NDV3) {
+	else if (first->lld.mbr_ubc & AT_XDMAC_MBR_UBC_NDV3)
 		reg = AT_XDMAC_CNDC_NDVIEW_NDV3;
-	} else {
-		/*
-		 * No need to write AT_XDMAC_CC reg, it will be done when the
-		 * descriptor is fecthed.
-		 */
+	else
 		reg = AT_XDMAC_CNDC_NDVIEW_NDV2;
-	}
+	/*
+	 * Even if the register will be updated from the configuration in the
+	 * descriptor when using view 2 or higher, the PROT bit won't be set
+	 * properly. This bit can be modified only by using the channel
+	 * configuration register.
+	 */
+	at_xdmac_chan_write(atchan, AT_XDMAC_CC, first->lld.mbr_cfg);
 
 	reg |= AT_XDMAC_CNDC_NDDUP
 	       | AT_XDMAC_CNDC_NDSUP
@@ -681,15 +682,16 @@
 			desc->lld.mbr_sa = mem;
 			desc->lld.mbr_da = atchan->sconfig.dst_addr;
 		}
-		desc->lld.mbr_cfg = atchan->cfg;
-		dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg);
+		dwidth = at_xdmac_get_dwidth(atchan->cfg);
 		fixed_dwidth = IS_ALIGNED(len, 1 << dwidth)
-			       ? at_xdmac_get_dwidth(desc->lld.mbr_cfg)
+			       ? dwidth
 			       : AT_XDMAC_CC_DWIDTH_BYTE;
 		desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV2			/* next descriptor view */
 			| AT_XDMAC_MBR_UBC_NDEN					/* next descriptor dst parameter update */
 			| AT_XDMAC_MBR_UBC_NSEN					/* next descriptor src parameter update */
 			| (len >> fixed_dwidth);				/* microblock length */
+		desc->lld.mbr_cfg = (atchan->cfg & ~AT_XDMAC_CC_DWIDTH_MASK) |
+				    AT_XDMAC_CC_DWIDTH(fixed_dwidth);
 		dev_dbg(chan2dev(chan),
 			 "%s: lld: mbr_sa=%pad, mbr_da=%pad, mbr_ubc=0x%08x\n",
 			 __func__, &desc->lld.mbr_sa, &desc->lld.mbr_da, desc->lld.mbr_ubc);
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index fbaf1ea..f1325f6 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -162,10 +162,11 @@
 	config &= ~0x7;
 	config |= op_mode;
 
-	if (IS_ENABLED(__BIG_ENDIAN))
-		config |= XOR_DESCRIPTOR_SWAP;
-	else
-		config &= ~XOR_DESCRIPTOR_SWAP;
+#if defined(__BIG_ENDIAN)
+	config |= XOR_DESCRIPTOR_SWAP;
+#else
+	config &= ~XOR_DESCRIPTOR_SWAP;
+#endif
 
 	writel_relaxed(config, XOR_CONFIG(chan));
 	chan->current_type = type;
diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
index f513f77..ecab4ea0 100644
--- a/drivers/dma/pl330.c
+++ b/drivers/dma/pl330.c
@@ -2328,7 +2328,7 @@
 			desc->txd.callback = last->txd.callback;
 			desc->txd.callback_param = last->txd.callback_param;
 		}
-		last->last = false;
+		desc->last = false;
 
 		dma_cookie_assign(&desc->txd);
 
@@ -2623,6 +2623,7 @@
 		desc->rqcfg.brst_len = 1;
 
 	desc->rqcfg.brst_len = get_burst_len(desc, len);
+	desc->bytes_requested = len;
 
 	desc->txd.flags = flags;
 
diff --git a/drivers/dma/virt-dma.c b/drivers/dma/virt-dma.c
index 7d2c17d..6f80432 100644
--- a/drivers/dma/virt-dma.c
+++ b/drivers/dma/virt-dma.c
@@ -29,7 +29,7 @@
 	spin_lock_irqsave(&vc->lock, flags);
 	cookie = dma_cookie_assign(tx);
 
-	list_move_tail(&vd->node, &vc->desc_submitted);
+	list_add_tail(&vd->node, &vc->desc_submitted);
 	spin_unlock_irqrestore(&vc->lock, flags);
 
 	dev_dbg(vc->chan.device->dev, "vchan %p: txd %p[%x]: submitted\n",
@@ -83,10 +83,8 @@
 		cb_data = vd->tx.callback_param;
 
 		list_del(&vd->node);
-		if (async_tx_test_ack(&vd->tx))
-			list_add(&vd->node, &vc->desc_allocated);
-		else
-			vc->desc_free(vd);
+
+		vc->desc_free(vd);
 
 		if (cb)
 			cb(cb_data);
@@ -98,13 +96,9 @@
 	while (!list_empty(head)) {
 		struct virt_dma_desc *vd = list_first_entry(head,
 			struct virt_dma_desc, node);
-		if (async_tx_test_ack(&vd->tx)) {
-			list_move_tail(&vd->node, &vc->desc_allocated);
-		} else {
-			dev_dbg(vc->chan.device->dev, "txd %p: freeing\n", vd);
-			list_del(&vd->node);
-			vc->desc_free(vd);
-		}
+		list_del(&vd->node);
+		dev_dbg(vc->chan.device->dev, "txd %p: freeing\n", vd);
+		vc->desc_free(vd);
 	}
 }
 EXPORT_SYMBOL_GPL(vchan_dma_desc_free_list);
@@ -114,7 +108,6 @@
 	dma_cookie_init(&vc->chan);
 
 	spin_lock_init(&vc->lock);
-	INIT_LIST_HEAD(&vc->desc_allocated);
 	INIT_LIST_HEAD(&vc->desc_submitted);
 	INIT_LIST_HEAD(&vc->desc_issued);
 	INIT_LIST_HEAD(&vc->desc_completed);
diff --git a/drivers/dma/virt-dma.h b/drivers/dma/virt-dma.h
index 189e75d..181b952 100644
--- a/drivers/dma/virt-dma.h
+++ b/drivers/dma/virt-dma.h
@@ -29,7 +29,6 @@
 	spinlock_t lock;
 
 	/* protected by vc.lock */
-	struct list_head desc_allocated;
 	struct list_head desc_submitted;
 	struct list_head desc_issued;
 	struct list_head desc_completed;
@@ -56,16 +55,11 @@
 	struct virt_dma_desc *vd, unsigned long tx_flags)
 {
 	extern dma_cookie_t vchan_tx_submit(struct dma_async_tx_descriptor *);
-	unsigned long flags;
 
 	dma_async_tx_descriptor_init(&vd->tx, &vc->chan);
 	vd->tx.flags = tx_flags;
 	vd->tx.tx_submit = vchan_tx_submit;
 
-	spin_lock_irqsave(&vc->lock, flags);
-	list_add_tail(&vd->node, &vc->desc_allocated);
-	spin_unlock_irqrestore(&vc->lock, flags);
-
 	return &vd->tx;
 }
 
@@ -128,8 +122,7 @@
 }
 
 /**
- * vchan_get_all_descriptors - obtain all allocated, submitted and issued
- *                             descriptors
+ * vchan_get_all_descriptors - obtain all submitted and issued descriptors
  * vc: virtual channel to get descriptors from
  * head: list of descriptors found
  *
@@ -141,7 +134,6 @@
 static inline void vchan_get_all_descriptors(struct virt_dma_chan *vc,
 	struct list_head *head)
 {
-	list_splice_tail_init(&vc->desc_allocated, head);
 	list_splice_tail_init(&vc->desc_submitted, head);
 	list_splice_tail_init(&vc->desc_issued, head);
 	list_splice_tail_init(&vc->desc_completed, head);
@@ -149,14 +141,11 @@
 
 static inline void vchan_free_chan_resources(struct virt_dma_chan *vc)
 {
-	struct virt_dma_desc *vd;
 	unsigned long flags;
 	LIST_HEAD(head);
 
 	spin_lock_irqsave(&vc->lock, flags);
 	vchan_get_all_descriptors(vc, &head);
-	list_for_each_entry(vd, &head, node)
-		async_tx_clear_ack(&vd->tx);
 	spin_unlock_irqrestore(&vc->lock, flags);
 
 	vchan_dma_desc_free_list(vc, &head);
diff --git a/drivers/dma/xgene-dma.c b/drivers/dma/xgene-dma.c
index 620fd55ec..dff22ab 100644
--- a/drivers/dma/xgene-dma.c
+++ b/drivers/dma/xgene-dma.c
@@ -111,6 +111,7 @@
 #define XGENE_DMA_MEM_RAM_SHUTDOWN		0xD070
 #define XGENE_DMA_BLK_MEM_RDY			0xD074
 #define XGENE_DMA_BLK_MEM_RDY_VAL		0xFFFFFFFF
+#define XGENE_DMA_RING_CMD_SM_OFFSET		0x8000
 
 /* X-Gene SoC EFUSE csr register and bit defination */
 #define XGENE_SOC_JTAG1_SHADOW			0x18
@@ -1887,6 +1888,8 @@
 		return -ENOMEM;
 	}
 
+	pdma->csr_ring_cmd += XGENE_DMA_RING_CMD_SM_OFFSET;
+
 	/* Get efuse csr region */
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 3);
 	if (!res) {