Merge tag 'dmaengine-fix-4.7-rc4' of git://git.infradead.org/users/vkoul/slave-dma

Pull dmaengine fixes from Vinod Koul:
 "Some fixes has piled up, so time to send them upstream.

  These fixes include:
   - at_xdmac fixes for residue and other stuff
   - update MAINTAINERS for dma dt bindings
   - mv_xor fix for incorrect offset"

* tag 'dmaengine-fix-4.7-rc4' of git://git.infradead.org/users/vkoul/slave-dma:
  dmaengine: mv_xor: Fix incorrect offset in dma_map_page()
  dmaengine: at_xdmac: double FIFO flush needed to compute residue
  dmaengine: at_xdmac: fix residue corruption
  dmaengine: at_xdmac: align descriptors on 64 bits
  MAINTAINERS: Add file patterns for dma device tree bindings
diff --git a/MAINTAINERS b/MAINTAINERS
index 98f8a5c..e1b090f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3778,6 +3778,7 @@
 S:	Maintained
 F:	drivers/dma/
 F:	include/linux/dmaengine.h
+F:	Documentation/devicetree/bindings/dma/
 F:	Documentation/dmaengine/
 T:	git git://git.infradead.org/users/vkoul/slave-dma.git
 
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 8e304b1..75bd662 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -242,7 +242,7 @@
 	u32		mbr_dus;	/* Destination Microblock Stride Register */
 };
 
-
+/* 64-bit alignment needed to update CNDA and CUBC registers in an atomic way. */
 struct at_xdmac_desc {
 	struct at_xdmac_lld		lld;
 	enum dma_transfer_direction	direction;
@@ -253,7 +253,7 @@
 	unsigned int			xfer_size;
 	struct list_head		descs_list;
 	struct list_head		xfer_node;
-};
+} __aligned(sizeof(u64));
 
 static inline void __iomem *at_xdmac_chan_reg_base(struct at_xdmac *atxdmac, unsigned int chan_nb)
 {
@@ -1400,6 +1400,7 @@
 	u32			cur_nda, check_nda, cur_ubc, mask, value;
 	u8			dwidth = 0;
 	unsigned long		flags;
+	bool			initd;
 
 	ret = dma_cookie_status(chan, cookie, txstate);
 	if (ret == DMA_COMPLETE)
@@ -1424,7 +1425,16 @@
 	residue = desc->xfer_size;
 	/*
 	 * Flush FIFO: only relevant when the transfer is source peripheral
-	 * synchronized.
+	 * synchronized. Flush is needed before reading CUBC because data in
+	 * the FIFO are not reported by CUBC. Reporting a residue of the
+	 * transfer length while we have data in FIFO can cause issue.
+	 * Usecase: atmel USART has a timeout which means I have received
+	 * characters but there is no more character received for a while. On
+	 * timeout, it requests the residue. If the data are in the DMA FIFO,
+	 * we will return a residue of the transfer length. It means no data
+	 * received. If an application is waiting for these data, it will hang
+	 * since we won't have another USART timeout without receiving new
+	 * data.
 	 */
 	mask = AT_XDMAC_CC_TYPE | AT_XDMAC_CC_DSYNC;
 	value = AT_XDMAC_CC_TYPE_PER_TRAN | AT_XDMAC_CC_DSYNC_PER2MEM;
@@ -1435,34 +1445,43 @@
 	}
 
 	/*
-	 * When processing the residue, we need to read two registers but we
-	 * can't do it in an atomic way. AT_XDMAC_CNDA is used to find where
-	 * we stand in the descriptor list and AT_XDMAC_CUBC is used
-	 * to know how many data are remaining for the current descriptor.
-	 * Since the dma channel is not paused to not loose data, between the
-	 * AT_XDMAC_CNDA and AT_XDMAC_CUBC read, we may have change of
-	 * descriptor.
-	 * For that reason, after reading AT_XDMAC_CUBC, we check if we are
-	 * still using the same descriptor by reading a second time
-	 * AT_XDMAC_CNDA. If AT_XDMAC_CNDA has changed, it means we have to
-	 * read again AT_XDMAC_CUBC.
+	 * The easiest way to compute the residue should be to pause the DMA
+	 * but doing this can lead to miss some data as some devices don't
+	 * have FIFO.
+	 * We need to read several registers because:
+	 * - DMA is running therefore a descriptor change is possible while
+	 * reading these registers
+	 * - When the block transfer is done, the value of the CUBC register
+	 * is set to its initial value until the fetch of the next descriptor.
+	 * This value will corrupt the residue calculation so we have to skip
+	 * it.
+	 *
+	 * INITD --------                    ------------
+	 *              |____________________|
+	 *       _______________________  _______________
+	 * NDA       @desc2             \/   @desc3
+	 *       _______________________/\_______________
+	 *       __________  ___________  _______________
+	 * CUBC       0    \/ MAX desc1 \/  MAX desc2
+	 *       __________/\___________/\_______________
+	 *
+	 * Since descriptors are aligned on 64 bits, we can assume that
+	 * the update of NDA and CUBC is atomic.
 	 * Memory barriers are used to ensure the read order of the registers.
-	 * A max number of retries is set because unlikely it can never ends if
-	 * we are transferring a lot of data with small buffers.
+	 * A max number of retries is set because unlikely it could never ends.
 	 */
-	cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
-	rmb();
-	cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
 	for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) {
-		rmb();
 		check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
-
-		if (likely(cur_nda == check_nda))
-			break;
-
-		cur_nda = check_nda;
+		rmb();
+		initd = !!(at_xdmac_chan_read(atchan, AT_XDMAC_CC) & AT_XDMAC_CC_INITD);
 		rmb();
 		cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+		rmb();
+		cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+		rmb();
+
+		if ((check_nda == cur_nda) && initd)
+			break;
 	}
 
 	if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) {
@@ -1471,6 +1490,19 @@
 	}
 
 	/*
+	 * Flush FIFO: only relevant when the transfer is source peripheral
+	 * synchronized. Another flush is needed here because CUBC is updated
+	 * when the controller sends the data write command. It can lead to
+	 * report data that are not written in the memory or the device. The
+	 * FIFO flush ensures that data are really written.
+	 */
+	if ((desc->lld.mbr_cfg & mask) == value) {
+		at_xdmac_write(atxdmac, AT_XDMAC_GSWF, atchan->mask);
+		while (!(at_xdmac_chan_read(atchan, AT_XDMAC_CIS) & AT_XDMAC_CIS_FIS))
+			cpu_relax();
+	}
+
+	/*
 	 * Remove size of all microblocks already transferred and the current
 	 * one. Then add the remaining size to transfer of the current
 	 * microblock.
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index 25d1dad..d0446a7 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -703,8 +703,9 @@
 		goto free_resources;
 	}
 
-	src_dma = dma_map_page(dma_chan->device->dev, virt_to_page(src), 0,
-				 PAGE_SIZE, DMA_TO_DEVICE);
+	src_dma = dma_map_page(dma_chan->device->dev, virt_to_page(src),
+			       (size_t)src & ~PAGE_MASK, PAGE_SIZE,
+			       DMA_TO_DEVICE);
 	unmap->addr[0] = src_dma;
 
 	ret = dma_mapping_error(dma_chan->device->dev, src_dma);
@@ -714,8 +715,9 @@
 	}
 	unmap->to_cnt = 1;
 
-	dest_dma = dma_map_page(dma_chan->device->dev, virt_to_page(dest), 0,
-				  PAGE_SIZE, DMA_FROM_DEVICE);
+	dest_dma = dma_map_page(dma_chan->device->dev, virt_to_page(dest),
+				(size_t)dest & ~PAGE_MASK, PAGE_SIZE,
+				DMA_FROM_DEVICE);
 	unmap->addr[1] = dest_dma;
 
 	ret = dma_mapping_error(dma_chan->device->dev, dest_dma);