patches/svcrdma-fix-send_reply-scatter-gather-set-up.patch - pub/scm/linux/kernel/git/lizf/linux-3.4.y-queue - Git at Google

 From 9d11b51ce7c150a69e761e30518f294fc73d55ff Mon Sep 17 00:00:00 2001
 From: Chuck Lever <chuck.lever@oracle.com>
 Date: Thu, 9 Jul 2015 16:45:18 -0400
 Subject: svcrdma: Fix send_reply() scatter/gather set-up

 commit 9d11b51ce7c150a69e761e30518f294fc73d55ff upstream.

 The Linux NFS server returns garbage in the data payload of inline
 NFS/RDMA READ replies. These are READs of under 1000 bytes or so
 where the client has not provided either a reply chunk or a write
 list.

 The NFS server delivers the data payload for an NFS READ reply to
 the transport in an xdr_buf page list. If the NFS client did not
 provide a reply chunk or a write list, send_reply() is supposed to
 set up a separate sge for the page containing the READ data, and
 another sge for XDR padding if needed, then post all of the sges via
 a single SEND Work Request.

 The problem is that send_reply() does not advance through the xdr_buf
 when setting up scatter/gather entries for the SEND WR. It always calls
 dma_map_xdr() with xdr_off set to zero. When there's more than one
 sge, dma_map_xdr() sets up the SEND sges so they all point to the
 xdr_buf's head.

 The current Linux NFS/RDMA client always provides a reply chunk or
 a write list when performing an NFS READ over RDMA. Therefore, it
 does not exercise this particular case. The Linux server has never
 had to use more than one extra sge for building RPC/RDMA replies
 with a Linux client.

 However, an NFS/RDMA client _is_ allowed to send small NFS READs
 without setting up a write list or reply chunk. The NFS READ reply
 fits entirely within the inline reply buffer in this case. This is
 perhaps a more efficient way of performing NFS READs that the Linux
 NFS/RDMA client may some day adopt.

 Fixes: b432e6b3d9c1 ('svcrdma: Change DMA mapping logic to . . .')
 BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=285
 Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
 Signed-off-by: J. Bruce Fields <bfields@redhat.com>
 [lizf: Backported to 3.4: adjust context]
 Signed-off-by: Zefan Li <lizefan@huawei.com>
 ---
  net/sunrpc/xprtrdma/svc_rdma_sendto.c |   11 ++++++++++-
  1 file changed, 10 insertions(+), 1 deletion(-)

 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
 +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
 @@ -545,6 +545,7 @@ static int send_reply(struct svcxprt_rdm
  {
  	struct ib_send_wr send_wr;
  	struct ib_send_wr inv_wr;
 +	u32 xdr_off;
  	int sge_no;
  	int sge_bytes;
  	int page_no;
 @@ -584,8 +585,8 @@ static int send_reply(struct svcxprt_rdm
  	ctxt->direction = DMA_TO_DEVICE;

  	/* Map the payload indicated by 'byte_count' */
 +	xdr_off = 0;
  	for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
 -		int xdr_off = 0;
  		sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
  		byte_count -= sge_bytes;
  		if (!vec->frmr) {
 @@ -623,6 +624,14 @@ static int send_reply(struct svcxprt_rdm
  		if (page_no+1 >= sge_no)
  			ctxt->sge[page_no+1].length = 0;
  	}
 +
 +	/* The loop above bumps sc_dma_used for each sge. The
 +	 * xdr_buf.tail gets a separate sge, but resides in the
 +	 * same page as xdr_buf.head. Don't count it twice.
 +	 */
 +	if (sge_no > ctxt->count)
 +		atomic_dec(&rdma->sc_dma_used);
 +
  	BUG_ON(sge_no > rdma->sc_max_sge);
  	memset(&send_wr, 0, sizeof send_wr);
  	ctxt->wr_op = IB_WR_SEND;
	From 9d11b51ce7c150a69e761e30518f294fc73d55ff Mon Sep 17 00:00:00 2001
	From: Chuck Lever <chuck.lever@oracle.com>
	Date: Thu, 9 Jul 2015 16:45:18 -0400
	Subject: svcrdma: Fix send_reply() scatter/gather set-up

	commit 9d11b51ce7c150a69e761e30518f294fc73d55ff upstream.

	The Linux NFS server returns garbage in the data payload of inline
	NFS/RDMA READ replies. These are READs of under 1000 bytes or so
	where the client has not provided either a reply chunk or a write
	list.

	The NFS server delivers the data payload for an NFS READ reply to
	the transport in an xdr_buf page list. If the NFS client did not
	provide a reply chunk or a write list, send_reply() is supposed to
	set up a separate sge for the page containing the READ data, and
	another sge for XDR padding if needed, then post all of the sges via
	a single SEND Work Request.

	The problem is that send_reply() does not advance through the xdr_buf
	when setting up scatter/gather entries for the SEND WR. It always calls
	dma_map_xdr() with xdr_off set to zero. When there's more than one
	sge, dma_map_xdr() sets up the SEND sges so they all point to the
	xdr_buf's head.

	The current Linux NFS/RDMA client always provides a reply chunk or
	a write list when performing an NFS READ over RDMA. Therefore, it
	does not exercise this particular case. The Linux server has never
	had to use more than one extra sge for building RPC/RDMA replies
	with a Linux client.

	However, an NFS/RDMA client _is_ allowed to send small NFS READs
	without setting up a write list or reply chunk. The NFS READ reply
	fits entirely within the inline reply buffer in this case. This is
	perhaps a more efficient way of performing NFS READs that the Linux
	NFS/RDMA client may some day adopt.

	Fixes: b432e6b3d9c1 ('svcrdma: Change DMA mapping logic to . . .')
	BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=285
	Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
	Signed-off-by: J. Bruce Fields <bfields@redhat.com>
	[lizf: Backported to 3.4: adjust context]
	Signed-off-by: Zefan Li <lizefan@huawei.com>
	---
	net/sunrpc/xprtrdma/svc_rdma_sendto.c | 11 ++++++++++-
	1 file changed, 10 insertions(+), 1 deletion(-)

	--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
	+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
	@@ -545,6 +545,7 @@ static int send_reply(struct svcxprt_rdm
	{
	struct ib_send_wr send_wr;
	struct ib_send_wr inv_wr;
	+ u32 xdr_off;
	int sge_no;
	int sge_bytes;
	int page_no;
	@@ -584,8 +585,8 @@ static int send_reply(struct svcxprt_rdm
	ctxt->direction = DMA_TO_DEVICE;

	/* Map the payload indicated by 'byte_count' */
	+ xdr_off = 0;
	for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
	- int xdr_off = 0;
	sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
	byte_count -= sge_bytes;
	if (!vec->frmr) {
	@@ -623,6 +624,14 @@ static int send_reply(struct svcxprt_rdm
	if (page_no+1 >= sge_no)
	ctxt->sge[page_no+1].length = 0;
	}
	+
	+ /* The loop above bumps sc_dma_used for each sge. The
	+ * xdr_buf.tail gets a separate sge, but resides in the
	+ * same page as xdr_buf.head. Don't count it twice.
	+ */
	+ if (sge_no > ctxt->count)
	+ atomic_dec(&rdma->sc_dma_used);
	+
	BUG_ON(sge_no > rdma->sc_max_sge);
	memset(&send_wr, 0, sizeof send_wr);
	ctxt->wr_op = IB_WR_SEND;