| From sjayaraman@suse.de Wed Dec 3 08:51:11 2008 |
| From: Steve French <sfrench@us.ibm.com> |
| Date: Fri, 21 Nov 2008 12:58:40 +0530 |
| Subject: cifs: Reduce number of socket retries in large write path |
| To: stable@kernel.org |
| Cc: Steve French <smfrench@gmail.com>, Jeff Layton <jlayton@redhat.com>, Shirish S Pargaonkar <shirishp@us.ibm.com> |
| Message-ID: <49266328.4000409@suse.de> |
| |
| From: Steve French <sfrench@us.ibm.com> |
| |
| Backport of upstream commit edf1ae403896cb7750800508b14996ba6be39a53 |
| for -stable. |
| |
| [CIFS] Reduce number of socket retries in large write path |
| |
| CIFS in some heavy stress conditions cifs could get EAGAIN |
| repeatedly in smb_send2 which led to repeated retries and eventually |
| failure of large writes which could lead to data corruption. |
| |
| There are three changes that were suggested by various network |
| developers: |
| |
| 1) convert cifs from non-blocking to blocking tcp sendmsg |
| (we left in the retry on failure) |
| 2) change cifs to not set sendbuf and rcvbuf size for the socket |
| (let tcp autotune the buffer sizes since that works much better |
| in the TCP stack now) |
| 3) if we have a partial frame sent in smb_send2, mark the tcp |
| session as invalid (close the socket and reconnect) so we do |
| not corrupt the remaining part of the SMB with the beginning |
| of the next SMB. |
| |
| This does not appear to hurt performance measurably and has |
| been run in various scenarios, but it definately removes |
| a corruption that we were seeing in some high stress |
| test cases. |
| |
| Acked-by: Shirish Pargaonkar <shirishp@us.ibm.com> |
| Signed-off-by: Steve French <sfrench@us.ibm.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> |
| |
| --- |
| fs/cifs/cifsglob.h | 2 + |
| fs/cifs/cifsproto.h | 2 - |
| fs/cifs/connect.c | 58 ++++++++++++++++++++++++++++++++++++++-------------- |
| fs/cifs/transport.c | 41 +++++++++++++++++++++++++++--------- |
| 4 files changed, 77 insertions(+), 26 deletions(-) |
| |
| --- a/fs/cifs/cifsglob.h |
| +++ b/fs/cifs/cifsglob.h |
| @@ -141,6 +141,8 @@ struct TCP_Server_Info { |
| char versionMajor; |
| char versionMinor; |
| bool svlocal:1; /* local server or remote */ |
| + bool noblocksnd; /* use blocking sendmsg */ |
| + bool noautotune; /* do not autotune send buf sizes */ |
| atomic_t socketUseCount; /* number of open cifs sessions on socket */ |
| atomic_t inFlight; /* number of requests on the wire to server */ |
| #ifdef CONFIG_CIFS_STATS2 |
| --- a/fs/cifs/cifsproto.h |
| +++ b/fs/cifs/cifsproto.h |
| @@ -36,7 +36,7 @@ extern void cifs_buf_release(void *); |
| extern struct smb_hdr *cifs_small_buf_get(void); |
| extern void cifs_small_buf_release(void *); |
| extern int smb_send(struct socket *, struct smb_hdr *, |
| - unsigned int /* length */ , struct sockaddr *); |
| + unsigned int /* length */ , struct sockaddr *, bool); |
| extern unsigned int _GetXid(void); |
| extern void _FreeXid(unsigned int); |
| #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current->fsuid)); |
| --- a/fs/cifs/connect.c |
| +++ b/fs/cifs/connect.c |
| @@ -90,6 +90,8 @@ struct smb_vol { |
| bool nocase:1; /* request case insensitive filenames */ |
| bool nobrl:1; /* disable sending byte range locks to srv */ |
| bool seal:1; /* request transport encryption on share */ |
| + bool noblocksnd:1; |
| + bool noautotune:1; |
| unsigned int rsize; |
| unsigned int wsize; |
| unsigned int sockopt; |
| @@ -100,9 +102,11 @@ struct smb_vol { |
| static int ipv4_connect(struct sockaddr_in *psin_server, |
| struct socket **csocket, |
| char *netb_name, |
| - char *server_netb_name); |
| + char *server_netb_name, |
| + bool noblocksnd, |
| + bool nosndbuf); /* ipv6 never set sndbuf size */ |
| static int ipv6_connect(struct sockaddr_in6 *psin_server, |
| - struct socket **csocket); |
| + struct socket **csocket, bool noblocksnd); |
| |
| |
| /* |
| @@ -188,12 +192,13 @@ cifs_reconnect(struct TCP_Server_Info *s |
| try_to_freeze(); |
| if (server->protocolType == IPV6) { |
| rc = ipv6_connect(&server->addr.sockAddr6, |
| - &server->ssocket); |
| + &server->ssocket, server->noautotune); |
| } else { |
| rc = ipv4_connect(&server->addr.sockAddr, |
| &server->ssocket, |
| server->workstation_RFC1001_name, |
| - server->server_RFC1001_name); |
| + server->server_RFC1001_name, |
| + server->noblocksnd, server->noautotune); |
| } |
| if (rc) { |
| cFYI(1, ("reconnect error %d", rc)); |
| @@ -409,8 +414,14 @@ incomplete_rcv: |
| msleep(1); /* minimum sleep to prevent looping |
| allowing socket to clear and app threads to set |
| tcpStatus CifsNeedReconnect if server hung */ |
| - if (pdu_length < 4) |
| + if (pdu_length < 4) { |
| + iov.iov_base = (4 - pdu_length) + |
| + (char *)smb_buffer; |
| + iov.iov_len = pdu_length; |
| + smb_msg.msg_control = NULL; |
| + smb_msg.msg_controllen = 0; |
| goto incomplete_rcv; |
| + } |
| else |
| continue; |
| } else if (length <= 0) { |
| @@ -1186,6 +1197,10 @@ cifs_parse_mount_options(char *options, |
| /* ignore */ |
| } else if (strnicmp(data, "rw", 2) == 0) { |
| vol->rw = true; |
| + } else if (strnicmp(data, "noblocksnd", 11) == 0) { |
| + vol->noblocksnd = true; |
| + } else if (strnicmp(data, "noautotune", 10) == 0) { |
| + vol->noautotune = true; |
| } else if ((strnicmp(data, "suid", 4) == 0) || |
| (strnicmp(data, "nosuid", 6) == 0) || |
| (strnicmp(data, "exec", 4) == 0) || |
| @@ -1506,7 +1521,8 @@ static void rfc1002mangle(char *target, |
| |
| static int |
| ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, |
| - char *netbios_name, char *target_name) |
| + char *netbios_name, char *target_name, |
| + bool noblocksnd, bool noautotune) |
| { |
| int rc = 0; |
| int connected = 0; |
| @@ -1578,11 +1594,15 @@ ipv4_connect(struct sockaddr_in *psin_se |
| (*csocket)->sk->sk_sndbuf, |
| (*csocket)->sk->sk_rcvbuf, (*csocket)->sk->sk_rcvtimeo)); |
| (*csocket)->sk->sk_rcvtimeo = 7 * HZ; |
| + if (!noblocksnd) |
| + (*csocket)->sk->sk_sndtimeo = 3 * HZ; |
| /* make the bufsizes depend on wsize/rsize and max requests */ |
| - if ((*csocket)->sk->sk_sndbuf < (200 * 1024)) |
| - (*csocket)->sk->sk_sndbuf = 200 * 1024; |
| - if ((*csocket)->sk->sk_rcvbuf < (140 * 1024)) |
| - (*csocket)->sk->sk_rcvbuf = 140 * 1024; |
| + if (noautotune) { |
| + if ((*csocket)->sk->sk_sndbuf < (200 * 1024)) |
| + (*csocket)->sk->sk_sndbuf = 200 * 1024; |
| + if ((*csocket)->sk->sk_rcvbuf < (140 * 1024)) |
| + (*csocket)->sk->sk_rcvbuf = 140 * 1024; |
| + } |
| |
| /* send RFC1001 sessinit */ |
| if (psin_server->sin_port == htons(RFC1001_PORT)) { |
| @@ -1619,7 +1639,7 @@ ipv4_connect(struct sockaddr_in *psin_se |
| /* sizeof RFC1002_SESSION_REQUEST with no scope */ |
| smb_buf->smb_buf_length = 0x81000044; |
| rc = smb_send(*csocket, smb_buf, 0x44, |
| - (struct sockaddr *)psin_server); |
| + (struct sockaddr *)psin_server, noblocksnd); |
| kfree(ses_init_buf); |
| msleep(1); /* RFC1001 layer in at least one server |
| requires very short break before negprot |
| @@ -1639,7 +1659,8 @@ ipv4_connect(struct sockaddr_in *psin_se |
| } |
| |
| static int |
| -ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket) |
| +ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket, |
| + bool noblocksnd) |
| { |
| int rc = 0; |
| int connected = 0; |
| @@ -1708,6 +1729,8 @@ ipv6_connect(struct sockaddr_in6 *psin_s |
| the default. sock_setsockopt not used because it expects |
| user space buffer */ |
| (*csocket)->sk->sk_rcvtimeo = 7 * HZ; |
| + if (!noblocksnd) |
| + (*csocket)->sk->sk_sndtimeo = 3 * HZ; |
| |
| return rc; |
| } |
| @@ -1961,11 +1984,14 @@ cifs_mount(struct super_block *sb, struc |
| cFYI(1, ("attempting ipv6 connect")); |
| /* BB should we allow ipv6 on port 139? */ |
| /* other OS never observed in Wild doing 139 with v6 */ |
| - rc = ipv6_connect(&sin_server6, &csocket); |
| + rc = ipv6_connect(&sin_server6, &csocket, |
| + volume_info.noblocksnd); |
| } else |
| rc = ipv4_connect(&sin_server, &csocket, |
| - volume_info.source_rfc1001_name, |
| - volume_info.target_rfc1001_name); |
| + volume_info.source_rfc1001_name, |
| + volume_info.target_rfc1001_name, |
| + volume_info.noblocksnd, |
| + volume_info.noautotune); |
| if (rc < 0) { |
| cERROR(1, ("Error connecting to IPv4 socket. " |
| "Aborting operation")); |
| @@ -1980,6 +2006,8 @@ cifs_mount(struct super_block *sb, struc |
| sock_release(csocket); |
| goto out; |
| } else { |
| + srvTcp->noblocksnd = volume_info.noblocksnd; |
| + srvTcp->noautotune = volume_info.noautotune; |
| memcpy(&srvTcp->addr.sockAddr, &sin_server, |
| sizeof(struct sockaddr_in)); |
| atomic_set(&srvTcp->inFlight, 0); |
| --- a/fs/cifs/transport.c |
| +++ b/fs/cifs/transport.c |
| @@ -162,7 +162,7 @@ void DeleteTconOplockQEntries(struct cif |
| |
| int |
| smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer, |
| - unsigned int smb_buf_length, struct sockaddr *sin) |
| + unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd) |
| { |
| int rc = 0; |
| int i = 0; |
| @@ -179,7 +179,10 @@ smb_send(struct socket *ssocket, struct |
| smb_msg.msg_namelen = sizeof(struct sockaddr); |
| smb_msg.msg_control = NULL; |
| smb_msg.msg_controllen = 0; |
| - smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/ |
| + if (noblocksnd) |
| + smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; |
| + else |
| + smb_msg.msg_flags = MSG_NOSIGNAL; |
| |
| /* smb header is converted in header_assemble. bcc and rest of SMB word |
| area, and byte area if necessary, is converted to littleendian in |
| @@ -230,8 +233,8 @@ smb_send(struct socket *ssocket, struct |
| } |
| |
| static int |
| -smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec, |
| - struct sockaddr *sin) |
| +smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec, |
| + struct sockaddr *sin, bool noblocksnd) |
| { |
| int rc = 0; |
| int i = 0; |
| @@ -241,6 +244,7 @@ smb_send2(struct socket *ssocket, struct |
| unsigned int total_len; |
| int first_vec = 0; |
| unsigned int smb_buf_length = smb_buffer->smb_buf_length; |
| + struct socket *ssocket = server->ssocket; |
| |
| if (ssocket == NULL) |
| return -ENOTSOCK; /* BB eventually add reconnect code here */ |
| @@ -249,7 +253,10 @@ smb_send2(struct socket *ssocket, struct |
| smb_msg.msg_namelen = sizeof(struct sockaddr); |
| smb_msg.msg_control = NULL; |
| smb_msg.msg_controllen = 0; |
| - smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/ |
| + if (noblocksnd) |
| + smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; |
| + else |
| + smb_msg.msg_flags = MSG_NOSIGNAL; |
| |
| /* smb header is converted in header_assemble. bcc and rest of SMB word |
| area, and byte area if necessary, is converted to littleendian in |
| @@ -313,6 +320,16 @@ smb_send2(struct socket *ssocket, struct |
| i = 0; /* in case we get ENOSPC on the next send */ |
| } |
| |
| + if ((total_len > 0) && (total_len != smb_buf_length + 4)) { |
| + cFYI(1, ("partial send (%d remaining), terminating session", |
| + total_len)); |
| + /* If we have only sent part of an SMB then the next SMB |
| + could be taken as the remainder of this one. We need |
| + to kill the socket so the server throws away the partial |
| + SMB */ |
| + server->tcpStatus = CifsNeedReconnect; |
| + } |
| + |
| if (rc < 0) { |
| cERROR(1, ("Error %d sending data on socket to server", rc)); |
| } else |
| @@ -519,8 +536,9 @@ SendReceive2(const unsigned int xid, str |
| #ifdef CONFIG_CIFS_STATS2 |
| atomic_inc(&ses->server->inSend); |
| #endif |
| - rc = smb_send2(ses->server->ssocket, iov, n_vec, |
| - (struct sockaddr *) &(ses->server->addr.sockAddr)); |
| + rc = smb_send2(ses->server, iov, n_vec, |
| + (struct sockaddr *) &(ses->server->addr.sockAddr), |
| + ses->server->noblocksnd); |
| #ifdef CONFIG_CIFS_STATS2 |
| atomic_dec(&ses->server->inSend); |
| midQ->when_sent = jiffies; |
| @@ -712,7 +730,8 @@ SendReceive(const unsigned int xid, stru |
| atomic_inc(&ses->server->inSend); |
| #endif |
| rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, |
| - (struct sockaddr *) &(ses->server->addr.sockAddr)); |
| + (struct sockaddr *) &(ses->server->addr.sockAddr), |
| + ses->server->noblocksnd); |
| #ifdef CONFIG_CIFS_STATS2 |
| atomic_dec(&ses->server->inSend); |
| midQ->when_sent = jiffies; |
| @@ -852,7 +871,8 @@ send_nt_cancel(struct cifsTconInfo *tcon |
| return rc; |
| } |
| rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, |
| - (struct sockaddr *) &(ses->server->addr.sockAddr)); |
| + (struct sockaddr *) &(ses->server->addr.sockAddr), |
| + ses->server->noblocksnd); |
| up(&ses->server->tcpSem); |
| return rc; |
| } |
| @@ -942,7 +962,8 @@ SendReceiveBlockingLock(const unsigned i |
| atomic_inc(&ses->server->inSend); |
| #endif |
| rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, |
| - (struct sockaddr *) &(ses->server->addr.sockAddr)); |
| + (struct sockaddr *) &(ses->server->addr.sockAddr), |
| + ses->server->noblocksnd); |
| #ifdef CONFIG_CIFS_STATS2 |
| atomic_dec(&ses->server->inSend); |
| midQ->when_sent = jiffies; |