| /* |
| * linux/fs/nfs/direct.c |
| * |
| * High-performance direct I/O for the NFS client |
| * |
| * When an application requests uncached I/O, all read and write requests |
| * are made directly to the server; data stored or fetched via these |
| * requests is not cached in the Linux page cache. The client does not |
| * correct unaligned requests from applications. All requested bytes are |
| * held on permanent storage before a direct write system call returns to |
| * an application. Applications that manage their own data caching, such |
| * as databases, make very good use of direct I/O on local file systems. |
| * |
| * Solaris implements an uncached I/O facility called directio() that |
| * is used for backups and sequential I/O to very large files. Solaris |
| * also supports uncaching whole NFS partitions with "-o forcedirectio," |
| * an undocumented mount option. |
| * |
| * Note that I/O to read in executables (e.g. kernel_read) cannot use |
| * direct (kiobuf) reads because there is no vma backing the passed-in |
| * data buffer. |
| * |
| * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust. |
| * |
| * Initial implementation: 12/2001 by Chuck Lever <cel@netapp.com> |
| * |
| * TODO: |
| * |
| * 1. Use concurrent asynchronous network requests rather than |
| * serialized synchronous network requests for normal (non-sync) |
| * direct I/O. |
| */ |
| |
| #include <linux/config.h> |
| #include <linux/sched.h> |
| #include <linux/kernel.h> |
| #include <linux/file.h> |
| #include <linux/errno.h> |
| #include <linux/nfs_fs.h> |
| #include <linux/smp_lock.h> |
| #include <linux/sunrpc/clnt.h> |
| #include <linux/iobuf.h> |
| |
| #include <asm/system.h> |
| #include <asm/uaccess.h> |
| |
| #define NFSDBG_FACILITY (NFSDBG_PAGECACHE | NFSDBG_VFS) |
| #define VERF_SIZE (2 * sizeof(__u32)) |
| |
| static inline int |
| nfs_direct_read_rpc(struct file *file, struct nfs_readargs *arg) |
| { |
| int result; |
| struct inode * inode = file->f_dentry->d_inode; |
| struct nfs_fattr fattr; |
| struct rpc_message msg; |
| struct nfs_readres res = { &fattr, arg->count, 0 }; |
| |
| #ifdef CONFIG_NFS_V3 |
| msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? |
| NFS3PROC_READ : NFSPROC_READ; |
| #else |
| msg.rpc_proc = NFSPROC_READ; |
| #endif |
| msg.rpc_argp = arg; |
| msg.rpc_resp = &res; |
| |
| lock_kernel(); |
| msg.rpc_cred = nfs_file_cred(file); |
| fattr.valid = 0; |
| result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); |
| nfs_refresh_inode(inode, &fattr); |
| unlock_kernel(); |
| |
| return result; |
| } |
| |
| static inline int |
| nfs_direct_write_rpc(struct file *file, struct nfs_writeargs *arg, |
| struct nfs_writeverf *verf) |
| { |
| int result; |
| struct inode *inode = file->f_dentry->d_inode; |
| struct nfs_fattr fattr; |
| struct rpc_message msg; |
| struct nfs_writeres res = { &fattr, verf, 0 }; |
| |
| #ifdef CONFIG_NFS_V3 |
| msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? |
| NFS3PROC_WRITE : NFSPROC_WRITE; |
| #else |
| msg.rpc_proc = NFSPROC_WRITE; |
| #endif |
| msg.rpc_argp = arg; |
| msg.rpc_resp = &res; |
| |
| lock_kernel(); |
| msg.rpc_cred = get_rpccred(nfs_file_cred(file)); |
| fattr.valid = 0; |
| result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); |
| nfs_write_attributes(inode, &fattr); |
| put_rpccred(msg.rpc_cred); |
| unlock_kernel(); |
| |
| #ifdef CONFIG_NFS_V3 |
| if (NFS_PROTO(inode)->version == 3) { |
| if (result > 0) { |
| if ((arg->stable == NFS_FILE_SYNC) && |
| (verf->committed != NFS_FILE_SYNC)) { |
| printk(KERN_ERR |
| "%s: server didn't sync stable write request\n", |
| __FUNCTION__); |
| return -EIO; |
| } |
| |
| if (result != arg->count) { |
| printk(KERN_INFO |
| "%s: short write, count=%u, result=%d\n", |
| __FUNCTION__, arg->count, result); |
| } |
| } |
| return result; |
| } else { |
| #endif |
| verf->committed = NFS_FILE_SYNC; /* NFSv2 always syncs data */ |
| if (result == 0) |
| return arg->count; |
| return result; |
| #ifdef CONFIG_NFS_V3 |
| } |
| #endif |
| } |
| |
| #ifdef CONFIG_NFS_V3 |
| static inline int |
| nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count, |
| struct nfs_writeverf *verf) |
| { |
| int result; |
| struct nfs_fattr fattr; |
| struct nfs_writeargs arg = { NFS_FH(inode), offset, count, 0, 0, |
| NULL }; |
| struct nfs_writeres res = { &fattr, verf, 0 }; |
| struct rpc_message msg = { NFS3PROC_COMMIT, &arg, &res, NULL }; |
| |
| fattr.valid = 0; |
| |
| lock_kernel(); |
| result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); |
| nfs_write_attributes(inode, &fattr); |
| unlock_kernel(); |
| |
| return result; |
| } |
| #else |
| static inline int |
| nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count, |
| struct nfs_writeverf *verf) |
| { |
| return 0; |
| } |
| #endif |
| |
| /* |
| * Walk through the iobuf and create an iovec for each "rsize" bytes. |
| */ |
| static int |
| nfs_direct_read(struct file *file, struct kiobuf *iobuf, loff_t offset, |
| size_t count) |
| { |
| int curpage, total; |
| int result = 0; |
| struct inode *inode = file->f_dentry->d_inode; |
| int rsize = NFS_SERVER(inode)->rsize; |
| struct page *pages[NFS_READ_MAXIOV]; |
| struct nfs_readargs args = { NFS_FH(inode), offset, 0, iobuf->offset, |
| pages }; |
| |
| total = 0; |
| curpage = 0; |
| while (count) { |
| int len, request; |
| struct page **dest = pages; |
| |
| request = count; |
| if (count > rsize) |
| request = rsize; |
| args.count = request; |
| args.offset = offset; |
| args.pgbase = (iobuf->offset + total) & ~PAGE_MASK; |
| len = PAGE_SIZE - args.pgbase; |
| |
| do { |
| struct page *page = iobuf->maplist[curpage]; |
| |
| if (curpage >= iobuf->nr_pages || !page) { |
| result = -EFAULT; |
| goto out_err; |
| } |
| |
| *dest++ = page; |
| /* zero after the first iov */ |
| if (request < len) |
| break; |
| request -= len; |
| len = PAGE_SIZE; |
| curpage++; |
| } while (request != 0); |
| |
| result = nfs_direct_read_rpc(file, &args); |
| |
| if (result < 0) |
| break; |
| |
| total += result; |
| if (result < args.count) /* NFSv2ism */ |
| break; |
| count -= result; |
| offset += result; |
| }; |
| out_err: |
| if (!total) |
| return result; |
| return total; |
| } |
| |
| /* |
| * Walk through the iobuf and create an iovec for each "wsize" bytes. |
| * If only one network write is necessary, or if the O_SYNC flag or |
| * 'sync' mount option are present, or if this is a V2 inode, use |
| * FILE_SYNC. Otherwise, use UNSTABLE and finish with a COMMIT. |
| * |
| * The mechanics of this function are much the same as nfs_direct_read, |
| * with the added complexity of committing unstable writes. |
| */ |
| static int |
| nfs_direct_write(struct file *file, struct kiobuf *iobuf, |
| loff_t offset, size_t count) |
| { |
| int curpage, total; |
| int need_commit = 0; |
| int result = 0; |
| loff_t save_offset = offset; |
| struct inode *inode = file->f_dentry->d_inode; |
| int wsize = NFS_SERVER(inode)->wsize; |
| struct nfs_writeverf first_verf, ret_verf; |
| struct page *pages[NFS_WRITE_MAXIOV]; |
| struct nfs_writeargs args = { NFS_FH(inode), 0, 0, NFS_FILE_SYNC, 0, |
| pages }; |
| |
| #ifdef CONFIG_NFS_V3 |
| if ((NFS_PROTO(inode)->version == 3) && (count > wsize) && |
| (!IS_SYNC(inode))) |
| args.stable = NFS_UNSTABLE; |
| #endif |
| |
| retry: |
| total = 0; |
| curpage = 0; |
| while (count) { |
| int len, request; |
| struct page **dest = pages; |
| |
| request = count; |
| if (count > wsize) |
| request = wsize; |
| args.count = request; |
| args.offset = offset; |
| args.pgbase = (iobuf->offset + total) & ~PAGE_MASK; |
| len = PAGE_SIZE - args.pgbase; |
| |
| do { |
| struct page *page = iobuf->maplist[curpage]; |
| |
| if (curpage >= iobuf->nr_pages || !page) { |
| result = -EFAULT; |
| goto out_err; |
| } |
| |
| *dest++ = page; |
| /* zero after the first iov */ |
| if (request < len) |
| break; |
| request -= len; |
| len = PAGE_SIZE; |
| curpage++; |
| } while (request != 0); |
| |
| result = nfs_direct_write_rpc(file, &args, &ret_verf); |
| |
| if (result < 0) |
| break; |
| |
| if (!total) |
| memcpy(&first_verf.verifier, &ret_verf.verifier, |
| VERF_SIZE); |
| if (ret_verf.committed != NFS_FILE_SYNC) { |
| need_commit = 1; |
| if (memcmp(&first_verf.verifier, &ret_verf.verifier, |
| VERF_SIZE)) |
| goto print_retry; |
| } |
| |
| total += result; |
| count -= result; |
| offset += result; |
| }; |
| |
| out_err: |
| /* |
| * Commit data written so far, even in the event of an error |
| */ |
| if (need_commit) { |
| if (nfs_direct_commit_rpc(inode, save_offset, |
| iobuf->length - count, &ret_verf)) |
| goto print_retry; |
| if (memcmp(&first_verf.verifier, &ret_verf.verifier, |
| VERF_SIZE)) |
| goto print_retry; |
| } |
| |
| if (!total) |
| return result; |
| return total; |
| |
| print_retry: |
| printk(KERN_INFO "%s: detected server restart; retrying with FILE_SYNC\n", |
| __FUNCTION__); |
| args.stable = NFS_FILE_SYNC; |
| offset = save_offset; |
| count = iobuf->length; |
| goto retry; |
| } |
| |
| /* |
| * Read or write data, moving the data directly to/from the |
| * application's buffer without caching in the page cache. |
| * |
| * Rules for direct I/O |
| * |
| * 1. block size = 512 bytes or more |
| * 2. file byte offset is block aligned |
| * 3. byte count is a multiple of block size |
| * 4. user buffer is not aligned |
| * 5. user buffer is faulted in and pinned |
| * |
| * These are verified before we get here. |
| */ |
| int |
| nfs_direct_IO(int rw, struct file *file, struct kiobuf *iobuf, |
| unsigned long blocknr, int blocksize) |
| { |
| int result = -EINVAL; |
| size_t count = iobuf->length; |
| struct dentry *dentry = file->f_dentry; |
| struct inode *inode = dentry->d_inode; |
| loff_t offset = (loff_t) blocknr << inode->i_blkbits; |
| |
| switch (rw) { |
| case READ: |
| dfprintk(VFS, |
| "NFS: direct_IO(READ) (%s/%s) off/cnt(%Lu/%d)\n", |
| dentry->d_parent->d_name.name, |
| dentry->d_name.name, offset, count); |
| |
| result = nfs_direct_read(file, iobuf, offset, count); |
| break; |
| case WRITE: |
| dfprintk(VFS, |
| "NFS: direct_IO(WRITE) (%s/%s) off/cnt(%Lu/%d)\n", |
| dentry->d_parent->d_name.name, |
| dentry->d_name.name, offset, count); |
| |
| result = nfs_direct_write(file, iobuf, offset, count); |
| break; |
| default: |
| break; |
| } |
| |
| dfprintk(VFS, "NFS: direct_IO result = %d\n", result); |
| return result; |
| } |