Commit 40859d7e authored by Chuck Lever, committed by Trond Myklebust

NFS: support large reads and writes on the wire

 Most NFS server implementations allow up to 64KB reads and writes on the
 wire.  The Solaris NFS server allows up to a megabyte, for instance.

 Now the Linux NFS client supports transfer sizes up to 1MB, too.  This will
 help reduce protocol and context switch overhead on read/write intensive NFS
 workloads, and support larger atomic read and write operations on servers
 that support them.

 Test-plan:
 Connectathon and iozone on mount point with wsize=rsize>32768 over TCP.
 Tests with NFS over UDP to verify the maximum RPC payload size cap.
Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
parent 325cfed9
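Why the data structures change along with the size caps: with 4KB pages, a 1MB READ or WRITE spans 256 pages, far more than the old fixed nine-entry pagevec could describe, so the patch below sizes each request's page vector to the transfer. A userspace sketch of that arithmetic, assuming 4KB pages; the constants are copied from the diff rather than taken from any kernel header, and nothing here is kernel API:

/* Illustration only: mirrors the patch's page accounting in userspace. */
#include <stdio.h>

#define PAGE_CACHE_SHIFT      12                /* 4KB pages assumed */
#define PAGE_CACHE_SIZE       (1UL << PAGE_CACHE_SHIFT)
#define NFS_MAX_FILE_IO_SIZE  (1048576U)        /* new cap introduced below */
#define NFS_PAGEVEC_SIZE      (8U)              /* inline page_array[] slots */
#define NFS_READ_MAXIOV       (9U)              /* size of the old fixed pagevec */

int main(void)
{
	unsigned int rsize = NFS_MAX_FILE_IO_SIZE;
	/* same rounding nfs_sb_init() uses for server->rpages */
	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	printf("rsize=%u bytes covers %u pages per READ\n", rsize, rpages);
	printf("the old fixed pagevec held %u pages (%lu bytes at most)\n",
	       NFS_READ_MAXIOV, (unsigned long)NFS_READ_MAXIOV << PAGE_CACHE_SHIFT);
	printf("requests of %u pages or more now kmalloc() their pagevec\n",
	       NFS_PAGEVEC_SIZE);
	return 0;
}

Requests touching fewer than NFS_PAGEVEC_SIZE pages keep using the page_array[] embedded in the request; only larger ones pay for the extra kmalloc().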
@@ -154,6 +154,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int
struct list_head *list;
struct nfs_direct_req *dreq;
unsigned int reads = 0;
+unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
if (!dreq)
@@ -167,7 +168,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int
list = &dreq->list;
for(;;) {
-struct nfs_read_data *data = nfs_readdata_alloc();
+struct nfs_read_data *data = nfs_readdata_alloc(rpages);
if (unlikely(!data)) {
while (!list_empty(list)) {
@@ -431,7 +432,7 @@ static ssize_t nfs_direct_write_seg(struct inode *inode,
struct nfs_writeverf first_verf;
struct nfs_write_data *wdata;
-wdata = nfs_writedata_alloc();
+wdata = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
if (!wdata)
return -ENOMEM;
......
@@ -221,10 +221,10 @@ nfs_calc_block_size(u64 tsize)
static inline unsigned long
nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
{
-if (bsize < 1024)
-bsize = NFS_DEF_FILE_IO_BUFFER_SIZE;
-else if (bsize >= NFS_MAX_FILE_IO_BUFFER_SIZE)
-bsize = NFS_MAX_FILE_IO_BUFFER_SIZE;
+if (bsize < NFS_MIN_FILE_IO_SIZE)
+bsize = NFS_DEF_FILE_IO_SIZE;
+else if (bsize >= NFS_MAX_FILE_IO_SIZE)
+bsize = NFS_MAX_FILE_IO_SIZE;
return nfs_block_bits(bsize, nrbitsp);
}
@@ -307,20 +307,15 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
if (server->rsize > max_rpc_payload)
server->rsize = max_rpc_payload;
-if (server->wsize > max_rpc_payload)
-server->wsize = max_rpc_payload;
+if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-if (server->rpages > NFS_READ_MAXIOV) {
-server->rpages = NFS_READ_MAXIOV;
-server->rsize = server->rpages << PAGE_CACHE_SHIFT;
-}
+if (server->wsize > max_rpc_payload)
+server->wsize = max_rpc_payload;
+if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+server->wsize = NFS_MAX_FILE_IO_SIZE;
server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-if (server->wpages > NFS_WRITE_MAXIOV) {
-server->wpages = NFS_WRITE_MAXIOV;
-server->wsize = server->wpages << PAGE_CACHE_SHIFT;
-}
if (sb->s_blocksize == 0)
sb->s_blocksize = nfs_block_bits(server->wsize,
......
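To restate the ordering in the nfs_sb_init() hunk above: rsize and wsize are first capped by the transport's maximum RPC payload, then by NFS_MAX_FILE_IO_SIZE, and only then rounded up into a page count. A minimal userspace sketch of that order; the payload figures below are illustrative stand-ins for whatever rpc_max_payload() reports, not values taken from the RPC code:

#include <stdio.h>

#define PAGE_CACHE_SHIFT     12            /* 4KB pages assumed */
#define PAGE_CACHE_SIZE      (1UL << PAGE_CACHE_SHIFT)
#define NFS_MAX_FILE_IO_SIZE (1048576U)

/* mirrors the clamp order in nfs_sb_init(): transport cap, then global cap */
static unsigned int clamp_io_size(unsigned int requested, unsigned int max_rpc_payload)
{
	if (requested > max_rpc_payload)
		requested = max_rpc_payload;
	if (requested > NFS_MAX_FILE_IO_SIZE)
		requested = NFS_MAX_FILE_IO_SIZE;
	return requested;
}

int main(void)
{
	unsigned int want = 1048576;      /* rsize=1M requested at mount time */
	unsigned int big_payload = 2097152;   /* illustrative large transport payload */
	unsigned int small_payload = 32768;   /* illustrative small transport payload */
	unsigned int r;

	r = clamp_io_size(want, big_payload);
	printf("large payload: rsize=%u (%lu pages)\n", r,
	       (r + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT);
	r = clamp_io_size(want, small_payload);
	printf("small payload: rsize=%u (%lu pages)\n", r,
	       (r + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT);
	return 0;
}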
@@ -296,8 +296,8 @@ static int __init root_nfs_name(char *name)
nfs_port = -1;
nfs_data.version = NFS_MOUNT_VERSION;
nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */
-nfs_data.rsize = NFS_DEF_FILE_IO_BUFFER_SIZE;
-nfs_data.wsize = NFS_DEF_FILE_IO_BUFFER_SIZE;
+nfs_data.rsize = NFS_DEF_FILE_IO_SIZE;
+nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
nfs_data.acregmin = 3;
nfs_data.acregmax = 60;
nfs_data.acdirmin = 30;
......
@@ -83,7 +83,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
int result;
struct nfs_read_data *rdata;
-rdata = nfs_readdata_alloc();
+rdata = nfs_readdata_alloc(1);
if (!rdata)
return -ENOMEM;
@@ -283,7 +283,7 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
nbytes = req->wb_bytes;
for(;;) {
-data = nfs_readdata_alloc();
+data = nfs_readdata_alloc(1);
if (!data)
goto out_bad;
INIT_LIST_HEAD(&data->pages);
@@ -339,7 +339,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode)
if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
return nfs_pagein_multi(head, inode);
-data = nfs_readdata_alloc();
+data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages);
if (!data)
goto out_bad;
......
@@ -89,18 +89,33 @@ static mempool_t *nfs_commit_mempool;
static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
-static inline struct nfs_write_data *nfs_commit_alloc(void)
+static inline struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
{
struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
if (p) {
memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
+if (pagecount < NFS_PAGEVEC_SIZE)
+p->pagevec = &p->page_array[0];
+else {
+size_t size = ++pagecount * sizeof(struct page *);
+p->pagevec = kmalloc(size, GFP_NOFS);
+if (p->pagevec) {
+memset(p->pagevec, 0, size);
+} else {
+mempool_free(p, nfs_commit_mempool);
+p = NULL;
+}
+}
}
return p;
}
static inline void nfs_commit_free(struct nfs_write_data *p)
{
+if (p && (p->pagevec != &p->page_array[0]))
+kfree(p->pagevec);
mempool_free(p, nfs_commit_mempool);
}
@@ -167,7 +182,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
int result, written = 0;
struct nfs_write_data *wdata;
-wdata = nfs_writedata_alloc();
+wdata = nfs_writedata_alloc(1);
if (!wdata)
return -ENOMEM;
@@ -909,7 +924,7 @@ static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how)
nbytes = req->wb_bytes;
for (;;) {
-data = nfs_writedata_alloc();
+data = nfs_writedata_alloc(1);
if (!data)
goto out_bad;
list_add(&data->pages, &list);
@@ -973,7 +988,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how)
if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE)
return nfs_flush_multi(head, inode, how);
-data = nfs_writedata_alloc();
+data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
if (!data)
goto out_bad;
@@ -1241,12 +1256,12 @@ static void nfs_commit_rpcsetup(struct list_head *head,
* Commit dirty pages
*/
static int
-nfs_commit_list(struct list_head *head, int how)
+nfs_commit_list(struct inode *inode, struct list_head *head, int how)
{
struct nfs_write_data *data;
struct nfs_page *req;
-data = nfs_commit_alloc();
+data = nfs_commit_alloc(NFS_SERVER(inode)->wpages);
if (!data)
goto out_bad;
@@ -1351,7 +1366,7 @@ int nfs_commit_inode(struct inode *inode, int how)
res = nfs_scan_commit(inode, &head, 0, 0);
spin_unlock(&nfsi->req_lock);
if (res) {
-error = nfs_commit_list(&head, how);
+error = nfs_commit_list(inode, &head, how);
if (error < 0)
return error;
}
......
@@ -38,9 +38,6 @@
# define NFS_DEBUG
#endif
-#define NFS_MAX_FILE_IO_BUFFER_SIZE 32768
-#define NFS_DEF_FILE_IO_BUFFER_SIZE 4096
/* Default timeout values */
#define NFS_MAX_UDP_TIMEOUT (60*HZ)
#define NFS_MAX_TCP_TIMEOUT (600*HZ)
@@ -462,18 +459,33 @@ static inline int nfs_wb_page(struct inode *inode, struct page* page)
*/
extern mempool_t *nfs_wdata_mempool;
-static inline struct nfs_write_data *nfs_writedata_alloc(void)
+static inline struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
{
struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
if (p) {
memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
+if (pagecount < NFS_PAGEVEC_SIZE)
+p->pagevec = &p->page_array[0];
+else {
+size_t size = ++pagecount * sizeof(struct page *);
+p->pagevec = kmalloc(size, GFP_NOFS);
+if (p->pagevec) {
+memset(p->pagevec, 0, size);
+} else {
+mempool_free(p, nfs_wdata_mempool);
+p = NULL;
+}
+}
}
return p;
}
static inline void nfs_writedata_free(struct nfs_write_data *p)
{
+if (p && (p->pagevec != &p->page_array[0]))
+kfree(p->pagevec);
mempool_free(p, nfs_wdata_mempool);
}
@@ -492,16 +504,33 @@ extern void nfs_readdata_release(void *data);
*/
extern mempool_t *nfs_rdata_mempool;
-static inline struct nfs_read_data *nfs_readdata_alloc(void)
+static inline struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
{
struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
-if (p)
+if (p) {
memset(p, 0, sizeof(*p));
+INIT_LIST_HEAD(&p->pages);
+if (pagecount < NFS_PAGEVEC_SIZE)
+p->pagevec = &p->page_array[0];
+else {
+size_t size = ++pagecount * sizeof(struct page *);
+p->pagevec = kmalloc(size, GFP_NOFS);
+if (p->pagevec) {
+memset(p->pagevec, 0, size);
+} else {
+mempool_free(p, nfs_rdata_mempool);
+p = NULL;
+}
+}
+}
return p;
}
static inline void nfs_readdata_free(struct nfs_read_data *p)
{
+if (p && (p->pagevec != &p->page_array[0]))
+kfree(p->pagevec);
mempool_free(p, nfs_rdata_mempool);
}
......
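The allocators above pick between the page_array[] embedded in each request and a kmalloc()'d vector, based on the page count the caller passes in. A userspace sketch of the same pattern; the fake_* names are hypothetical and only the shape follows nfs_readdata_alloc()/nfs_readdata_free(), so nothing here is kernel API:

#include <stdio.h>
#include <stdlib.h>

#define NFS_PAGEVEC_SIZE (8U)

struct page;                              /* opaque stand-in for struct page */

struct fake_read_data {
	struct page **pagevec;            /* points at page_array[] or the heap */
	struct page *page_array[NFS_PAGEVEC_SIZE + 1];
};

static struct fake_read_data *fake_readdata_alloc(unsigned int pagecount)
{
	struct fake_read_data *p = calloc(1, sizeof(*p));

	if (!p)
		return NULL;
	if (pagecount < NFS_PAGEVEC_SIZE) {
		p->pagevec = &p->page_array[0];   /* small I/O: no extra allocation */
	} else {
		/* one extra slot, matching the ++pagecount in the patch */
		p->pagevec = calloc(pagecount + 1, sizeof(struct page *));
		if (!p->pagevec) {
			free(p);
			return NULL;
		}
	}
	return p;
}

static void fake_readdata_free(struct fake_read_data *p)
{
	if (p && p->pagevec != &p->page_array[0])
		free(p->pagevec);
	free(p);
}

int main(void)
{
	/* a 1MB read with 4KB pages needs 256 pages -> heap-allocated pagevec */
	struct fake_read_data *rd = fake_readdata_alloc(256);

	if (rd) {
		printf("pagevec is %s\n",
		       rd->pagevec == &rd->page_array[0] ? "inline" : "heap-allocated");
		fake_readdata_free(rd);
	}
	return 0;
}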
@@ -4,6 +4,16 @@
#include <linux/sunrpc/xprt.h>
#include <linux/nfsacl.h>
+/*
+* To change the maximum rsize and wsize supported by the NFS client, adjust
+* NFS_MAX_FILE_IO_SIZE. 64KB is a typical maximum, but some servers can
+* support a megabyte or more. The default is left at 4096 bytes, which is
+* reasonable for NFS over UDP.
+*/
+#define NFS_MAX_FILE_IO_SIZE (1048576U)
+#define NFS_DEF_FILE_IO_SIZE (4096U)
+#define NFS_MIN_FILE_IO_SIZE (1024U)
struct nfs4_fsid {
__u64 major;
__u64 minor;
@@ -215,12 +225,6 @@ struct nfs4_delegreturnargs {
/*
* Arguments to the read call.
*/
-#define NFS_READ_MAXIOV (9U)
-#if (NFS_READ_MAXIOV > (MAX_IOVEC -2))
-#error "NFS_READ_MAXIOV is too large"
-#endif
struct nfs_readargs {
struct nfs_fh * fh;
struct nfs_open_context *context;
@@ -239,11 +243,6 @@ struct nfs_readres {
/*
* Arguments to the write call.
*/
-#define NFS_WRITE_MAXIOV (9U)
-#if (NFS_WRITE_MAXIOV > (MAX_IOVEC -2))
-#error "NFS_WRITE_MAXIOV is too large"
-#endif
struct nfs_writeargs {
struct nfs_fh * fh;
struct nfs_open_context *context;
@@ -674,6 +673,8 @@ struct nfs4_server_caps_res {
struct nfs_page;
+#define NFS_PAGEVEC_SIZE (8U)
struct nfs_read_data {
int flags;
struct rpc_task task;
@@ -682,13 +683,14 @@ struct nfs_read_data {
struct nfs_fattr fattr; /* fattr storage */
struct list_head pages; /* Coalesced read requests */
struct nfs_page *req; /* multi ops per nfs_page */
-struct page *pagevec[NFS_READ_MAXIOV];
+struct page **pagevec;
struct nfs_readargs args;
struct nfs_readres res;
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
#endif
void (*complete) (struct nfs_read_data *, int);
+struct page *page_array[NFS_PAGEVEC_SIZE + 1];
};
struct nfs_write_data {
@@ -700,13 +702,14 @@ struct nfs_write_data {
struct nfs_writeverf verf;
struct list_head pages; /* Coalesced requests we wish to flush */
struct nfs_page *req; /* multi ops per nfs_page */
-struct page *pagevec[NFS_WRITE_MAXIOV];
+struct page **pagevec;
struct nfs_writeargs args; /* argument struct */
struct nfs_writeres res; /* result struct */
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
#endif
void (*complete) (struct nfs_write_data *, int);
+struct page *page_array[NFS_PAGEVEC_SIZE + 1];
};
struct nfs_access_entry;
......
@@ -134,11 +134,6 @@ xdr_adjust_iovec(struct kvec *iov, u32 *p)
return iov->iov_len = ((u8 *) p - (u8 *) iov->iov_base);
}
-/*
-* Maximum number of iov's we use.
-*/
-#define MAX_IOVEC (12)
/*
* XDR buffer helper functions
*/
......