Commit c03e7607 authored by Chuck Lever, committed by Linus Torvalds

[PATCH] initial support for NFS direct I/O for 2.5

This adds initial support for NFS direct I/O in the 2.5 kernel.  Many
have asked for this support to be included in 2.5.  This patch does not
provide working NFS direct I/O, but I'm sending what I have now so that
it can be included before October 20.

NFS direct I/O is enabled by its very own kernel config option.  When
enabled, the NFS client won't build, to prevent people from using this and
possibly corrupting their NFS files.  Later I will send a patch that
finishes the implementation.

[ Config option currently disabled ]
parent eb582eba
...@@ -514,6 +514,27 @@ CONFIG_NFS_V3 ...@@ -514,6 +514,27 @@ CONFIG_NFS_V3
If unsure, say N. If unsure, say N.
CONFIG_NFS_DIRECTIO
This option enables applications to perform uncached I/O on files
in NFS file systems using the O_DIRECT open() flag. When O_DIRECT
is set for a file, its data is not cached in the system's page
cache. Data is moved to and from user-level application buffers
directly. Unlike local disk-based file systems, NFS O_DIRECT has
no alignment restrictions.
Unless your program is designed to use O_DIRECT properly, you are
much better off allowing the NFS client to manage data caching for
you. Misusing O_DIRECT can cause poor server performance or network
storms. This kernel build option defaults OFF to avoid exposing
system administrators unwittingly to a potentially hazardous
feature.
For more details on NFS O_DIRECT, see fs/nfs/direct.c.
If unsure, say N. This reduces the size of the NFS client, and
causes open() to return EINVAL if a file residing in NFS is
opened with the O_DIRECT flag.
CONFIG_ROOT_NFS CONFIG_ROOT_NFS
If you want your Linux box to mount its whole root file system (the If you want your Linux box to mount its whole root file system (the
one containing the directory /) from some other computer over the one containing the directory /) from some other computer over the
......
...@@ -8,6 +8,7 @@ nfs-y := dir.o file.o flushd.o inode.o nfs2xdr.o pagelist.o \ ...@@ -8,6 +8,7 @@ nfs-y := dir.o file.o flushd.o inode.o nfs2xdr.o pagelist.o \
proc.o read.o symlink.o unlink.o write.o proc.o read.o symlink.o unlink.o write.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o
nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
nfs-objs := $(nfs-y) nfs-objs := $(nfs-y)
include $(TOPDIR)/Rules.make include $(TOPDIR)/Rules.make
/*
* linux/fs/nfs/direct.c
*
* Copyright (C) 2001 by Chuck Lever <cel@netapp.com>
*
* High-performance uncached I/O for the Linux NFS client
*
* There are important applications whose performance or correctness
* depends on uncached access to file data. Database clusters
* (multiple copies of the same instance running on separate hosts)
* implement their own cache coherency protocol that subsumes file
* system cache protocols. Applications that process datasets
* considerably larger than the client's memory do not always benefit
* from a local cache. A streaming video server, for instance, has no
* need to cache the contents of a file.
*
* When an application requests uncached I/O, all read and write requests
* are made directly to the server; data stored or fetched via these
* requests is not cached in the Linux page cache. The client does not
* correct unaligned requests from applications. All requested bytes are
* held on permanent storage before a direct write system call returns to
* an application.
*
* Solaris implements an uncached I/O facility called directio() that
* is used for backups and sequential I/O to very large files. Solaris
* also supports uncaching whole NFS partitions with "-o forcedirectio,"
* an undocumented mount option.
*
* Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust.
*
* 18 Dec 2001 Initial implementation for 2.4 --cel
* 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
* 24 Sep 2002 Rewrite to use asynchronous RPCs, port to 2.5 --cel
*
*/
#include <linux/config.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/errno.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#define NFSDBG_FACILITY (NFSDBG_PAGECACHE | NFSDBG_VFS)
#define VERF_SIZE (2 * sizeof(__u32))
/**
 * nfs_get_user_pages - pin the user pages that back an I/O buffer
 * @addr: user-space address of target buffer
 * @size: total size in bytes of target buffer
 * @pages: returned array of page struct pointers underlying target buffer
 * @rw: direction of the file operation (READ or WRITE)
 *
 * Returns the number of pages pinned, or a negative errno.  On success
 * the caller owns both the array stored in *@pages and the references
 * on the pages in it.  On failure *@pages is set to NULL and nothing is
 * left pinned or allocated.
 */
static inline int
nfs_get_user_pages(unsigned long addr, size_t size,
		   struct page ***pages, int rw)
{
	int result;
	unsigned long page_count = 0;

	*pages = NULL;

	/* reject buffers that wrap around the end of the address space */
	if (size > 0 && addr + (size - 1) < addr)
		return -EFAULT;

	/*
	 * Count every page the buffer touches: a buffer that is not
	 * page-aligned, or is shorter than a page, still needs the pages
	 * holding its first and last bytes.
	 */
	if (size > 0)
		page_count = ((addr + (size - 1)) >> PAGE_SHIFT) -
				(addr >> PAGE_SHIFT) + 1;

	*pages = kmalloc(page_count * sizeof(struct page *), GFP_KERNEL);
	if (!*pages)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	/*
	 * A file READ stores data into the user buffer, so READ is the
	 * direction that needs the pages to be writable.
	 */
	result = get_user_pages(current, current->mm, addr,
				page_count, (rw == READ), 0,
				*pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (result >= 0 && (unsigned long) result < page_count) {
		/* partial pin: drop what we got and fail the whole buffer */
		while (result > 0)
			page_cache_release((*pages)[--result]);
		result = -EFAULT;
	}

	if (result < 0) {
		printk(KERN_ERR "%s: get_user_pages result %d\n",
			__FUNCTION__, result);
		/* don't leak the pointer array when nothing was pinned */
		kfree(*pages);
		*pages = NULL;
	}
	return result;
}
/**
 * nfs_free_user_pages - tear down page struct array
 * @pages: array of page struct pointers underlying target buffer
 * @count: number of leading entries of @pages to release
 *
 * Calls page_cache_release() on the first @count entries of @pages,
 * then frees the array itself.  A @count of zero frees only the array.
 */
static inline void
nfs_free_user_pages(struct page **pages, unsigned count)
{
	unsigned i;

	for (i = 0; i < count; i++)
		page_cache_release(pages[i]);

	kfree(pages);
}
/**
 * nfs_iov2pagelist - convert an array of iovecs to a list of page requests
 * @rw: direction (read or write)
 * @inode: inode of target file
 * @cred: credentials of user who requested I/O
 * @iov: array of vectors that define I/O buffer
 * @offset: where in file to begin the operation
 * @nr_segs: size of iovec array
 * @requests: append new page requests to this list head
 *
 * Returns the total number of bytes covered by the queued requests, or
 * a negative errno.  On error every request already placed on @requests
 * has been disposed of with nfs_release_list().
 */
static int
nfs_iov2pagelist(int rw, const struct inode *inode,
		 const struct rpc_cred *cred,
		 const struct iovec *iov, loff_t offset,
		 unsigned long nr_segs, struct list_head *requests)
{
	unsigned long seg;
	int tot_bytes = 0;
	struct page **pages;

	/* for each iovec in the array... */
	for (seg = 0; seg < nr_segs; seg++) {
		const unsigned long user_addr =
					(unsigned long) iov[seg].iov_base;
		size_t bytes = iov[seg].iov_len;
		unsigned int pg_offset = (user_addr & ~PAGE_MASK);
		int page_count, page = 0;

		page_count = nfs_get_user_pages(user_addr, bytes, &pages, rw);
		if (page_count < 0) {
			nfs_release_list(requests);
			return page_count;
		}

		/* ...build as many page requests as required */
		while (bytes > 0) {
			struct nfs_page *new;
			/*
			 * A request must not run past the end of its page:
			 * the first page of an unaligned buffer only has
			 * room for PAGE_SIZE - pg_offset bytes.
			 */
			unsigned int pg_bytes = PAGE_SIZE - pg_offset;

			if (pg_bytes > bytes)
				pg_bytes = bytes;

			/* never index past the pages that were pinned */
			if (page >= page_count) {
				nfs_free_user_pages(pages, page_count);
				nfs_release_list(requests);
				return -EFAULT;
			}

			new = nfs_create_request((struct rpc_cred *) cred,
						 (struct inode *) inode,
						 pages[page],
						 pg_offset, pg_bytes);
			if (IS_ERR(new)) {
				nfs_free_user_pages(pages, page_count);
				nfs_release_list(requests);
				return PTR_ERR(new);
			}
			/*
			 * NOTE(review): this stores a byte offset in a field
			 * named wb_index -- confirm what the I/O paths expect.
			 */
			new->wb_index = offset;
			nfs_list_add_request(new, requests);

			/* after the first page */
			pg_offset = 0;
			/* the file position advances by what was queued */
			offset += pg_bytes;
			tot_bytes += pg_bytes;
			bytes -= pg_bytes;
			page++;
		}

		/* don't release pages here -- I/O completion will do that */
		nfs_free_user_pages(pages, 0);
	}

	return tot_bytes;
}
/**
 * do_nfs_direct_IO - Read or write data without caching
 * @rw: direction (read or write)
 * @inode: inode of target file
 * @cred: credentials of user who requested I/O
 * @iov: array of vectors that define I/O buffer
 * @offset: where in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * Break the passed-in iovec into a series of page-sized or smaller
 * requests, where each page is mapped for direct user-land I/O.
 *
 * For each of these pages, create an NFS page request and
 * append it to an automatic list of page requests.
 *
 * When all page requests have been queued, start the I/O on the
 * whole list.  The underlying routines coalesce the pages on the
 * list into a bunch of asynchronous "r/wsize" network requests.
 *
 * I/O completion automatically unmaps and releases the pages.
 *
 * Returns the number of bytes covered by the request list, or a
 * negative errno.
 */
static int
do_nfs_direct_IO(int rw, const struct inode *inode,
		 const struct rpc_cred *cred, const struct iovec *iov,
		 loff_t offset, unsigned long nr_segs)
{
	LIST_HEAD(requests);
	int result, tot_bytes;

	result = nfs_iov2pagelist(rw, inode, cred, iov, offset, nr_segs,
				  &requests);
	if (result < 0)
		return result;
	tot_bytes = result;

	switch (rw) {
	case READ:
		if (IS_SYNC(inode) || (NFS_SERVER(inode)->rsize < PAGE_SIZE)) {
			/*
			 * The synchronous path is handed the iovec, not
			 * the request list, so dispose of the list here
			 * rather than leaking it when we return.
			 */
			nfs_release_list(&requests);
			result = nfs_direct_read_sync(inode, cred, iov, offset, nr_segs);
			break;
		}
		result = nfs_pagein_list(&requests, NFS_SERVER(inode)->rpages);
		nfs_wait_for_reads(&requests);
		break;
	case WRITE:
		if (IS_SYNC(inode) || (NFS_SERVER(inode)->wsize < PAGE_SIZE)) {
			/* as for READ: the list is unused on this path */
			nfs_release_list(&requests);
			result = nfs_direct_write_sync(inode, cred, iov, offset, nr_segs);
		} else
			result = nfs_flush_list(&requests,
					NFS_SERVER(inode)->wpages, FLUSH_WAIT);

		/* invalidate cache so non-direct readers pick up changes */
		invalidate_inode_pages((struct inode *) inode);
		break;
	default:
		/* unknown direction: nothing will consume the list */
		nfs_release_list(&requests);
		result = -EINVAL;
		break;
	}

	if (result < 0)
		return result;
	return tot_bytes;
}
/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @file: file struct of target file
 * @iov: array of vectors that define I/O buffer
 * @offset: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The inode's i_sem is no longer held by the VFS layer before it calls
 * this function to do a write.
 *
 * Returns whatever do_nfs_direct_IO() returns for the same arguments.
 */
int
nfs_direct_IO(int rw, struct file *file, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
/*
 * None of this works yet, so prevent it from compiling.
 *
 * Only the declarations are compiled out; the statements after the
 * #endif still use result, dentry, inode and cred, so this function
 * deliberately fails to build as written.
 */
#if 0
int result;
struct dentry *dentry = file->f_dentry;
const struct inode *inode = dentry->d_inode->i_mapping->host;
const struct rpc_cred *cred = nfs_file_cred(file);
#endif
/* debug trace: direction, parent/name of the file, offset and segment count */
dfprintk(VFS, "NFS: direct_IO(%s) (%s/%s) off/no(%Lu/%lu)\n",
((rw == READ) ? "READ" : "WRITE"),
dentry->d_parent->d_name.name,
dentry->d_name.name, offset, nr_segs);
result = do_nfs_direct_IO(rw, inode, cred, iov, offset, nr_segs);
dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
return result;
}
...@@ -199,7 +199,10 @@ struct address_space_operations nfs_file_aops = { ...@@ -199,7 +199,10 @@ struct address_space_operations nfs_file_aops = {
.sync_page = nfs_sync_page, .sync_page = nfs_sync_page,
.writepage = nfs_writepage, .writepage = nfs_writepage,
.prepare_write = nfs_prepare_write, .prepare_write = nfs_prepare_write,
.commit_write = nfs_commit_write .commit_write = nfs_commit_write,
#ifdef CONFIG_NFS_DIRECTIO
.direct_IO = nfs_direct_IO,
#endif
}; };
/* /*
......
...@@ -175,6 +175,26 @@ nfs_release_request(struct nfs_page *req) ...@@ -175,6 +175,26 @@ nfs_release_request(struct nfs_page *req)
nfs_page_free(req); nfs_page_free(req);
} }
/**
* nfs_release_list - cleanly dispose of an unattached list of page requests
* @list: list of doomed page requests
*/
void
nfs_release_list(struct list_head *list)
{
while (!list_empty(list)) {
struct nfs_page *req = nfs_list_entry(list);
nfs_list_remove_request(req);
page_cache_release(req->wb_page);
/* Release struct file or cached credential */
nfs_clear_request(req);
nfs_page_free(req);
}
}
/** /**
* nfs_list_add_request - Insert a request into a sorted list * nfs_list_add_request - Insert a request into a sorted list
* @req: request * @req: request
...@@ -223,6 +243,37 @@ nfs_wait_on_request(struct nfs_page *req) ...@@ -223,6 +243,37 @@ nfs_wait_on_request(struct nfs_page *req)
return nfs_wait_event(clnt, req->wb_wait, !NFS_WBACK_BUSY(req)); return nfs_wait_event(clnt, req->wb_wait, !NFS_WBACK_BUSY(req));
} }
/**
 * nfs_wait_for_reads - wait for outstanding requests to complete
 * @head: list of page requests to wait for
 *
 * Walks @head; each request that is still busy is waited on, then
 * removed from the list, cleared and freed.  Requests that are not busy
 * are left on the list untouched.
 *
 * Returns the number of requests waited for, or the negative error
 * returned by nfs_wait_on_request().
 */
int
nfs_wait_for_reads(struct list_head *head)
{
	struct list_head *p = head->next;
	unsigned int res = 0;

	while (p != head) {
		struct nfs_page *req = nfs_list_entry(p);
		int error;

		if (!NFS_WBACK_BUSY(req)) {
			/*
			 * Step past this entry; a bare "continue" would
			 * re-test the same request forever.
			 */
			p = p->next;
			continue;
		}

		req->wb_count++;
		error = nfs_wait_on_request(req);
		if (error < 0)
			return error;

		nfs_list_remove_request(req);
		nfs_clear_request(req);
		nfs_page_free(req);

		/* the list changed under us, so restart from the front */
		p = head->next;
		res++;
	}

	return res;
}
/** /**
* nfs_coalesce_requests - Split coalesced requests out from a list. * nfs_coalesce_requests - Split coalesced requests out from a list.
* @head: source list * @head: source list
......
...@@ -48,6 +48,7 @@ extern struct nfs_page *nfs_create_request(struct rpc_cred *, struct inode *, ...@@ -48,6 +48,7 @@ extern struct nfs_page *nfs_create_request(struct rpc_cred *, struct inode *,
unsigned int, unsigned int); unsigned int, unsigned int);
extern void nfs_clear_request(struct nfs_page *req); extern void nfs_clear_request(struct nfs_page *req);
extern void nfs_release_request(struct nfs_page *req); extern void nfs_release_request(struct nfs_page *req);
extern void nfs_release_list(struct list_head *list);
extern void nfs_list_add_request(struct nfs_page *, struct list_head *); extern void nfs_list_add_request(struct nfs_page *, struct list_head *);
...@@ -59,6 +60,7 @@ extern int nfs_scan_list(struct list_head *, struct list_head *, ...@@ -59,6 +60,7 @@ extern int nfs_scan_list(struct list_head *, struct list_head *,
extern int nfs_coalesce_requests(struct list_head *, struct list_head *, extern int nfs_coalesce_requests(struct list_head *, struct list_head *,
unsigned int); unsigned int);
extern int nfs_wait_on_request(struct nfs_page *); extern int nfs_wait_on_request(struct nfs_page *);
extern int nfs_wait_for_reads(struct list_head *);
extern spinlock_t nfs_wreq_lock; extern spinlock_t nfs_wreq_lock;
......
#ifndef _LINUX_NFS_XDR_H #ifndef _LINUX_NFS_XDR_H
#define _LINUX_NFS_XDR_H #define _LINUX_NFS_XDR_H
#include <linux/sunrpc/xprt.h>
struct nfs_fattr { struct nfs_fattr {
unsigned short valid; /* which fields are valid */ unsigned short valid; /* which fields are valid */
__u64 pre_size; /* pre_op_attr.size */ __u64 pre_size; /* pre_op_attr.size */
...@@ -57,10 +59,14 @@ struct nfs_fsinfo { ...@@ -57,10 +59,14 @@ struct nfs_fsinfo {
__u32 namelen;/* max name length */ __u32 namelen;/* max name length */
}; };
/* Arguments to the read call. /*
* Note that NFS_READ_MAXIOV must be <= (MAX_IOVEC-2) from sunrpc/xprt.h * Arguments to the read call.
*/ */
#define NFS_READ_MAXIOV 8
#define NFS_READ_MAXIOV (9U)
#if (NFS_READ_MAXIOV > (MAX_IOVEC -2))
#error "NFS_READ_MAXIOV is too large"
#endif
struct nfs_readargs { struct nfs_readargs {
struct nfs_fh * fh; struct nfs_fh * fh;
...@@ -76,10 +82,14 @@ struct nfs_readres { ...@@ -76,10 +82,14 @@ struct nfs_readres {
int eof; int eof;
}; };
/* Arguments to the write call. /*
* Note that NFS_WRITE_MAXIOV must be <= (MAX_IOVEC-2) from sunrpc/xprt.h * Arguments to the write call.
*/ */
#define NFS_WRITE_MAXIOV 8 #define NFS_WRITE_MAXIOV (9U)
#if (NFS_WRITE_MAXIOV > (MAX_IOVEC -2))
#error "NFS_WRITE_MAXIOV is too large"
#endif
struct nfs_writeargs { struct nfs_writeargs {
struct nfs_fh * fh; struct nfs_fh * fh;
__u64 offset; __u64 offset;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment