Commit c03e7607 authored by Chuck Lever's avatar Chuck Lever Committed by Linus Torvalds

[PATCH] initial support for NFS direct I/O for 2.5

This adds initial support for NFS direct I/O in the 2.5 kernel.  Many
have asked for this support to be included in 2.5.  This patch does not
provide working NFS direct I/O, but I'm sending what I have now so that
it can be included before October 20.

NFS direct I/O is enabled by its very own kernel config option.  When
that option is enabled, the NFS client deliberately fails to build, so
that people cannot use this code yet and possibly corrupt their NFS
files.  Later I will send a patch that finishes the implementation.

[ Config option currently disabled ]
parent eb582eba
......@@ -514,6 +514,27 @@ CONFIG_NFS_V3
If unsure, say N.
CONFIG_NFS_DIRECTIO
This option enables applications to perform uncached I/O on files
in NFS file systems using the O_DIRECT open() flag. When O_DIRECT
is set for a file, its data is not cached in the system's page
cache. Data is moved to and from user-level application buffers
directly. Unlike local disk-based file systems, NFS O_DIRECT has
no alignment restrictions.
Unless your program is designed to use O_DIRECT properly, you are
much better off allowing the NFS client to manage data caching for
you. Misusing O_DIRECT can cause poor server performance or network
storms. This kernel build option defaults OFF to avoid exposing
system administrators unwittingly to a potentially hazardous
feature.
For more details on NFS O_DIRECT, see fs/nfs/direct.c.
If unsure, say N. This reduces the size of the NFS client, and
causes open() to return EINVAL if a file residing in NFS is
opened with the O_DIRECT flag.
CONFIG_ROOT_NFS
If you want your Linux box to mount its whole root file system (the
one containing the directory /) from some other computer over the
......
......@@ -8,6 +8,7 @@ nfs-y := dir.o file.o flushd.o inode.o nfs2xdr.o pagelist.o \
proc.o read.o symlink.o unlink.o write.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o
nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
nfs-objs := $(nfs-y)
include $(TOPDIR)/Rules.make
/*
* linux/fs/nfs/direct.c
*
* Copyright (C) 2001 by Chuck Lever <cel@netapp.com>
*
* High-performance uncached I/O for the Linux NFS client
*
* There are important applications whose performance or correctness
* depends on uncached access to file data. Database clusters
* (multiple copies of the same instance running on separate hosts)
* implement their own cache coherency protocol that subsumes file
* system cache protocols. Applications that process datasets
* considerably larger than the client's memory do not always benefit
* from a local cache. A streaming video server, for instance, has no
* need to cache the contents of a file.
*
* When an application requests uncached I/O, all read and write requests
* are made directly to the server; data stored or fetched via these
* requests is not cached in the Linux page cache. The client does not
* correct unaligned requests from applications. All requested bytes are
* held on permanent storage before a direct write system call returns to
* an application.
*
* Solaris implements an uncached I/O facility called directio() that
* is used for backups and sequential I/O to very large files. Solaris
* also supports uncaching whole NFS partitions with "-o forcedirectio,"
* an undocumented mount option.
*
* Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust.
*
* 18 Dec 2001 Initial implementation for 2.4 --cel
* 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
* 24 Sep 2002 Rewrite to use asynchronous RPCs, port to 2.5 --cel
*
*/
#include <linux/config.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/errno.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#define NFSDBG_FACILITY (NFSDBG_PAGECACHE | NFSDBG_VFS)
#define VERF_SIZE (2 * sizeof(__u32))
/**
 * nfs_get_user_pages - find and set up pages representing user buffer
 * @addr: user-space address of target buffer
 * @size: total size in bytes of target buffer
 * @pages: returned array of page struct pointers underlying target buffer
 * @rw: direction of the file I/O (READ or WRITE)
 *
 * On success, returns the number of pages pinned and stores a kmalloc'ed
 * array of that many page pointers in *@pages; the caller owns both the
 * pins and the array.  A zero-length buffer yields 0 with *@pages NULL.
 *
 * On failure, returns a negative errno.  Nothing stays pinned and
 * *@pages is NULL, so the caller has nothing to clean up.
 */
static inline int
nfs_get_user_pages(unsigned long addr, size_t size,
		struct page ***pages, int rw)
{
	int result = -ENOMEM;
	unsigned long page_count;
	size_t array_size;
	int i;

	*pages = NULL;
	if (size == 0)
		return 0;

	/* reject a buffer that wraps past the top of the address space */
	if (size - 1 > ~0UL - addr)
		return -EFAULT;

	/*
	 * Count every page the buffer touches.  The buffer need not be
	 * page aligned, so partial pages at either end count too.
	 */
	page_count = ((addr + size - 1) >> PAGE_SHIFT) -
			(addr >> PAGE_SHIFT) + 1;

	array_size = page_count * sizeof(struct page *);
	*pages = kmalloc(array_size, GFP_KERNEL);
	if (!*pages)
		return result;

	down_read(&current->mm->mmap_sem);
	/*
	 * A file READ stores data into the user buffer, so that is the
	 * case in which the pages must be writable.
	 */
	result = get_user_pages(current, current->mm, addr,
				(int) page_count, (rw == READ), 0,
				*pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (result >= 0 && (unsigned long) result == page_count)
		return result;

	if (result < 0) {
		printk(KERN_ERR "%s: get_user_pages result %d\n",
				__FUNCTION__, result);
	} else {
		/*
		 * Fewer pages than requested: part of the buffer is not
		 * mapped.  Drop the pins we did get and report a fault.
		 */
		for (i = 0; i < result; i++)
			page_cache_release((*pages)[i]);
		result = -EFAULT;
	}

	kfree(*pages);
	*pages = NULL;
	return result;
}
/**
 * nfs_free_user_pages - tear down page struct array
 * @pages: array of page struct pointers underlying target buffer
 * @count: number of leading entries of @pages to release
 *
 * Calls page_cache_release() on pages[0] through pages[count - 1], then
 * frees the array itself.  Passing a @count of zero frees only the array.
 */
static inline void
nfs_free_user_pages(struct page **pages, unsigned count)
{
	unsigned i;

	for (i = 0; i < count; i++)
		page_cache_release(pages[i]);

	kfree(pages);
}
/**
 * nfs_iov2pagelist - convert an array of iovecs to a list of page requests
 * @rw: direction of the file I/O (READ or WRITE)
 * @inode: inode of target file
 * @cred: credentials of user who requested I/O
 * @iov: array of vectors that define I/O buffer
 * @offset: where in file to begin the I/O
 * @nr_segs: size of iovec array
 * @requests: append new page requests to this list head
 *
 * Pins the user pages behind each iovec and queues one page request per
 * page touched.  Returns the total number of bytes queued, or a negative
 * errno after releasing every request already placed on @requests.
 */
static int
nfs_iov2pagelist(int rw, const struct inode *inode,
		const struct rpc_cred *cred,
		const struct iovec *iov, loff_t offset,
		unsigned long nr_segs, struct list_head *requests)
{
	unsigned long seg;
	int tot_bytes = 0;
	struct page **pages;

	/* for each iovec in the array... */
	for (seg = 0; seg < nr_segs; seg++) {
		const unsigned long user_addr =
					(unsigned long) iov[seg].iov_base;
		size_t bytes = iov[seg].iov_len;
		unsigned int pg_offset = (user_addr & ~PAGE_MASK);
		int page_count, page = 0;

		page_count = nfs_get_user_pages(user_addr, bytes, &pages, rw);
		if (page_count < 0) {
			/*
			 * Nothing is pinned for this segment, but the
			 * pointer array may still be allocated; kfree(NULL)
			 * is a no-op, so this is safe either way.
			 */
			kfree(pages);
			nfs_release_list(requests);
			return page_count;
		}

		/* ...build as many page requests as required */
		while (bytes > 0) {
			struct nfs_page *new;
			/* a request must not run past the end of its page */
			const size_t room = PAGE_SIZE - pg_offset;
			const unsigned int pg_bytes = (bytes > room) ?
							room : bytes;

			new = nfs_create_request((struct rpc_cred *) cred,
						 (struct inode *) inode,
						 pages[page],
						 pg_offset, pg_bytes);
			if (IS_ERR(new)) {
				/*
				 * Pages before this one already belong to
				 * queued requests and are released by
				 * nfs_release_list() below; release only
				 * the ones not yet handed over.
				 */
				while (page < page_count)
					page_cache_release(pages[page++]);
				kfree(pages);
				nfs_release_list(requests);
				return PTR_ERR(new);
			}

			/*
			 * NOTE(review): wb_index receives a byte offset
			 * here -- confirm the request layer expects that.
			 */
			new->wb_index = offset;
			nfs_list_add_request(new, requests);

			/* only the first page of a segment starts mid-page */
			pg_offset = 0;
			offset += pg_bytes;
			tot_bytes += pg_bytes;
			bytes -= pg_bytes;
			page++;
		}

		/* don't release pages here -- I/O completion will do that */
		nfs_free_user_pages(pages, 0);
	}

	return tot_bytes;
}
/**
 * do_nfs_direct_IO - Read or write data without caching
 * @rw: direction of the I/O (READ or WRITE)
 * @inode: inode of target file
 * @cred: credentials of user who requested I/O
 * @iov: array of vectors that define I/O buffer
 * @offset: where in file to begin the I/O
 * @nr_segs: size of iovec array
 *
 * Break the passed-in iovec into a series of page-sized or smaller
 * requests, where each page is mapped for direct user-land I/O.
 *
 * For each of these pages, create an NFS page request and
 * append it to an automatic list of page requests.
 *
 * When all page requests have been queued, start the I/O on the
 * whole list.  The underlying routines coalesce the pages on the
 * list into a bunch of asynchronous "r/wsize" network requests.
 *
 * I/O completion automatically unmaps and releases the pages.
 *
 * Returns the number of bytes queued by nfs_iov2pagelist() on success,
 * -EINVAL for an unknown @rw, or the negative errno of whichever step
 * failed.
 */
static int
do_nfs_direct_IO(int rw, const struct inode *inode,
		const struct rpc_cred *cred, const struct iovec *iov,
		loff_t offset, unsigned long nr_segs)
{
	LIST_HEAD(requests);
	int result, tot_bytes, wait_result;

	/* refuse an unknown direction before pinning any user pages */
	if (rw != READ && rw != WRITE)
		return -EINVAL;

	result = nfs_iov2pagelist(rw, inode, cred, iov, offset, nr_segs,
					&requests);
	if (result < 0)
		return result;
	tot_bytes = result;

	switch (rw) {
	case READ:
		if (IS_SYNC(inode) || (NFS_SERVER(inode)->rsize < PAGE_SIZE)) {
			/*
			 * The synchronous path is handed the iovec, not
			 * the request list; dispose of the list so its
			 * requests and page pins are not leaked.
			 */
			nfs_release_list(&requests);
			result = nfs_direct_read_sync(inode, cred, iov, offset, nr_segs);
			break;
		}
		result = nfs_pagein_list(&requests, NFS_SERVER(inode)->rpages);
		wait_result = nfs_wait_for_reads(&requests);
		/*
		 * A failed wait means the data may not have arrived, so
		 * it must not be reported as success.
		 * NOTE(review): requests may still sit on the automatic
		 * list at this point -- confirm who disposes of them.
		 */
		if (result >= 0 && wait_result < 0)
			result = wait_result;
		break;
	case WRITE:
		if (IS_SYNC(inode) || (NFS_SERVER(inode)->wsize < PAGE_SIZE)) {
			/* as for READ: the request list is unused here */
			nfs_release_list(&requests);
			result = nfs_direct_write_sync(inode, cred, iov, offset, nr_segs);
		} else
			result = nfs_flush_list(&requests,
					NFS_SERVER(inode)->wpages, FLUSH_WAIT);

		/* invalidate cache so non-direct readers pick up changes */
		invalidate_inode_pages((struct inode *) inode);
		break;
	default:
		/* not reached: rw was validated above */
		result = -EINVAL;
		break;
	}

	if (result < 0)
		return result;
	return tot_bytes;
}
/**
* nfs_direct_IO - NFS address space operation for direct I/O
* @rw: direction (read or write)
* @file: file struct of target file
* @iov: array of vectors that define I/O buffer
* @offset: offset in file to begin the operation
* @nr_segs: size of iovec array
*
* Logs the request, passes it to do_nfs_direct_IO() together with the
* file's inode and credential, logs the result and returns it.
*
* The inode's i_sem is no longer held by the VFS layer before it calls
* this function to do a write.
*/
int
nfs_direct_IO(int rw, struct file *file, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
/*
* None of this works yet, so prevent it from compiling.
*
* The #if 0 below removes only the local declarations.  The statements
* after the #endif still use result, dentry, inode and cred, so this
* function cannot compile as written.  Do not "fix" this by moving the
* #endif until the implementation is finished.
*/
#if 0
int result;
struct dentry *dentry = file->f_dentry;
const struct inode *inode = dentry->d_inode->i_mapping->host;
const struct rpc_cred *cred = nfs_file_cred(file);
#endif
dfprintk(VFS, "NFS: direct_IO(%s) (%s/%s) off/no(%Lu/%lu)\n",
((rw == READ) ? "READ" : "WRITE"),
dentry->d_parent->d_name.name,
dentry->d_name.name, offset, nr_segs);
result = do_nfs_direct_IO(rw, inode, cred, iov, offset, nr_segs);
dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
return result;
}
......@@ -199,7 +199,10 @@ struct address_space_operations nfs_file_aops = {
.sync_page = nfs_sync_page,
.writepage = nfs_writepage,
.prepare_write = nfs_prepare_write,
.commit_write = nfs_commit_write
.commit_write = nfs_commit_write,
#ifdef CONFIG_NFS_DIRECTIO
.direct_IO = nfs_direct_IO,
#endif
};
/*
......
......@@ -175,6 +175,26 @@ nfs_release_request(struct nfs_page *req)
nfs_page_free(req);
}
/**
* nfs_release_list - cleanly dispose of an unattached list of page requests
* @list: list of doomed page requests
*/
void
nfs_release_list(struct list_head *list)
{
while (!list_empty(list)) {
struct nfs_page *req = nfs_list_entry(list);
nfs_list_remove_request(req);
page_cache_release(req->wb_page);
/* Release struct file or cached credential */
nfs_clear_request(req);
nfs_page_free(req);
}
}
/**
* nfs_list_add_request - Insert a request into a sorted list
* @req: request
......@@ -223,6 +243,37 @@ nfs_wait_on_request(struct nfs_page *req)
return nfs_wait_event(clnt, req->wb_wait, !NFS_WBACK_BUSY(req));
}
/**
 * nfs_wait_for_reads - wait for outstanding requests to complete
 * @head: list of page requests to wait for
 *
 * Walks @head; every request found busy is waited on, then removed from
 * the list, cleared and freed.  Requests that are not busy are left on
 * the list untouched.
 *
 * Returns the number of requests waited for and freed, or the negative
 * error from nfs_wait_on_request() if a wait fails; in that case the
 * remaining requests stay on @head.
 */
int
nfs_wait_for_reads(struct list_head *head)
{
	struct list_head *p = head->next;
	unsigned int res = 0;

	while (p != head) {
		struct nfs_page *req = nfs_list_entry(p);
		int error;

		if (!NFS_WBACK_BUSY(req)) {
			/*
			 * Nothing to wait for.  Step past this request;
			 * without advancing, the loop would spin on it
			 * forever.
			 */
			p = p->next;
			continue;
		}

		/*
		 * NOTE(review): this count is not dropped again if the
		 * wait below fails -- confirm whether it should be.
		 */
		req->wb_count++;
		error = nfs_wait_on_request(req);
		if (error < 0)
			return error;

		nfs_list_remove_request(req);
		nfs_clear_request(req);
		nfs_page_free(req);

		/* restart from the front: the wait above may have slept */
		p = head->next;
		res++;
	}

	return res;
}
/**
* nfs_coalesce_requests - Split coalesced requests out from a list.
* @head: source list
......
......@@ -48,6 +48,7 @@ extern struct nfs_page *nfs_create_request(struct rpc_cred *, struct inode *,
unsigned int, unsigned int);
extern void nfs_clear_request(struct nfs_page *req);
extern void nfs_release_request(struct nfs_page *req);
extern void nfs_release_list(struct list_head *list);
extern void nfs_list_add_request(struct nfs_page *, struct list_head *);
......@@ -59,6 +60,7 @@ extern int nfs_scan_list(struct list_head *, struct list_head *,
extern int nfs_coalesce_requests(struct list_head *, struct list_head *,
unsigned int);
extern int nfs_wait_on_request(struct nfs_page *);
extern int nfs_wait_for_reads(struct list_head *);
extern spinlock_t nfs_wreq_lock;
......
#ifndef _LINUX_NFS_XDR_H
#define _LINUX_NFS_XDR_H
#include <linux/sunrpc/xprt.h>
struct nfs_fattr {
unsigned short valid; /* which fields are valid */
__u64 pre_size; /* pre_op_attr.size */
......@@ -57,10 +59,14 @@ struct nfs_fsinfo {
__u32 namelen;/* max name length */
};
/* Arguments to the read call.
* Note that NFS_READ_MAXIOV must be <= (MAX_IOVEC-2) from sunrpc/xprt.h
/*
* Arguments to the read call.
*/
#define NFS_READ_MAXIOV 8
#define NFS_READ_MAXIOV (9U)
#if (NFS_READ_MAXIOV > (MAX_IOVEC -2))
#error "NFS_READ_MAXIOV is too large"
#endif
struct nfs_readargs {
struct nfs_fh * fh;
......@@ -76,10 +82,14 @@ struct nfs_readres {
int eof;
};
/* Arguments to the write call.
* Note that NFS_WRITE_MAXIOV must be <= (MAX_IOVEC-2) from sunrpc/xprt.h
/*
* Arguments to the write call.
*/
#define NFS_WRITE_MAXIOV 8
#define NFS_WRITE_MAXIOV (9U)
#if (NFS_WRITE_MAXIOV > (MAX_IOVEC -2))
#error "NFS_WRITE_MAXIOV is too large"
#endif
struct nfs_writeargs {
struct nfs_fh * fh;
__u64 offset;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment