Commit fccbe384 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] implement posix_fadvise64()

An implementation of posix_fadvise64().  It adds 368 bytes to my vmlinux and
is worth it.

I didn't bother doing posix_fadvise(), as userspace can implement that by
calling fadvise64().
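
For illustration only (this is not part of the patch): a minimal sketch of such a userspace wrapper on i386, assuming glibc's syscall() helper and the i386 convention of passing the 64-bit offset as two 32-bit arguments, low word first.  The name my_posix_fadvise() is made up, and unlike POSIX proper the sketch reports errors via -1/errno:

#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_fadvise64
#define __NR_fadvise64 250	/* i386 slot added by this patch */
#endif

/* Hypothetical wrapper: posix_fadvise() implemented in terms of fadvise64().
 * The POSIX_FADV_* advice values are the same ones the kernel uses. */
int my_posix_fadvise(int fd, long long offset, size_t len, int advice)
{
	return syscall(__NR_fadvise64, fd,
		       (unsigned long)(offset & 0xffffffffULL),	/* offset, low 32 bits */
		       (unsigned long)(offset >> 32),		/* offset, high 32 bits */
		       len, advice);
}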

The main reason for wanting this syscall is to provide userspace with the
ability to explicitly shoot down pagecache when streaming large files.  This
is what O_STREAMING does, except that posix_fadvise() is standards-based (and
harder to use).

posix_fadvise() also subsumes sys_readahead().
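
To illustrate the streaming case (not part of the patch): a sketch of a one-pass reader which tells the kernel to drop each chunk from pagecache once it has been consumed.  It assumes the glibc posix_fadvise() wrapper; stream_file() and the 64k chunk size are arbitrary choices:

#define _XOPEN_SOURCE 600	/* for posix_fadvise() */
#define _FILE_OFFSET_BITS 64	/* large files: 64-bit off_t */
#include <fcntl.h>
#include <unistd.h>

/* Read a large file once, letting the kernel discard each chunk of
 * pagecache as soon as it has been processed. */
int stream_file(const char *path)
{
	char buf[64 * 1024];
	off_t done = 0;
	ssize_t n;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	while ((n = read(fd, buf, sizeof(buf))) > 0) {
		/* ... consume buf[0..n) here ... */
		posix_fadvise(fd, done, n, POSIX_FADV_DONTNEED);
		done += n;
	}
	close(fd);
	return n < 0 ? -1 : 0;
}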

POSIX_FADV_WILLNEED will generally provide asynchronous readahead semantics
for small amounts of I/O, as long as things like indirect blocks are already
in core.

POSIX_FADV_RANDOM gives unprivileged applications a way of disabling
readahead on a per-fd basis, which may provide some benefit for super-seeky
access patterns such as databases.
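
For example (again not part of the patch, and the helper names are made up): a database-style application could turn off readahead on its data file and prefetch only the regions it knows it will touch:

#define _XOPEN_SOURCE 600	/* for posix_fadvise() */
#include <fcntl.h>
#include <sys/types.h>

/* Per-fd: stop kernel readahead on this file.  With this patch that just
 * sets the file's readahead window (f_ra.ra_pages) to zero. */
void set_random_access(int fd)
{
	posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
}

/* Start readahead of a region in the background, so that a later read()
 * finds the pages already in pagecache. */
void prefetch_region(int fd, off_t offset, off_t len)
{
	posix_fadvise(fd, offset, len, POSIX_FADV_WILLNEED);
}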



The POSIX_FADV_* values are already implemented in glibc, and this patch
ensures that they are in sync.

A test app (fadvise.c) is available in ext3 CVS.  See

	http://www.zip.com.au/~akpm/linux/ext3/

for CVS details.

Ulrich has reviewed this patch (thanks).
parent e7bfb1db
arch/i386/kernel/entry.S:

@@ -792,8 +792,8 @@ ENTRY(sys_call_table)
 	.long sys_io_getevents
 	.long sys_io_submit
 	.long sys_io_cancel
-	.long sys_ni_syscall	/* 250 sys_alloc_hugepages - reuse this */
-	.long sys_ni_syscall	/* was sys_free_hugepages - reuse this */
+	.long sys_fadvise64	/* 250 */
+	.long sys_ni_syscall
 	.long sys_exit_group
 	.long sys_lookup_dcookie
 	.long sys_epoll_create
include/asm-i386/unistd.h:

@@ -255,6 +255,8 @@
 #define __NR_io_getevents	247
 #define __NR_io_submit		248
 #define __NR_io_cancel		249
+#define __NR_fadvise64		250
+
 #define __NR_exit_group		252
 #define __NR_lookup_dcookie	253
 #define __NR_epoll_create	254
include/linux/fadvise.h (new file):

#ifndef FADVISE_H_INCLUDED
#define FADVISE_H_INCLUDED

#define POSIX_FADV_NORMAL	0 /* No further special treatment. */
#define POSIX_FADV_RANDOM	1 /* Expect random page references. */
#define POSIX_FADV_SEQUENTIAL	2 /* Expect sequential page references. */
#define POSIX_FADV_WILLNEED	3 /* Will need these pages. */
#define POSIX_FADV_DONTNEED	4 /* Don't need these pages. */
#define POSIX_FADV_NOREUSE	5 /* Data will be accessed once. */

#endif /* FADVISE_H_INCLUDED */
include/linux/fs.h:

@@ -1102,6 +1102,8 @@ extern int full_check_disk_change(struct block_device *);
 extern int __check_disk_change(dev_t);
 extern int invalidate_inodes(struct super_block *);
 extern int invalidate_device(kdev_t, int);
+extern void invalidate_mapping_pages(struct address_space *mapping,
+				pgoff_t start, pgoff_t end);
 extern void invalidate_inode_pages(struct address_space *mapping);
 extern void invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
mm/Makefile:

@@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   shmem.o vmalloc.o

-obj-y	:= bootmem.o filemap.o mempool.o oom_kill.o \
+obj-y	:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 	   page_alloc.o page-writeback.o pdflush.o readahead.o \
 	   slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
mm/fadvise.c (new file):

/*
 * mm/fadvise.c
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 11Jan2003	akpm@digeo.com
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/fadvise.h>

/*
 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
 * deactivate the pages and clear PG_Referenced.
 */
int sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
{
	struct file *file = fget(fd);
	struct inode *inode;
	struct address_space *mapping;
	struct backing_dev_info *bdi;
	int ret = 0;

	if (!file)
		return -EBADF;

	inode = file->f_dentry->d_inode;
	mapping = inode->i_mapping;
	if (!mapping) {
		ret = -EINVAL;
		goto out;
	}

	bdi = mapping->backing_dev_info;

	switch (advice) {
	case POSIX_FADV_NORMAL:
		file->f_ra.ra_pages = bdi->ra_pages;
		break;
	case POSIX_FADV_RANDOM:
		file->f_ra.ra_pages = 0;
		break;
	case POSIX_FADV_SEQUENTIAL:
		file->f_ra.ra_pages = bdi->ra_pages * 2;
		break;
	case POSIX_FADV_WILLNEED:
	case POSIX_FADV_NOREUSE:
		if (!mapping->a_ops->readpage) {
			ret = -EINVAL;
			break;
		}
		ret = do_page_cache_readahead(mapping, file,
				offset >> PAGE_CACHE_SHIFT,
				max_sane_readahead(len >> PAGE_CACHE_SHIFT));
		if (ret > 0)
			ret = 0;
		break;
	case POSIX_FADV_DONTNEED:
		invalidate_mapping_pages(mapping, offset >> PAGE_CACHE_SHIFT,
				(len >> PAGE_CACHE_SHIFT) + 1);
		break;
	default:
		ret = -EINVAL;
	}
out:
	fput(file);
	return ret;
}
mm/truncate.c:

@@ -177,24 +177,28 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 }

 /**
- * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
- * @inode: the inode which pages we want to invalidate
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @inode: the address_space which holds the pages to invalidate
+ * @end: the index of the last page to invalidate (inclusive)
+ * @nr_pages: defines the pagecache span.  Invalidate up to @start + @nr_pages
  *
  * This function only removes the unlocked pages, if you want to
  * remove all the pages of one inode, you must call truncate_inode_pages.
  *
- * invalidate_inode_pages() will not block on IO activity. It will not
+ * invalidate_mapping_pages() will not block on IO activity. It will not
  * invalidate pages which are dirty, locked, under writeback or mapped into
  * pagetables.
  */
-void invalidate_inode_pages(struct address_space *mapping)
+void invalidate_mapping_pages(struct address_space *mapping,
+		pgoff_t start, pgoff_t end)
 {
 	struct pagevec pvec;
-	pgoff_t next = 0;
+	pgoff_t next = start;
 	int i;

 	pagevec_init(&pvec, 0);
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (next <= end &&
+			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];

@@ -218,6 +222,11 @@ void invalidate_inode_pages(struct address_space *mapping)
 	}
 }

+void invalidate_inode_pages(struct address_space *mapping)
+{
+	invalidate_mapping_pages(mapping, 0, ~0UL);
+}
+
 /**
  * invalidate_inode_pages2 - remove all unmapped pages from an address_space
  * @mapping - the address_space