Commit 5274f052 authored by Jens Axboe, committed by Linus Torvalds

[PATCH] Introduce sys_splice() system call

This adds support for the sys_splice system call. Using a pipe as a
transport, it can connect to files or sockets (latter as output only).

From the splice.c comments:

   "splice": joining two ropes together by interweaving their strands.

   This is the "extended pipe" functionality, where a pipe is used as
   an arbitrary in-memory buffer. Think of a pipe as a small kernel
   buffer that you can use to transfer data from one end to the other.

   The traditional unix read/write is extended with a "splice()" operation
   that transfers data buffers to or from a pipe buffer.

   Named by Larry McVoy, original implementation from Linus, extended by
   Jens to support splicing to files and fixing the initial implementation
   bugs.
Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 5d4fe2c1
...@@ -312,3 +312,4 @@ ENTRY(sys_call_table) ...@@ -312,3 +312,4 @@ ENTRY(sys_call_table)
.long sys_unshare /* 310 */ .long sys_unshare /* 310 */
.long sys_set_robust_list .long sys_set_robust_list
.long sys_get_robust_list .long sys_get_robust_list
.long sys_splice
...@@ -1605,5 +1605,6 @@ sys_call_table: ...@@ -1605,5 +1605,6 @@ sys_call_table:
data8 sys_ni_syscall // reserved for pselect data8 sys_ni_syscall // reserved for pselect
data8 sys_ni_syscall // 1295 reserved for ppoll data8 sys_ni_syscall // 1295 reserved for ppoll
data8 sys_unshare data8 sys_unshare
data8 sys_splice
.org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
...@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ ...@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
ioprio.o pnode.o drop_caches.o ioprio.o pnode.o drop_caches.o splice.o
obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_EPOLL) += eventpoll.o
......
...@@ -53,6 +53,8 @@ const struct file_operations ext2_file_operations = { ...@@ -53,6 +53,8 @@ const struct file_operations ext2_file_operations = {
.readv = generic_file_readv, .readv = generic_file_readv,
.writev = generic_file_writev, .writev = generic_file_writev,
.sendfile = generic_file_sendfile, .sendfile = generic_file_sendfile,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
}; };
#ifdef CONFIG_EXT2_FS_XIP #ifdef CONFIG_EXT2_FS_XIP
......
...@@ -119,6 +119,8 @@ const struct file_operations ext3_file_operations = { ...@@ -119,6 +119,8 @@ const struct file_operations ext3_file_operations = {
.release = ext3_release_file, .release = ext3_release_file,
.fsync = ext3_sync_file, .fsync = ext3_sync_file,
.sendfile = generic_file_sendfile, .sendfile = generic_file_sendfile,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
}; };
struct inode_operations ext3_file_inode_operations = { struct inode_operations ext3_file_inode_operations = {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <linux/pipe_fs_i.h> #include <linux/pipe_fs_i.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/pagemap.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/ioctls.h> #include <asm/ioctls.h>
...@@ -94,11 +95,20 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff ...@@ -94,11 +95,20 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff
{ {
struct page *page = buf->page; struct page *page = buf->page;
if (info->tmp_page) { /*
__free_page(page); * If nobody else uses this page, and we don't already have a
* temporary page, let's keep track of it as a one-deep
* allocation cache
*/
if (page_count(page) == 1 && !info->tmp_page) {
info->tmp_page = page;
return; return;
} }
info->tmp_page = page;
/*
* Otherwise just release our reference to it
*/
page_cache_release(page);
} }
static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf)
...@@ -152,6 +162,11 @@ pipe_readv(struct file *filp, const struct iovec *_iov, ...@@ -152,6 +162,11 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
chars = total_len; chars = total_len;
addr = ops->map(filp, info, buf); addr = ops->map(filp, info, buf);
if (IS_ERR(addr)) {
if (!ret)
ret = PTR_ERR(addr);
break;
}
error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
ops->unmap(info, buf); ops->unmap(info, buf);
if (unlikely(error)) { if (unlikely(error)) {
...@@ -254,8 +269,16 @@ pipe_writev(struct file *filp, const struct iovec *_iov, ...@@ -254,8 +269,16 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
struct pipe_buf_operations *ops = buf->ops; struct pipe_buf_operations *ops = buf->ops;
int offset = buf->offset + buf->len; int offset = buf->offset + buf->len;
if (ops->can_merge && offset + chars <= PAGE_SIZE) { if (ops->can_merge && offset + chars <= PAGE_SIZE) {
void *addr = ops->map(filp, info, buf); void *addr;
int error = pipe_iov_copy_from_user(offset + addr, iov, chars); int error;
addr = ops->map(filp, info, buf);
if (IS_ERR(addr)) {
error = PTR_ERR(addr);
goto out;
}
error = pipe_iov_copy_from_user(offset + addr, iov,
chars);
ops->unmap(info, buf); ops->unmap(info, buf);
ret = error; ret = error;
do_wakeup = 1; do_wakeup = 1;
......
...@@ -1576,6 +1576,8 @@ const struct file_operations reiserfs_file_operations = { ...@@ -1576,6 +1576,8 @@ const struct file_operations reiserfs_file_operations = {
.sendfile = generic_file_sendfile, .sendfile = generic_file_sendfile,
.aio_read = generic_file_aio_read, .aio_read = generic_file_aio_read,
.aio_write = reiserfs_aio_write, .aio_write = reiserfs_aio_write,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
}; };
struct inode_operations reiserfs_file_inode_operations = { struct inode_operations reiserfs_file_inode_operations = {
......
This diff is collapsed.
...@@ -318,8 +318,9 @@ ...@@ -318,8 +318,9 @@
#define __NR_unshare 310 #define __NR_unshare 310
#define __NR_set_robust_list 311 #define __NR_set_robust_list 311
#define __NR_get_robust_list 312 #define __NR_get_robust_list 312
#define __NR_sys_splice 313
#define NR_syscalls 313 #define NR_syscalls 314
/* /*
* user-visible error numbers are in the range -1 - -128: see * user-visible error numbers are in the range -1 - -128: see
......
...@@ -285,12 +285,13 @@ ...@@ -285,12 +285,13 @@
#define __NR_faccessat 1293 #define __NR_faccessat 1293
/* 1294, 1295 reserved for pselect/ppoll */ /* 1294, 1295 reserved for pselect/ppoll */
#define __NR_unshare 1296 #define __NR_unshare 1296
#define __NR_splice 1297
#ifdef __KERNEL__ #ifdef __KERNEL__
#include <linux/config.h> #include <linux/config.h>
#define NR_syscalls 273 /* length of syscall table */ #define NR_syscalls 274 /* length of syscall table */
#define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGACTION
......
...@@ -301,8 +301,9 @@ ...@@ -301,8 +301,9 @@
#define __NR_pselect6 280 #define __NR_pselect6 280
#define __NR_ppoll 281 #define __NR_ppoll 281
#define __NR_unshare 282 #define __NR_unshare 282
#define __NR_splice 283
#define __NR_syscalls 283 #define __NR_syscalls 284
#ifdef __KERNEL__ #ifdef __KERNEL__
#define __NR__exit __NR_exit #define __NR__exit __NR_exit
......
...@@ -609,8 +609,10 @@ __SYSCALL(__NR_unshare, sys_unshare) ...@@ -609,8 +609,10 @@ __SYSCALL(__NR_unshare, sys_unshare)
__SYSCALL(__NR_set_robust_list, sys_set_robust_list) __SYSCALL(__NR_set_robust_list, sys_set_robust_list)
#define __NR_get_robust_list 274 #define __NR_get_robust_list 274
__SYSCALL(__NR_get_robust_list, sys_get_robust_list) __SYSCALL(__NR_get_robust_list, sys_get_robust_list)
#define __NR_splice 275
__SYSCALL(__NR_splice, sys_splice)
#define __NR_syscall_max __NR_get_robust_list #define __NR_syscall_max __NR_splice
#ifndef __NO_STUBS #ifndef __NO_STUBS
......
...@@ -1032,6 +1032,8 @@ struct file_operations { ...@@ -1032,6 +1032,8 @@ struct file_operations {
int (*check_flags)(int); int (*check_flags)(int);
int (*dir_notify)(struct file *filp, unsigned long arg); int (*dir_notify)(struct file *filp, unsigned long arg);
int (*flock) (struct file *, int, struct file_lock *); int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct inode *, struct file *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, struct inode *, size_t, unsigned int);
}; };
struct inode_operations { struct inode_operations {
...@@ -1609,6 +1611,8 @@ extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor ...@@ -1609,6 +1611,8 @@ extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor
extern void do_generic_mapping_read(struct address_space *mapping, extern void do_generic_mapping_read(struct address_space *mapping,
struct file_ra_state *, struct file *, struct file_ra_state *, struct file *,
loff_t *, read_descriptor_t *, read_actor_t); loff_t *, read_descriptor_t *, read_actor_t);
extern ssize_t generic_file_splice_read(struct file *, struct inode *, size_t, unsigned int);
extern ssize_t generic_file_splice_write(struct inode *, struct file *, size_t, unsigned int);
extern void extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
......
...@@ -569,5 +569,7 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename, ...@@ -569,5 +569,7 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename,
asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename,
int flags, int mode); int flags, int mode);
asmlinkage long sys_unshare(unsigned long unshare_flags); asmlinkage long sys_unshare(unsigned long unshare_flags);
asmlinkage long sys_splice(int fdin, int fdout, size_t len,
unsigned int flags);
#endif #endif
...@@ -119,6 +119,9 @@ static ssize_t sock_writev(struct file *file, const struct iovec *vector, ...@@ -119,6 +119,9 @@ static ssize_t sock_writev(struct file *file, const struct iovec *vector,
static ssize_t sock_sendpage(struct file *file, struct page *page, static ssize_t sock_sendpage(struct file *file, struct page *page,
int offset, size_t size, loff_t *ppos, int more); int offset, size_t size, loff_t *ppos, int more);
extern ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
size_t len, unsigned int flags);
/* /*
* Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
...@@ -141,7 +144,8 @@ static struct file_operations socket_file_ops = { ...@@ -141,7 +144,8 @@ static struct file_operations socket_file_ops = {
.fasync = sock_fasync, .fasync = sock_fasync,
.readv = sock_readv, .readv = sock_readv,
.writev = sock_writev, .writev = sock_writev,
.sendpage = sock_sendpage .sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
}; };
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment