Commit 48751b56 authored by Linus Torvalds

Merge tag 'ovl-fixes-4.19-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs

Pull overlayfs fixes from Miklos Szeredi:
 "This fixes a regression in the recent file stacking update, reported
  and fixed by Amir Goldstein. The fix is fairly trivial, but involves
  adding a fadvise() f_op and the associated churn in the vfs. As
  discussed on -fsdevel, there are other possible uses for this method
  besides allowing proper stacking for overlays.

  And there's one other fix for a syzkaller detected oops"

* tag 'ovl-fixes-4.19-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs:
  ovl: fix oopses in ovl_fill_super() failure paths
  ovl: add ovl_fadvise()
  vfs: implement readahead(2) using POSIX_FADV_WILLNEED
  vfs: add the fadvise() file operation
  Documentation/filesystems: update documentation of file_operations
  ovl: fix GPF in swapfile_activate of file from overlayfs over xfs
  ovl: respect FIEMAP_FLAG_SYNC flag
parents 4d8d9f54 8c25741a
...@@ -848,7 +848,7 @@ struct file_operations ...@@ -848,7 +848,7 @@ struct file_operations
---------------------- ----------------------
This describes how the VFS can manipulate an open file. As of kernel This describes how the VFS can manipulate an open file. As of kernel
4.1, the following members are defined: 4.18, the following members are defined:
struct file_operations { struct file_operations {
struct module *owner; struct module *owner;
...@@ -858,11 +858,11 @@ struct file_operations { ...@@ -858,11 +858,11 @@ struct file_operations {
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *); int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *); __poll_t (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *); int (*mmap) (struct file *, struct vm_area_struct *);
int (*mremap)(struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *); int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id); int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *); int (*release) (struct inode *, struct file *);
...@@ -882,6 +882,10 @@ struct file_operations { ...@@ -882,6 +882,10 @@ struct file_operations {
#ifndef CONFIG_MMU #ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *); unsigned (*mmap_capabilities)(struct file *);
#endif #endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64);
int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64);
int (*fadvise)(struct file *, loff_t, loff_t, int);
}; };
Again, all methods are called without any locks being held, unless Again, all methods are called without any locks being held, unless
...@@ -899,6 +903,9 @@ otherwise noted. ...@@ -899,6 +903,9 @@ otherwise noted.
iterate: called when the VFS needs to read the directory contents iterate: called when the VFS needs to read the directory contents
iterate_shared: called when the VFS needs to read the directory contents
when filesystem supports concurrent dir iterators
poll: called by the VFS when a process wants to check if there is poll: called by the VFS when a process wants to check if there is
activity on this file and (optionally) go to sleep until there activity on this file and (optionally) go to sleep until there
is activity. Called by the select(2) and poll(2) system calls is activity. Called by the select(2) and poll(2) system calls
...@@ -951,6 +958,16 @@ otherwise noted. ...@@ -951,6 +958,16 @@ otherwise noted.
fallocate: called by the VFS to preallocate blocks or punch a hole. fallocate: called by the VFS to preallocate blocks or punch a hole.
copy_file_range: called by the copy_file_range(2) system call.
clone_file_range: called by the ioctl(2) system call for FICLONERANGE and
FICLONE commands.
dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE
command.
fadvise: possibly called by the fadvise64() system call.
Note that the file operations are implemented by the specific Note that the file operations are implemented by the specific
filesystem in which the inode resides. When opening a device node filesystem in which the inode resides. When opening a device node
(character or block special) most filesystems will call special (character or block special) most filesystems will call special
......
...@@ -131,9 +131,6 @@ static int ovl_open(struct inode *inode, struct file *file) ...@@ -131,9 +131,6 @@ static int ovl_open(struct inode *inode, struct file *file)
if (IS_ERR(realfile)) if (IS_ERR(realfile))
return PTR_ERR(realfile); return PTR_ERR(realfile);
/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
file->f_mapping = realfile->f_mapping;
file->private_data = realfile; file->private_data = realfile;
return 0; return 0;
...@@ -334,6 +331,25 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len ...@@ -334,6 +331,25 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
return ret; return ret;
} }
/*
 * Forward fadvise() advice given on an overlayfs file to its real file.
 *
 * The real file is obtained with ovl_real_fdget(); if that fails, its error
 * is returned as-is.  Otherwise vfs_fadvise() is invoked on the real file
 * with credentials overridden via ovl_override_creds() and restored with
 * revert_creds() afterwards, and the vfs_fadvise() result is returned.
 */
static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
	const struct cred *saved_cred;
	struct fd realfd;
	int err = ovl_real_fdget(file, &realfd);

	if (err)
		return err;

	saved_cred = ovl_override_creds(file_inode(file)->i_sb);
	err = vfs_fadvise(realfd.file, offset, len, advice);
	revert_creds(saved_cred);

	/* Drop the reference taken by ovl_real_fdget(). */
	fdput(realfd);
	return err;
}
static long ovl_real_ioctl(struct file *file, unsigned int cmd, static long ovl_real_ioctl(struct file *file, unsigned int cmd,
unsigned long arg) unsigned long arg)
{ {
...@@ -502,6 +518,7 @@ const struct file_operations ovl_file_operations = { ...@@ -502,6 +518,7 @@ const struct file_operations ovl_file_operations = {
.fsync = ovl_fsync, .fsync = ovl_fsync,
.mmap = ovl_mmap, .mmap = ovl_mmap,
.fallocate = ovl_fallocate, .fallocate = ovl_fallocate,
.fadvise = ovl_fadvise,
.unlocked_ioctl = ovl_ioctl, .unlocked_ioctl = ovl_ioctl,
.compat_ioctl = ovl_compat_ioctl, .compat_ioctl = ovl_compat_ioctl,
......
...@@ -467,6 +467,10 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ...@@ -467,6 +467,10 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -EOPNOTSUPP; return -EOPNOTSUPP;
old_cred = ovl_override_creds(inode->i_sb); old_cred = ovl_override_creds(inode->i_sb);
if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
filemap_write_and_wait(realinode->i_mapping);
err = realinode->i_op->fiemap(realinode, fieinfo, start, len); err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
revert_creds(old_cred); revert_creds(old_cred);
...@@ -500,6 +504,11 @@ static const struct inode_operations ovl_special_inode_operations = { ...@@ -500,6 +504,11 @@ static const struct inode_operations ovl_special_inode_operations = {
.update_time = ovl_update_time, .update_time = ovl_update_time,
}; };
/*
 * Address space operations for overlayfs inodes.  Only ->direct_IO is
 * populated, and it points at noop_direct_IO; every other operation is
 * left unset.
 */
const struct address_space_operations ovl_aops = {
/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
.direct_IO = noop_direct_IO,
};
/* /*
* It is possible to stack overlayfs instance on top of another * It is possible to stack overlayfs instance on top of another
* overlayfs instance as lower layer. We need to annotate the * overlayfs instance as lower layer. We need to annotate the
...@@ -571,6 +580,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, ...@@ -571,6 +580,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
case S_IFREG: case S_IFREG:
inode->i_op = &ovl_file_inode_operations; inode->i_op = &ovl_file_inode_operations;
inode->i_fop = &ovl_file_operations; inode->i_fop = &ovl_file_operations;
inode->i_mapping->a_ops = &ovl_aops;
break; break;
case S_IFDIR: case S_IFDIR:
......
...@@ -982,16 +982,6 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath) ...@@ -982,16 +982,6 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath)
if (err) if (err)
goto out; goto out;
err = -EBUSY;
if (ovl_inuse_trylock(upperpath->dentry)) {
ofs->upperdir_locked = true;
} else if (ofs->config.index) {
pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
goto out;
} else {
pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
}
upper_mnt = clone_private_mount(upperpath); upper_mnt = clone_private_mount(upperpath);
err = PTR_ERR(upper_mnt); err = PTR_ERR(upper_mnt);
if (IS_ERR(upper_mnt)) { if (IS_ERR(upper_mnt)) {
...@@ -1002,6 +992,17 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath) ...@@ -1002,6 +992,17 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath)
/* Don't inherit atime flags */ /* Don't inherit atime flags */
upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
ofs->upper_mnt = upper_mnt; ofs->upper_mnt = upper_mnt;
err = -EBUSY;
if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) {
ofs->upperdir_locked = true;
} else if (ofs->config.index) {
pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
goto out;
} else {
pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
}
err = 0; err = 0;
out: out:
return err; return err;
...@@ -1101,8 +1102,10 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath) ...@@ -1101,8 +1102,10 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath)
goto out; goto out;
} }
ofs->workbasedir = dget(workpath.dentry);
err = -EBUSY; err = -EBUSY;
if (ovl_inuse_trylock(workpath.dentry)) { if (ovl_inuse_trylock(ofs->workbasedir)) {
ofs->workdir_locked = true; ofs->workdir_locked = true;
} else if (ofs->config.index) { } else if (ofs->config.index) {
pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
...@@ -1111,7 +1114,6 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath) ...@@ -1111,7 +1114,6 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath)
pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
} }
ofs->workbasedir = dget(workpath.dentry);
err = ovl_make_workdir(ofs, &workpath); err = ovl_make_workdir(ofs, &workpath);
if (err) if (err)
goto out; goto out;
......
...@@ -1763,6 +1763,7 @@ struct file_operations { ...@@ -1763,6 +1763,7 @@ struct file_operations {
u64); u64);
int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
u64); u64);
int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout; } __randomize_layout;
struct inode_operations { struct inode_operations {
...@@ -3459,4 +3460,8 @@ static inline bool dir_relax_shared(struct inode *inode) ...@@ -3459,4 +3460,8 @@ static inline bool dir_relax_shared(struct inode *inode)
extern bool path_noexec(const struct path *path); extern bool path_noexec(const struct path *path);
extern void inode_nohighmem(struct inode *inode); extern void inode_nohighmem(struct inode *inode);
/* mm/fadvise.c */
extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
int advice);
#endif /* _LINUX_FS_H */ #endif /* _LINUX_FS_H */
...@@ -32,7 +32,7 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH ...@@ -32,7 +32,7 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH
mmu-$(CONFIG_MMU) += process_vm_access.o mmu-$(CONFIG_MMU) += process_vm_access.o
endif endif
obj-y := filemap.o mempool.o oom_kill.o \ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \ maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \ util.o mmzone.o vmstat.o backing-dev.o \
...@@ -49,7 +49,6 @@ else ...@@ -49,7 +49,6 @@ else
obj-y += bootmem.o obj-y += bootmem.o
endif endif
obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o
ifdef CONFIG_MMU ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
endif endif
......
...@@ -27,9 +27,9 @@ ...@@ -27,9 +27,9 @@
* deactivate the pages and clear PG_Referenced. * deactivate the pages and clear PG_Referenced.
*/ */
int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
int advice)
{ {
struct fd f = fdget(fd);
struct inode *inode; struct inode *inode;
struct address_space *mapping; struct address_space *mapping;
struct backing_dev_info *bdi; struct backing_dev_info *bdi;
...@@ -37,22 +37,14 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) ...@@ -37,22 +37,14 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
pgoff_t start_index; pgoff_t start_index;
pgoff_t end_index; pgoff_t end_index;
unsigned long nrpages; unsigned long nrpages;
int ret = 0;
if (!f.file)
return -EBADF;
inode = file_inode(f.file); inode = file_inode(file);
if (S_ISFIFO(inode->i_mode)) { if (S_ISFIFO(inode->i_mode))
ret = -ESPIPE; return -ESPIPE;
goto out;
}
mapping = f.file->f_mapping; mapping = file->f_mapping;
if (!mapping || len < 0) { if (!mapping || len < 0)
ret = -EINVAL; return -EINVAL;
goto out;
}
bdi = inode_to_bdi(mapping->host); bdi = inode_to_bdi(mapping->host);
...@@ -67,9 +59,9 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) ...@@ -67,9 +59,9 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
/* no bad return value, but ignore advice */ /* no bad return value, but ignore advice */
break; break;
default: default:
ret = -EINVAL; return -EINVAL;
} }
goto out; return 0;
} }
/* /*
...@@ -85,21 +77,21 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) ...@@ -85,21 +77,21 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
switch (advice) { switch (advice) {
case POSIX_FADV_NORMAL: case POSIX_FADV_NORMAL:
f.file->f_ra.ra_pages = bdi->ra_pages; file->f_ra.ra_pages = bdi->ra_pages;
spin_lock(&f.file->f_lock); spin_lock(&file->f_lock);
f.file->f_mode &= ~FMODE_RANDOM; file->f_mode &= ~FMODE_RANDOM;
spin_unlock(&f.file->f_lock); spin_unlock(&file->f_lock);
break; break;
case POSIX_FADV_RANDOM: case POSIX_FADV_RANDOM:
spin_lock(&f.file->f_lock); spin_lock(&file->f_lock);
f.file->f_mode |= FMODE_RANDOM; file->f_mode |= FMODE_RANDOM;
spin_unlock(&f.file->f_lock); spin_unlock(&file->f_lock);
break; break;
case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_SEQUENTIAL:
f.file->f_ra.ra_pages = bdi->ra_pages * 2; file->f_ra.ra_pages = bdi->ra_pages * 2;
spin_lock(&f.file->f_lock); spin_lock(&file->f_lock);
f.file->f_mode &= ~FMODE_RANDOM; file->f_mode &= ~FMODE_RANDOM;
spin_unlock(&f.file->f_lock); spin_unlock(&file->f_lock);
break; break;
case POSIX_FADV_WILLNEED: case POSIX_FADV_WILLNEED:
/* First and last PARTIAL page! */ /* First and last PARTIAL page! */
...@@ -115,8 +107,7 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) ...@@ -115,8 +107,7 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
* Ignore return value because fadvise() shall return * Ignore return value because fadvise() shall return
* success even if filesystem can't retrieve a hint, * success even if filesystem can't retrieve a hint,
*/ */
force_page_cache_readahead(mapping, f.file, start_index, force_page_cache_readahead(mapping, file, start_index, nrpages);
nrpages);
break; break;
case POSIX_FADV_NOREUSE: case POSIX_FADV_NOREUSE:
break; break;
...@@ -183,9 +174,32 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) ...@@ -183,9 +174,32 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
} }
break; break;
default: default:
ret = -EINVAL; return -EINVAL;
} }
out: return 0;
}
/*
 * vfs_fadvise - apply fadvise() advice to an open file
 *
 * Uses the file's own ->fadvise() operation when the filesystem provides
 * one and falls back to generic_fadvise() otherwise.  The return value of
 * whichever handler runs is passed through unchanged.
 */
int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
	int (*fadvise_op)(struct file *, loff_t, loff_t, int);

	fadvise_op = file->f_op->fadvise;
	if (!fadvise_op)
		fadvise_op = generic_fadvise;

	return fadvise_op(file, offset, len, advice);
}
EXPORT_SYMBOL(vfs_fadvise);
#ifdef CONFIG_ADVISE_SYSCALLS
int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
struct fd f = fdget(fd);
int ret;
if (!f.file)
return -EBADF;
ret = vfs_fadvise(f.file, offset, len, advice);
fdput(f); fdput(f);
return ret; return ret;
} }
...@@ -203,3 +217,4 @@ SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) ...@@ -203,3 +217,4 @@ SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
} }
#endif #endif
#endif
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <linux/file.h> #include <linux/file.h>
#include <linux/mm_inline.h> #include <linux/mm_inline.h>
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include "internal.h" #include "internal.h"
...@@ -575,24 +576,6 @@ page_cache_async_readahead(struct address_space *mapping, ...@@ -575,24 +576,6 @@ page_cache_async_readahead(struct address_space *mapping,
} }
EXPORT_SYMBOL_GPL(page_cache_async_readahead); EXPORT_SYMBOL_GPL(page_cache_async_readahead);
static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
pgoff_t index, unsigned long nr)
{
if (!mapping || !mapping->a_ops)
return -EINVAL;
/*
* Readahead doesn't make sense for DAX inodes, but we don't want it
* to report a failure either. Instead, we just return success and
* don't do any work.
*/
if (dax_mapping(mapping))
return 0;
return force_page_cache_readahead(mapping, filp, index, nr);
}
ssize_t ksys_readahead(int fd, loff_t offset, size_t count) ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{ {
ssize_t ret; ssize_t ret;
...@@ -600,16 +583,22 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) ...@@ -600,16 +583,22 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
ret = -EBADF; ret = -EBADF;
f = fdget(fd); f = fdget(fd);
if (f.file) { if (!f.file || !(f.file->f_mode & FMODE_READ))
if (f.file->f_mode & FMODE_READ) { goto out;
struct address_space *mapping = f.file->f_mapping;
pgoff_t start = offset >> PAGE_SHIFT; /*
pgoff_t end = (offset + count - 1) >> PAGE_SHIFT; * The readahead() syscall is intended to run only on files
unsigned long len = end - start + 1; * that can execute readahead. If readahead is not possible
ret = do_readahead(mapping, f.file, start, len); * on this file, then we must return -EINVAL.
} */
fdput(f); ret = -EINVAL;
} if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
!S_ISREG(file_inode(f.file)->i_mode))
goto out;
ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
out:
fdput(f);
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment