Commit 70499656 authored by Linus Torvalds

Merge tag 'for-4.18-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "User visible features:

   - added support for the ioctl FS_IOC_FSGETXATTR, per-inode flags,
     successor of GET/SETFLAGS; now supports only existing flags:
     append, immutable, noatime, nodump, sync

   - 3 new unprivileged ioctls to allow users to enumerate subvolumes

   - dedupe syscall implementation does not restrict the range to 16MiB,
     though it still splits the whole range into 16MiB chunks

   - on user demand, rmdir() is able to delete an empty subvolume,
     export the capability in sysfs

   - fix inode number types in tracepoints, other cleanups

   - send: improved speed when dealing with a large removed directory,
     measurements show decrease from 2000 minutes to 2 minutes on a
     directory with 2 million entries

   - pre-commit check of superblock to detect a mysterious in-memory
     corruption

   - log message updates

  Other changes:

   - orphan inode cleanup improved, does not keep long-standing
     reservations that could lead up to early ENOSPC in some cases

   - slight improvement of handling snapshotted NOCOW files by avoiding
     some unnecessary tree searches

   - avoid OOM when dealing with many unmergeable small extents at flush
     time

   - speedup conversion of free space tree representations from/to
     bitmap/tree

   - code refactoring, deletion, cleanups:
      + delayed refs
      + delayed iput
      + redundant argument removals
      + memory barrier cleanups
      + remove a redundant mutex that supposedly prevented several ioctls
        from running in parallel

   - new tracepoints for blockgroup manipulation

   - more sanity checks of compressed headers"

* tag 'for-4.18-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (183 commits)
  btrfs: Add unprivileged version of ino_lookup ioctl
  btrfs: Add unprivileged ioctl which returns subvolume's ROOT_REF
  btrfs: Add unprivileged ioctl which returns subvolume information
  Btrfs: clean up error handling in btrfs_truncate()
  btrfs: Factor out write portion of btrfs_get_blocks_direct
  btrfs: Factor out read portion of btrfs_get_blocks_direct
  btrfs: return ENOMEM if path allocation fails in btrfs_cross_ref_exist
  btrfs: raid56: Remove VLA usage
  btrfs: return error value if create_io_em failed in cow_file_range
  btrfs: drop useless member qgroup_reserved of btrfs_pending_snapshot
  btrfs: drop unused parameter qgroup_reserved
  btrfs: balance dirty metadata pages in btrfs_finish_ordered_io
  btrfs: lift some btrfs_cross_ref_exist checks in nocow path
  btrfs: Remove fs_info argument from btrfs_uuid_tree_rem
  btrfs: Remove fs_info argument from btrfs_uuid_tree_add
  Btrfs: remove unused check of skip_locking
  Btrfs: remove always true check in unlock_up
  Btrfs: grab write lock directly if write_lock_level is the max level
  Btrfs: move get root out of btrfs_search_slot to a helper
  Btrfs: use more straightforward extent_buffer_uptodate check
  ...
parents e3a44fd7 23d0b79d
......@@ -19,17 +19,17 @@
* ordered operations list so that we make sure to flush out any
* new data the application may have written before commit.
*/
#define BTRFS_INODE_ORDERED_DATA_CLOSE 0
#define BTRFS_INODE_ORPHAN_META_RESERVED 1
#define BTRFS_INODE_DUMMY 2
#define BTRFS_INODE_IN_DEFRAG 3
#define BTRFS_INODE_HAS_ORPHAN_ITEM 4
#define BTRFS_INODE_HAS_ASYNC_EXTENT 5
#define BTRFS_INODE_NEEDS_FULL_SYNC 6
#define BTRFS_INODE_COPY_EVERYTHING 7
#define BTRFS_INODE_IN_DELALLOC_LIST 8
#define BTRFS_INODE_READDIO_NEED_LOCK 9
#define BTRFS_INODE_HAS_PROPS 10
/*
 * Inode flag identifiers.
 *
 * Only the first entry has an explicit value (0); every following entry
 * takes the next consecutive integer from the enum's implicit numbering.
 * Inserting or removing an entry therefore renumbers all entries after it.
 *
 * NOTE(review): presumably these are bit numbers for a per-inode flags
 * word; the users are not visible here - confirm.
 */
enum {
BTRFS_INODE_ORDERED_DATA_CLOSE = 0,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
};
/* in memory btrfs inode */
struct btrfs_inode {
......
......@@ -990,12 +990,7 @@ static void __free_workspace(int type, struct list_head *workspace,
btrfs_compress_op[idx]->free_workspace(workspace);
atomic_dec(total_ws);
wake:
/*
* Make sure counter is updated before we wake up waiters.
*/
smp_mb();
if (waitqueue_active(ws_wait))
wake_up(ws_wait);
cond_wake_up(ws_wait);
}
static void free_workspace(int type, struct list_head *ws)
......
......@@ -6,6 +6,8 @@
#ifndef BTRFS_COMPRESSION_H
#define BTRFS_COMPRESSION_H
#include <linux/sizes.h>
/*
* We want to make sure that amount of RAM required to uncompress an extent is
* reasonable, so we limit the total size in ram of a compressed extent to
......
......@@ -2330,7 +2330,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
no_skips = 1;
t = path->nodes[i];
if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
if (i >= lowest_unlock && i > skip_level) {
btrfs_tree_unlock_rw(t, path->locks[i]);
path->locks[i] = 0;
if (write_lock_level &&
......@@ -2432,7 +2432,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_unlock_up_safe(p, level + 1);
btrfs_set_path_blocking(p);
free_extent_buffer(tmp);
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
......@@ -2446,7 +2445,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
if (!btrfs_buffer_uptodate(tmp, 0, 0))
if (!extent_buffer_uptodate(tmp))
ret = -EIO;
free_extent_buffer(tmp);
} else {
......@@ -2599,6 +2598,78 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
return 0;
}
/*
 * Pick the root extent buffer a tree search starts from and record it in
 * the path.
 *
 * @root:             tree whose root node (or commit root) is wanted
 * @p:                search path; search_commit_root, need_commit_sem and
 *                    skip_locking select the behaviour below
 * @write_lock_level: levels at or below this value must end up write locked
 *
 * Three cases are handled:
 *  - p->search_commit_root: use root->commit_root with an extra reference
 *    and take no tree lock at all;
 *  - p->skip_locking: use btrfs_root_node() and take no tree lock;
 *  - otherwise: read lock the root, and keep that lock only if the root's
 *    level is above @write_lock_level; if not, drop it and take the write
 *    lock instead.
 *
 * On return p->nodes[level] holds the buffer and, unless p->skip_locking is
 * set, p->locks[level] holds the lock type that was taken.
 *
 * Returns the root extent buffer.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
struct btrfs_path *p,
int write_lock_level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int root_lock;
int level = 0;
/* We try very hard to do read locks on the root */
root_lock = BTRFS_READ_LOCK;
if (p->search_commit_root) {
/*
 * The commit roots are read only. No tree lock is taken in this
 * branch; only commit_root_sem is held (when requested) while the
 * commit root pointer is sampled, referenced and its level read.
 */
if (p->need_commit_sem)
down_read(&fs_info->commit_root_sem);
b = root->commit_root;
extent_buffer_get(b);
level = btrfs_header_level(b);
if (p->need_commit_sem)
up_read(&fs_info->commit_root_sem);
/*
 * Ensure that all callers have set skip_locking when
 * p->search_commit_root = 1. The out: path below records
 * root_lock in p->locks[] only when skip_locking is clear, so
 * this is what keeps an unheld read lock from being recorded.
 */
ASSERT(p->skip_locking == 1);
goto out;
}
if (p->skip_locking) {
/* Unlocked search: just fetch the current root node. */
b = btrfs_root_node(root);
level = btrfs_header_level(b);
goto out;
}
/*
 * If the level is set to maximum, we can skip trying to get the read
 * lock.
 */
if (write_lock_level < BTRFS_MAX_LEVEL) {
/*
 * We don't know the level of the root node until we actually
 * have it read locked
 */
b = btrfs_read_lock_root_node(root);
level = btrfs_header_level(b);
/* Root sits above the write-locked levels: the read lock is enough. */
if (level > write_lock_level)
goto out;
/* Whoops, must trade for write lock */
btrfs_tree_read_unlock(b);
free_extent_buffer(b);
}
b = btrfs_lock_root_node(root);
root_lock = BTRFS_WRITE_LOCK;
/* The level might have changed, check again */
level = btrfs_header_level(b);
out:
/* Publish the buffer, and the lock type if one was taken, in the path. */
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
/*
 * Callers are responsible for dropping b's references.
 */
return b;
}
/*
* btrfs_search_slot - look for a key in a tree and perform necessary
* modifications to preserve tree invariants.
......@@ -2635,7 +2706,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int err;
int level;
int lowest_unlock = 1;
int root_lock;
/* everything at write_lock_level or lower must be write locked */
int write_lock_level = 0;
u8 lowest_level = 0;
......@@ -2673,50 +2743,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
again:
prev_cmp = -1;
/*
* we try very hard to do read locks on the root
*/
root_lock = BTRFS_READ_LOCK;
level = 0;
if (p->search_commit_root) {
/*
* the commit roots are read only
* so we always do read locks
*/
if (p->need_commit_sem)
down_read(&fs_info->commit_root_sem);
b = root->commit_root;
extent_buffer_get(b);
level = btrfs_header_level(b);
if (p->need_commit_sem)
up_read(&fs_info->commit_root_sem);
if (!p->skip_locking)
btrfs_tree_read_lock(b);
} else {
if (p->skip_locking) {
b = btrfs_root_node(root);
level = btrfs_header_level(b);
} else {
/* we don't know the level of the root node
* until we actually have it read locked
*/
b = btrfs_read_lock_root_node(root);
level = btrfs_header_level(b);
if (level <= write_lock_level) {
/* whoops, must trade for write lock */
btrfs_tree_read_unlock(b);
free_extent_buffer(b);
b = btrfs_lock_root_node(root);
root_lock = BTRFS_WRITE_LOCK;
/* the level might have changed, check again */
level = btrfs_header_level(b);
}
}
}
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
while (b) {
level = btrfs_header_level(b);
......
......@@ -739,6 +739,12 @@ struct btrfs_delayed_root;
*/
#define BTRFS_FS_NEED_ASYNC_COMMIT 17
/*
* Indicate that balance has been set up from the ioctl and is in the main
* phase. The fs_info::balance_ctl is initialized.
*/
#define BTRFS_FS_BALANCE_RUNNING 18
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
......@@ -838,7 +844,6 @@ struct btrfs_fs_info {
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
struct mutex volume_mutex;
/*
* this is taken to make sure we don't set block groups ro after
......@@ -1004,7 +1009,6 @@ struct btrfs_fs_info {
/* restriper state */
spinlock_t balance_lock;
struct mutex balance_mutex;
atomic_t balance_running;
atomic_t balance_pause_req;
atomic_t balance_cancel_req;
struct btrfs_balance_control *balance_ctl;
......@@ -1219,9 +1223,6 @@ struct btrfs_root {
spinlock_t log_extents_lock[2];
struct list_head logged_list[2];
spinlock_t orphan_lock;
atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_cleanup_state;
spinlock_t inode_lock;
......@@ -2764,13 +2765,9 @@ void btrfs_delalloc_release_space(struct inode *inode,
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
void btrfs_orphan_release_metadata(struct btrfs_inode *inode);
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems,
u64 *qgroup_reserved, bool use_global_rsv);
int nitems, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
......@@ -2828,7 +2825,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
void check_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, const u64 type);
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
struct btrfs_fs_info *info, u64 start, u64 end);
u64 start, u64 end);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
......@@ -3042,11 +3039,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
/* uuid-tree.c */
int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
......@@ -3163,18 +3158,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
struct extent_map *em);
/* inode.c */
struct btrfs_delalloc_work {
struct inode *inode;
int delay_iput;
struct completion completion;
struct list_head list;
struct btrfs_work work;
};
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
int delay_iput);
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
struct page *page, size_t pg_offset, u64 start,
u64 len, int create);
......@@ -3193,10 +3176,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
const char *name, int name_len, int add_backref, u64 index);
int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir, u64 objectid,
const char *name, int name_len);
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
int front);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
......@@ -3204,9 +3184,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct inode *inode, u64 new_size,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
int nr);
int btrfs_start_delalloc_inodes(struct btrfs_root *root);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state, int dedupe);
......@@ -3240,10 +3219,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
void btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
......@@ -3262,14 +3238,14 @@ void btrfs_test_inode_set_ops(struct inode *inode);
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_update_iflags(struct inode *inode);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int btrfs_is_empty_uuid(u8 *uuid);
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
struct file *dst_file, u64 dst_loff);
......@@ -3767,4 +3743,26 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
return 0;
}
/*
 * Wake up the waiters on @wq, but only if somebody is actually queued.
 */
static inline void cond_wake_up(struct wait_queue_head *wq)
{
	/*
	 * wq_has_sleeper() implies a full smp_mb barrier; the comments for
	 * waitqueue_active explain why one is needed before the check.
	 */
	if (!wq_has_sleeper(wq))
		return;

	wake_up(wq);
}
/*
 * Conditional wakeup without an explicit barrier.
 *
 * For callers whose preceding code already implies the barrier that
 * waitqueue_active requires, e.g. an atomic operation such as
 * atomic_dec_return, or an unlock/lock sequence.
 */
static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
{
	/* Nobody queued: skip the wakeup. */
	if (!waitqueue_active(wq))
		return;

	wake_up(wq);
}
#endif
......@@ -460,13 +460,10 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
/*
* atomic_dec_return implies a barrier for waitqueue_active
*/
/* atomic_dec_return implies a barrier */
if ((atomic_dec_return(&delayed_root->items) <
BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
waitqueue_active(&delayed_root->wait))
wake_up(&delayed_root->wait);
BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0))
cond_wake_up_nomb(&delayed_root->wait);
}
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
......
This diff is collapsed.
......@@ -251,7 +251,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
......@@ -269,9 +268,7 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
struct btrfs_delayed_ref_head *
btrfs_select_ref_head(struct btrfs_trans_handle *trans);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
u64 seq);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
/*
* helper functions to cast a node into its container
......
......@@ -33,8 +33,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
......@@ -178,6 +176,105 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
return ret;
}
/*
 * Initialize a new device for device replace target from a given source dev
 * and path.
 *
 * @fs_info:     filesystem the target device is added to
 * @device_path: path of the block device to open as the replace target
 * @srcdev:      device being replaced; its size and usage counters are
 *               copied into the new device
 * @device_out:  set to NULL on entry and to the new device on success
 *
 * The target is rejected with -EINVAL on a seed filesystem or when it is
 * smaller than @srcdev, and with -EEXIST when its block device already
 * belongs to the filesystem's device list. Allocation failures return the
 * error from btrfs_alloc_device() or -ENOMEM.
 *
 * Return 0 and new device in @device_out, otherwise return < 0
 */
static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
const char *device_path,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
struct btrfs_device *device;
struct block_device *bdev;
struct list_head *devices;
struct rcu_string *name;
/* The target gets the fixed replace devid rather than a fresh one. */
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
if (fs_info->fs_devices->seeding) {
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
}
/* Open for writing, exclusively, with fs_info->bdev_holder as holder. */
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
if (IS_ERR(bdev)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev);
}
/* NOTE(review): the result of the write-out is not checked. */
filemap_write_and_wait(bdev->bd_inode->i_mapping);
/*
 * Refuse a target that is already one of the filesystem's devices.
 * NOTE(review): the list is walked here before device_list_mutex is
 * taken further down; presumably callers serialize device changes -
 * confirm.
 */
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
ret = -EEXIST;
goto error;
}
}
/* The target must be able to hold everything the source can. */
if (i_size_read(bdev->bd_inode) <
btrfs_device_get_total_bytes(srcdev)) {
btrfs_err(fs_info,
"target device is smaller than source device!");
ret = -EINVAL;
goto error;
}
device = btrfs_alloc_device(NULL, &devid, NULL);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
/* The device is not on any list yet, so free it directly. */
btrfs_free_device(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
/*
 * Fill in the device and link it into the device list with
 * device_list_mutex held.
 */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
/* Size and usage counters mirror the source device. */
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
device->commit_total_bytes = srcdev->commit_total_bytes;
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
/*
 * NOTE(review): the recorded mode is FMODE_EXCL while the device was
 * opened with FMODE_WRITE | FMODE_EXCL - confirm this is intended.
 */
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_info->fs_devices;
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
*device_out = device;
return 0;
error:
/* Every failure after the open lands here to release the block device. */
blkdev_put(bdev, FMODE_EXCL);
return ret;
}
/*
* called from commit_transaction. Writes changed device replace state to
* disk.
......@@ -317,18 +414,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
/* the disk copy procedure reuses the scrub code */
mutex_lock(&fs_info->volume_mutex);
ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
srcdev_name, &src_device);
if (ret) {
mutex_unlock(&fs_info->volume_mutex);
if (ret)
return ret;
}
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
mutex_unlock(&fs_info->volume_mutex);
if (ret)
return ret;
......@@ -360,7 +452,6 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->cont_reading_from_srcdev_mode = read_src;
WARN_ON(!src_device);
dev_replace->srcdev = src_device;
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
btrfs_info_in_rcu(fs_info,
......@@ -503,7 +594,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
ret = btrfs_start_delalloc_roots(fs_info, -1);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
......@@ -518,7 +609,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
mutex_lock(&uuid_mutex);
/* keep away write_all_supers() during the finishing procedure */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
......@@ -545,7 +635,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_replace_write_unlock(dev_replace);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
......@@ -596,7 +685,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
*/
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
......@@ -800,7 +888,17 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
}
btrfs_dev_replace_write_unlock(dev_replace);
WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
/*
* This could collide with a paused balance, but the exclusive op logic
* should never allow both to start and pause. We don't want to allow
* dev-replace to start anyway.
*/
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
btrfs_info(fs_info,
"cannot resume dev-replace, other exclusive operation running");
return 0;
}
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
return PTR_ERR_OR_ZERO(task);
}
......@@ -810,6 +908,7 @@ static int btrfs_dev_replace_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
u64 progress;
int ret;
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
......@@ -820,23 +919,14 @@ static int btrfs_dev_replace_kthread(void *data)
btrfs_dev_name(dev_replace->tgtdev),
(unsigned int)progress);
btrfs_dev_replace_continue_on_mount(fs_info);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return 0;
}
static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int ret;
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
dev_replace->committed_cursor_left,
btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return 0;
}
......@@ -916,9 +1006,9 @@ void btrfs_dev_replace_clear_lock_blocking(
ASSERT(atomic_read(&dev_replace->read_locks) > 0);
ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
read_lock(&dev_replace->lock);
if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
waitqueue_active(&dev_replace->read_lock_wq))
wake_up(&dev_replace->read_lock_wq);
/* Barrier implied by atomic_dec_and_test */
if (atomic_dec_and_test(&dev_replace->blocking_readers))
cond_wake_up_nomb(&dev_replace->read_lock_wq);
}
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
......@@ -929,9 +1019,7 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
percpu_counter_sub(&fs_info->bio_counter, amount);
if (waitqueue_active(&fs_info->replace_wait))
wake_up(&fs_info->replace_wait);
cond_wake_up_nomb(&fs_info->replace_wait);
}
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
......
This diff is collapsed.
This diff is collapsed.
......@@ -4106,14 +4106,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
return ret;
}
int extent_writepages(struct extent_io_tree *tree,
struct address_space *mapping,
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
int ret = 0;
struct extent_page_data epd = {
.bio = NULL,
.tree = tree,
.tree = &BTRFS_I(mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
......@@ -4123,9 +4122,8 @@ int extent_writepages(struct extent_io_tree *tree,
return ret;
}
int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
int extent_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages)
{
struct bio *bio = NULL;
unsigned page_idx;
......@@ -4133,6 +4131,7 @@ int extent_readpages(struct extent_io_tree *tree,
struct page *pagepool[16];
struct page *page;
struct extent_map *em_cached = NULL;
struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
int nr = 0;
u64 prev_em_start = (u64)-1;
......@@ -4199,8 +4198,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
static int try_release_extent_state(struct extent_map_tree *map,
struct extent_io_tree *tree,
static int try_release_extent_state(struct extent_io_tree *tree,
struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
......@@ -4235,13 +4233,13 @@ static int try_release_extent_state(struct extent_map_tree *map,
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask)
int try_release_extent_mapping(struct page *page, gfp_t mask)
{
struct extent_map *em;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
struct extent_map_tree *map = &BTRFS_I(page->mapping->host)->extent_tree;
if (gfpflags_allow_blocking(mask) &&
page->mapping->host->i_size > SZ_16M) {
......@@ -4275,7 +4273,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
free_extent_map(em);
}
}
return try_release_extent_state(map, tree, page, mask);
return try_release_extent_state(tree, page, mask);
}
/*
......@@ -5617,46 +5615,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
}
}
/*
 * Set @len consecutive bits, beginning at bit number @start, in the bitmap
 * stored in the byte array @map.
 */
void le_bitmap_set(u8 *map, unsigned int start, int len)
{
	const unsigned int end = start + len;
	u8 *byte = map + BIT_BYTE(start);
	/* Number of bits from @start to the end of the byte it falls in. */
	int chunk = BITS_PER_BYTE - (start % BITS_PER_BYTE);
	u8 mask = BITMAP_FIRST_BYTE_MASK(start);

	/* Handle each byte in which the range runs to the byte's last bit. */
	for (; len - chunk >= 0; byte++) {
		*byte |= mask;
		len -= chunk;
		/* Bytes after the first are taken whole. */
		chunk = BITS_PER_BYTE;
		mask = ~0;
	}

	/* Remaining bits end inside this byte: limit the mask to the range. */
	if (len != 0)
		*byte |= mask & BITMAP_LAST_BYTE_MASK(end);
}
/*
 * Clear @len consecutive bits, beginning at bit number @start, in the
 * bitmap stored in the byte array @map.
 */
void le_bitmap_clear(u8 *map, unsigned int start, int len)
{
	const unsigned int end = start + len;
	u8 *byte = map + BIT_BYTE(start);
	/* Number of bits from @start to the end of the byte it falls in. */
	int chunk = BITS_PER_BYTE - (start % BITS_PER_BYTE);
	u8 mask = BITMAP_FIRST_BYTE_MASK(start);

	/* Handle each byte in which the range runs to the byte's last bit. */
	for (; len - chunk >= 0; byte++) {
		*byte &= ~mask;
		len -= chunk;
		/* Bytes after the first are taken whole. */
		chunk = BITS_PER_BYTE;
		mask = ~0;
	}

	/* Remaining bits end inside this byte: limit the mask to the range. */
	if (len != 0) {
		const u8 tail = mask & BITMAP_LAST_BYTE_MASK(end);

		*byte &= ~tail;
	}
}
/*
* eb_bitmap_offset() - calculate the page and offset of the byte containing the
* given bit number
......
......@@ -79,14 +79,6 @@
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
/* Return 1 if bit number @nr is set in the byte array @addr, 0 otherwise. */
static inline int le_test_bit(int nr, const u8 *addr)
{
	const u8 byte = addr[BIT_BYTE(nr)];
	const int shift = nr & (BITS_PER_BYTE - 1);

	return (byte >> shift) & 1U;
}
void le_bitmap_set(u8 *map, unsigned int start, int len);
void le_bitmap_clear(u8 *map, unsigned int start, int len);
struct extent_state;
struct btrfs_root;
struct btrfs_inode;
......@@ -278,9 +270,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int create);
void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached);
......@@ -421,14 +411,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
int extent_writepages(struct extent_io_tree *tree,
struct address_space *mapping,
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int extent_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
void set_page_extent_mapped(struct page *page);
......
......@@ -518,6 +518,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
/**
* btrfs_add_extent_mapping - add extent mapping into em_tree
* @fs_info - used for tracepoint
* @em_tree - the extent tree into which we want to insert the extent mapping
* @em_in - extent we are inserting
* @start - start of the logical range btrfs_get_extent() is requesting
......@@ -535,7 +536,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Return 0 on success, otherwise -EEXIST.
*
*/
int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len)
{
int ret;
......@@ -553,7 +555,7 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
existing = search_extent_mapping(em_tree, start, len);
trace_btrfs_handle_em_exist(existing, em, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
/*
* existing will always be non-NULL, since there must be
......
......@@ -92,7 +92,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len);
#endif
......@@ -253,10 +253,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
truncate_pagecache(inode, 0);
/*
* We don't need an orphan item because truncating the free space cache
* will never be split across transactions.
* We don't need to check for -EAGAIN because we're a free space
* cache inode
* We skip the throttling logic for free space cache inodes, so we don't
* need to check for -EAGAIN.
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
......
This diff is collapsed.
......@@ -19,16 +19,12 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
int add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
u64 start, u64 size);
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
u64 start, u64 size);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
......@@ -38,19 +34,15 @@ search_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, int cow);
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size);
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size);
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
int free_space_test_bit(struct btrfs_block_group_cache *block_group,
......
This diff is collapsed.
This diff is collapsed.
......@@ -66,22 +66,16 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
write_lock(&eb->lock);
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
/*
* atomic_dec_and_test implies a barrier for waitqueue_active
*/
if (atomic_dec_and_test(&eb->blocking_writers) &&
waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
/* atomic_dec_and_test implies a barrier */
if (atomic_dec_and_test(&eb->blocking_writers))
cond_wake_up_nomb(&eb->write_lock_wq);
} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_readers) == 0);
read_lock(&eb->lock);
atomic_inc(&eb->spinning_readers);
/*
* atomic_dec_and_test implies a barrier for waitqueue_active
*/
if (atomic_dec_and_test(&eb->blocking_readers) &&
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
/* atomic_dec_and_test implies a barrier */
if (atomic_dec_and_test(&eb->blocking_readers))
cond_wake_up_nomb(&eb->read_lock_wq);
}
}
......@@ -221,12 +215,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
/*
* atomic_dec_and_test implies a barrier for waitqueue_active
*/
if (atomic_dec_and_test(&eb->blocking_readers) &&
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
/* atomic_dec_and_test implies a barrier */
if (atomic_dec_and_test(&eb->blocking_readers))
cond_wake_up_nomb(&eb->read_lock_wq);
atomic_dec(&eb->read_locks);
}
......@@ -275,12 +266,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_dec(&eb->blocking_writers);
/*
* Make sure counter is updated before we wake up waiters.
*/
/* Use the lighter barrier after atomic */
smp_mb__after_atomic();
if (waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
cond_wake_up_nomb(&eb->write_lock_wq);
} else {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
atomic_dec(&eb->spinning_writers);
......
......@@ -17,6 +17,43 @@
#define LZO_LEN 4
/*
* Btrfs LZO compression format
*
* Regular and inlined LZO compressed data extents consist of:
*
* 1. Header
* Fixed size. LZO_LEN (4) bytes long, LE32.
* Records the total size (including the header) of compressed data.
*
* 2. Segment(s)
* Variable size. Each segment includes one segment header, followed by data
* payload.
* One regular LZO compressed extent can have one or more segments.
* For inlined LZO compressed extent, only one segment is allowed.
* One segment represents at most one page of uncompressed data.
*
* 2.1 Segment header
* Fixed size. LZO_LEN (4) bytes long, LE32.
* Records the total size of the segment (not including the header).
* Segment header never crosses page boundary, thus it's possible to
* have at most 3 padding zeros at the end of the page.
*
* 2.2 Data Payload
* Variable size. The upper size limit is lzo1x_worst_compress(PAGE_SIZE),
* which is 4419 for a 4KiB page.
*
* Example:
* Page 1:
* 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10
* 0x0000 | Header | SegHdr 01 | Data payload 01 ... |
* ...
* 0x0ff0 | SegHdr N | Data payload N ... |00|
* ^^ padding zeros
* Page 2:
* 0x1000 | SegHdr N+1| Data payload N+1 ... |
*/
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
......@@ -258,6 +295,7 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long working_bytes;
size_t in_len;
size_t out_len;
const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
unsigned long in_offset;
unsigned long in_page_bytes_left;
unsigned long tot_in;
......@@ -271,10 +309,22 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
data_in = kmap(pages_in[0]);
tot_len = read_compress_length(data_in);
/*
* Compressed data header check.
*
* The real compressed size can't exceed the maximum extent length, and
* all pages should be used (whole unused page with just the segment
* header is not possible). If this happens it means the compressed
* extent is corrupted.
*/
if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) ||
tot_len < srclen - PAGE_SIZE) {
ret = -EUCLEAN;
goto done;
}
tot_in = LZO_LEN;
in_offset = LZO_LEN;
tot_len = min_t(size_t, srclen, tot_len);
in_page_bytes_left = PAGE_SIZE - LZO_LEN;
tot_out = 0;
......@@ -285,6 +335,17 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
in_offset += LZO_LEN;
tot_in += LZO_LEN;
/*
* Segment header check.
*
* The segment length must not exceed the maximum LZO
* compression size, nor the total compressed size.
*/
if (in_len > max_segment_len || tot_in + in_len > tot_len) {
ret = -EUCLEAN;
goto done;
}
tot_in += in_len;
working_bytes = in_len;
may_late_unmap = need_unmap = false;
......@@ -335,7 +396,7 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
}
out_len = lzo1x_worst_compress(PAGE_SIZE);
out_len = max_segment_len;
ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
&out_len);
if (need_unmap)
......@@ -369,15 +430,24 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
size_t out_len;
size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
int ret = 0;
char *kaddr;
unsigned long bytes;
BUG_ON(srclen < LZO_LEN);
if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
return -EUCLEAN;
in_len = read_compress_length(data_in);
if (in_len != srclen)
return -EUCLEAN;
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
if (in_len != srclen - LZO_LEN * 2) {
ret = -EUCLEAN;
goto out;
}
data_in += LZO_LEN;
out_len = PAGE_SIZE;
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment