Commit d4135134 authored by Filipe Manana, committed by David Sterba

btrfs: avoid blocking on space reservation when doing nowait dio writes

When doing a NOWAIT direct IO write, if we can NOCOW then we can proceed
with the non-blocking, NOWAIT path. However, reserving the metadata space
and the qgroup meta space can often result in blocking: flushing delalloc,
waiting for ordered extents to complete, triggering transaction commits,
etc., which goes against the semantics of a NOWAIT write.

So make the NOWAIT write path try to reserve all the metadata it needs
without blocking: if we get -ENOSPC or -EDQUOT, return -EAGAIN so that the
caller falls back to a blocking direct IO write.
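
To illustrate the intent, here is a condensed sketch of the reservation
logic on the NOCOW branch of the direct IO write path after this change
(simplified from the diff below; the wrapper function is hypothetical and
only for illustration, while btrfs_delalloc_reserve_metadata() and the
error codes are the real ones):

  /*
   * Sketch only: hypothetical wrapper showing how the nowait flag is
   * plumbed through and how reservation failures get mapped to -EAGAIN.
   */
  static int reserve_meta_nowait_sketch(struct btrfs_inode *inode, u64 len,
                                        bool nowait)
  {
          int ret;

          /* noflush == nowait: no delalloc flushing, no waiting on ordered extents. */
          ret = btrfs_delalloc_reserve_metadata(inode, len, len, nowait);
          if (ret < 0 && nowait && (ret == -ENOSPC || ret == -EDQUOT))
                  ret = -EAGAIN;  /* let the caller retry as a blocking write */

          return ret;
  }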

This is part of a patchset comprised of the following patches:

  btrfs: avoid blocking on page locks with nowait dio on compressed range
  btrfs: avoid blocking nowait dio when locking file range
  btrfs: avoid double nocow check when doing nowait dio writes
  btrfs: stop allocating a path when checking if cross reference exists
  btrfs: free path at can_nocow_extent() before checking for checksum items
  btrfs: release path earlier at can_nocow_extent()
  btrfs: avoid blocking when allocating context for nowait dio read/write
  btrfs: avoid blocking on space reservation when doing nowait dio writes

The following test was run before and after applying this patchset:

  $ cat io-uring-nodatacow-test.sh
  #!/bin/bash

  DEV=/dev/sdc
  MNT=/mnt/sdc

  MOUNT_OPTIONS="-o ssd -o nodatacow"
  MKFS_OPTIONS="-R free-space-tree -O no-holes"

  NUM_JOBS=4
  FILE_SIZE=8G
  RUN_TIME=300

  cat <<EOF > /tmp/fio-job.ini
  [io_uring_rw]
  rw=randrw
  fsync=0
  fallocate=posix
  group_reporting=1
  direct=1
  ioengine=io_uring
  iodepth=64
  bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
  filesize=$FILE_SIZE
  runtime=$RUN_TIME
  time_based
  filename=foobar
  directory=$MNT
  numjobs=$NUM_JOBS
  thread
  EOF

  echo performance | \
     tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  umount $MNT &> /dev/null
  mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
  mount $MOUNT_OPTIONS $DEV $MNT

  fio /tmp/fio-job.ini

  umount $MNT

The test was run on a 12-core box with 64G of RAM, using a non-debug kernel
config (Debian's default config) and a spinning disk.

Result before the patchset:

 READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec

Result after the patchset:

 READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec
WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec

That's about +7.2% throughput for reads and +6.9% for writes.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 4f208dcc

@@ -2893,7 +2893,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
-				    u64 disk_num_bytes);
+				    u64 disk_num_bytes, bool noflush);
 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
 				   u64 start, u64 end);

@@ -289,7 +289,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
 }
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
-				    u64 disk_num_bytes)
+				    u64 disk_num_bytes, bool noflush)
 {
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -308,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
 	 * If we have a transaction open (can happen if we call truncate_block
 	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
 	 */
-	if (btrfs_is_free_space_inode(inode)) {
+	if (noflush || btrfs_is_free_space_inode(inode)) {
 		flush = BTRFS_RESERVE_NO_FLUSH;
 	} else {
 		if (current->journal_info)
@@ -333,7 +333,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
 	 */
 	calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
 				&meta_reserve, &qgroup_reserve);
-	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
+	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
+						 noflush);
 	if (ret)
 		return ret;
 	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
@@ -456,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
 	ret = btrfs_check_data_free_space(inode, reserved, start, len);
 	if (ret < 0)
 		return ret;
-	ret = btrfs_delalloc_reserve_metadata(inode, len, len);
+	ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
 	if (ret < 0) {
 		btrfs_free_reserved_data_space(inode, *reserved, start, len);
 		extent_changeset_free(*reserved);

@@ -1684,7 +1684,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 		WARN_ON(reserve_bytes == 0);
 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
 						      reserve_bytes,
-						      reserve_bytes);
+						      reserve_bytes, false);
 		if (ret) {
 			if (!only_release_metadata)
 				btrfs_free_reserved_data_space(BTRFS_I(inode),

@@ -4705,7 +4705,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
 			goto out;
 		}
 	}
-	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
+	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
 	if (ret < 0) {
 		if (!only_release_metadata)
 			btrfs_free_reserved_data_space(inode, data_reserved,
@@ -7415,6 +7415,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 					 u64 start, u64 len,
 					 unsigned int iomap_flags)
 {
+	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_map *em = *map;
 	int type;
@@ -7454,12 +7455,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 		struct extent_map *em2;
 
 		/* We can NOCOW, so only need to reserve metadata space. */
-		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
+		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+						      nowait);
 		if (ret < 0) {
 			/* Our caller expects us to free the input extent map. */
 			free_extent_map(em);
 			*map = NULL;
 			btrfs_dec_nocow_writers(fs_info, block_start);
+			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+				ret = -EAGAIN;
 			goto out;
 		}
 		space_reserved = true;
@@ -7483,7 +7487,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 		free_extent_map(em);
 		*map = NULL;
 
-		if (iomap_flags & IOMAP_NOWAIT)
+		if (nowait)
 			return -EAGAIN;
 
 		/* We have to COW, so need to reserve metadata and data space. */
@@ -10801,7 +10805,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
 	if (ret)
 		goto out_free_data_space;
 
-	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
+	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
+					      false);
 	if (ret)
 		goto out_qgroup_free_data;

@@ -3939,12 +3939,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
 }
 
 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
-				enum btrfs_qgroup_rsv_type type, bool enforce)
+				enum btrfs_qgroup_rsv_type type, bool enforce,
+				bool noflush)
 {
 	int ret;
 
 	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
-	if (ret <= 0 && ret != -EDQUOT)
+	if ((ret <= 0 && ret != -EDQUOT) || noflush)
 		return ret;
 
 	ret = try_flush_qgroup(root);

@@ -364,19 +364,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
 			      enum btrfs_qgroup_rsv_type type, bool enforce);
 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
-				enum btrfs_qgroup_rsv_type type, bool enforce);
+				enum btrfs_qgroup_rsv_type type, bool enforce,
+				bool noflush);
 /* Reserve metadata space for pertrans and prealloc type */
 static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
 						     int num_bytes, bool enforce)
 {
 	return __btrfs_qgroup_reserve_meta(root, num_bytes,
-					   BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
+					   BTRFS_QGROUP_RSV_META_PERTRANS,
+					   enforce, false);
 }
 
 static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
-						     int num_bytes, bool enforce)
+						     int num_bytes, bool enforce,
+						     bool noflush)
 {
 	return __btrfs_qgroup_reserve_meta(root, num_bytes,
-					   BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
+					   BTRFS_QGROUP_RSV_META_PREALLOC,
+					   enforce, noflush);
 }
 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,

@@ -2997,7 +2997,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
 		/* Reserve metadata for this range */
 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
-						      clamped_len, clamped_len);
+						      clamped_len, clamped_len,
+						      false);
 		if (ret)
 			goto release_page;

@@ -509,7 +509,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 		/* One for parent inode, two for dir entries */
 		qgroup_num_bytes = 3 * fs_info->nodesize;
 		ret = btrfs_qgroup_reserve_meta_prealloc(root,
-							 qgroup_num_bytes, true);
+							 qgroup_num_bytes, true,
+							 false);
 		if (ret)
 			return ret;
 	}