Commit 7e64c86c authored by Kent Overstreet

bcachefs: Buffered write path now can avoid the inode lock

Non-append, non-extending buffered writes can now avoid taking the inode
lock.

To ensure atomicity of writes w.r.t. other writes, we lock every folio
that we'll be writing to, and if this fails we fall back to taking the
inode lock.

Extensive comments are provided as to corner cases.

Link: https://lore.kernel.org/linux-fsdevel/Zdkxfspq3urnrM6I@bombadil.infradead.org/
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 66a67c86
...@@ -250,7 +250,8 @@ ...@@ -250,7 +250,8 @@
x(BCH_ERR_nopromote, nopromote_congested) \ x(BCH_ERR_nopromote, nopromote_congested) \
x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem) x(BCH_ERR_nopromote, nopromote_enomem) \
x(0, need_inode_lock)
enum bch_errcode { enum bch_errcode {
BCH_ERR_START = 2048, BCH_ERR_START = 2048,
......
...@@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) ...@@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
static int __bch2_buffered_write(struct bch_inode_info *inode, static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping, struct address_space *mapping,
struct iov_iter *iter, struct iov_iter *iter,
loff_t pos, unsigned len) loff_t pos, unsigned len,
bool inode_locked)
{ {
struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res; struct bch2_folio_reservation res;
...@@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, ...@@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
BUG_ON(!fs.nr); BUG_ON(!fs.nr);
/*
* If we're not using the inode lock, we need to lock all the folios for
* atomicity of writes vs. other writes:
*/
if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
ret = -BCH_ERR_need_inode_lock;
goto out;
}
f = darray_first(fs); f = darray_first(fs);
if (pos != folio_pos(f) && !folio_test_uptodate(f)) { if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
ret = bch2_read_single_folio(f, mapping); ret = bch2_read_single_folio(f, mapping);
...@@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, ...@@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
end = pos + copied; end = pos + copied;
spin_lock(&inode->v.i_lock); spin_lock(&inode->v.i_lock);
if (end > inode->v.i_size) if (end > inode->v.i_size) {
BUG_ON(!inode_locked);
i_size_write(&inode->v, end); i_size_write(&inode->v, end);
}
spin_unlock(&inode->v.i_lock); spin_unlock(&inode->v.i_lock);
f_pos = pos; f_pos = pos;
...@@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ...@@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping; struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file); struct bch_inode_info *inode = file_bch_inode(file);
loff_t pos = iocb->ki_pos; loff_t pos;
ssize_t written = 0; bool inode_locked = false;
int ret = 0; ssize_t written = 0, written2 = 0, ret = 0;
/*
* We don't take the inode lock unless i_size will be changing. Folio
* locks provide exclusion with other writes, and the pagecache add lock
* provides exclusion with truncate and hole punching.
*
* There is one nasty corner case where atomicity would be broken
* without great care: when copying data from userspace to the page
* cache, we do that with faults disabled - a page fault would recurse
* back into the filesystem, taking filesystem locks again, and
* deadlock; so it's done with faults disabled, and we fault in the user
* buffer when we aren't holding locks.
*
* If we do part of the write, but we then race and pages in the
* userspace buffer have been evicted and are no longer resident, then we
* have to drop our folio locks to re-fault them in, breaking write
* atomicity.
*
* To fix this, we restart the write from the start, if we weren't
* holding the inode lock.
*
* There is another wrinkle after that; if we restart the write from the
* start, and then get an unrecoverable error, we _cannot_ claim to
* userspace that we did not write data we actually did - so we must
* track (written2) the most we ever wrote.
*/
if ((iocb->ki_flags & IOCB_APPEND) ||
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
inode_lock(&inode->v);
inode_locked = true;
}
ret = generic_write_checks(iocb, iter);
if (ret <= 0)
goto unlock;
ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
if (ret) {
if (!inode_locked) {
inode_lock(&inode->v);
inode_locked = true;
ret = file_remove_privs_flags(file, 0);
}
if (ret)
goto unlock;
}
ret = file_update_time(file);
if (ret)
goto unlock;
pos = iocb->ki_pos;
bch2_pagecache_add_get(inode); bch2_pagecache_add_get(inode);
if (!inode_locked &&
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
goto get_inode_lock;
do { do {
unsigned offset = pos & (PAGE_SIZE - 1); unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = iov_iter_count(iter); unsigned bytes = iov_iter_count(iter);
...@@ -1004,12 +1072,17 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ...@@ -1004,12 +1072,17 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
} }
} }
if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
goto get_inode_lock;
if (unlikely(fatal_signal_pending(current))) { if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR; ret = -EINTR;
break; break;
} }
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
if (ret == -BCH_ERR_need_inode_lock)
goto get_inode_lock;
if (unlikely(ret < 0)) if (unlikely(ret < 0))
break; break;
...@@ -1030,50 +1103,46 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ...@@ -1030,50 +1103,46 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
} }
pos += ret; pos += ret;
written += ret; written += ret;
written2 = max(written, written2);
if (ret != bytes && !inode_locked)
goto get_inode_lock;
ret = 0; ret = 0;
balance_dirty_pages_ratelimited(mapping); balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(iter));
if (0) {
get_inode_lock:
bch2_pagecache_add_put(inode);
inode_lock(&inode->v);
inode_locked = true;
bch2_pagecache_add_get(inode);
iov_iter_revert(iter, written);
pos -= written;
written = 0;
ret = 0;
}
} while (iov_iter_count(iter));
bch2_pagecache_add_put(inode); bch2_pagecache_add_put(inode);
unlock:
if (inode_locked)
inode_unlock(&inode->v);
iocb->ki_pos += written;
return written ? written : ret; ret = max(written, written2) ?: ret;
if (ret > 0)
ret = generic_write_sync(iocb, ret);
return ret;
} }
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{ {
struct file *file = iocb->ki_filp; ssize_t ret = iocb->ki_flags & IOCB_DIRECT
struct bch_inode_info *inode = file_bch_inode(file); ? bch2_direct_write(iocb, iter)
ssize_t ret; : bch2_buffered_write(iocb, iter);
if (iocb->ki_flags & IOCB_DIRECT) {
ret = bch2_direct_write(iocb, from);
goto out;
}
inode_lock(&inode->v);
ret = generic_write_checks(iocb, from);
if (ret <= 0)
goto unlock;
ret = file_remove_privs(file);
if (ret)
goto unlock;
ret = file_update_time(file);
if (ret)
goto unlock;
ret = bch2_buffered_write(iocb, from);
if (likely(ret > 0))
iocb->ki_pos += ret;
unlock:
inode_unlock(&inode->v);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
out:
return bch2_err_class(ret); return bch2_err_class(ret);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment