Commit 97a851ed authored by Jan Kara's avatar Jan Kara Committed by Theodore Ts'o

ext4: use io_end for multiple bios

Change writeback path to create just one io_end structure for the
extent to which we submit IO and share it among bios writing that
extent. This prevents needless splitting and joining of unwritten
extents when they cannot be submitted as a single bio.

Bugs in ENOMEM handling found by Linux File System Verification project
(linuxtesting.org) and fixed by Alexey Khoroshilov
<khoroshilov@ispras.ru>.

CC: Alexey Khoroshilov <khoroshilov@ispras.ru>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
parent 8af8eecc
...@@ -209,6 +209,7 @@ typedef struct ext4_io_end { ...@@ -209,6 +209,7 @@ typedef struct ext4_io_end {
ssize_t size; /* size of the extent */ ssize_t size; /* size of the extent */
struct kiocb *iocb; /* iocb struct for AIO */ struct kiocb *iocb; /* iocb struct for AIO */
int result; /* error value for AIO */ int result; /* error value for AIO */
atomic_t count; /* reference counter */
} ext4_io_end_t; } ext4_io_end_t;
struct ext4_io_submit { struct ext4_io_submit {
...@@ -2648,11 +2649,14 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, ...@@ -2648,11 +2649,14 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */ /* page-io.c */
extern int __init ext4_init_pageio(void); extern int __init ext4_init_pageio(void);
extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void); extern void ext4_exit_pageio(void);
extern void ext4_ioend_shutdown(struct inode *); extern void ext4_ioend_shutdown(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
extern int ext4_put_io_end(ext4_io_end_t *io_end);
extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc);
extern void ext4_end_io_work(struct work_struct *work); extern void ext4_end_io_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io); extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io, extern int ext4_bio_write_page(struct ext4_io_submit *io,
......
...@@ -1465,6 +1465,8 @@ static void ext4_da_page_release_reservation(struct page *page, ...@@ -1465,6 +1465,8 @@ static void ext4_da_page_release_reservation(struct page *page,
* Delayed allocation stuff * Delayed allocation stuff
*/ */
static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd);
/* /*
* mpage_da_submit_io - walks through extent of pages and try to write * mpage_da_submit_io - walks through extent of pages and try to write
* them with writepage() call back * them with writepage() call back
...@@ -1493,7 +1495,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, ...@@ -1493,7 +1495,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
struct ext4_io_submit io_submit; struct ext4_io_submit io_submit;
BUG_ON(mpd->next_page <= mpd->first_page); BUG_ON(mpd->next_page <= mpd->first_page);
memset(&io_submit, 0, sizeof(io_submit)); ext4_io_submit_init(&io_submit, mpd->wbc);
io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_submit.io_end) {
ext4_da_block_invalidatepages(mpd);
return -ENOMEM;
}
/* /*
* We need to start from the first_page to the next_page - 1 * We need to start from the first_page to the next_page - 1
* to make sure we also write the mapped dirty buffer_heads. * to make sure we also write the mapped dirty buffer_heads.
...@@ -1581,6 +1588,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, ...@@ -1581,6 +1588,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
pagevec_release(&pvec); pagevec_release(&pvec);
} }
ext4_io_submit(&io_submit); ext4_io_submit(&io_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
return ret; return ret;
} }
...@@ -2239,9 +2248,17 @@ static int ext4_writepage(struct page *page, ...@@ -2239,9 +2248,17 @@ static int ext4_writepage(struct page *page,
*/ */
return __ext4_journalled_writepage(page, len); return __ext4_journalled_writepage(page, len);
memset(&io_submit, 0, sizeof(io_submit)); ext4_io_submit_init(&io_submit, wbc);
io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_submit.io_end) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return -ENOMEM;
}
ret = ext4_bio_write_page(&io_submit, page, len, wbc); ret = ext4_bio_write_page(&io_submit, page, len, wbc);
ext4_io_submit(&io_submit); ext4_io_submit(&io_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
return ret; return ret;
} }
...@@ -3076,9 +3093,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ...@@ -3076,9 +3093,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
struct inode *inode = file_inode(iocb->ki_filp); struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private; ext4_io_end_t *io_end = iocb->private;
/* if not async direct IO or dio with 0 bytes write, just return */ /* if not async direct IO just return */
if (!io_end || !size) if (!io_end) {
goto out; inode_dio_done(inode);
if (is_async)
aio_complete(iocb, ret, 0);
return;
}
ext_debug("ext4_end_io_dio(): io_end 0x%p " ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n", "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
...@@ -3086,25 +3107,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ...@@ -3086,25 +3107,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
size); size);
iocb->private = NULL; iocb->private = NULL;
/* if not aio dio with unwritten extents, just free io and return */
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
ext4_free_io_end(io_end);
out:
inode_dio_done(inode);
if (is_async)
aio_complete(iocb, ret, 0);
return;
}
io_end->offset = offset; io_end->offset = offset;
io_end->size = size; io_end->size = size;
if (is_async) { if (is_async) {
io_end->iocb = iocb; io_end->iocb = iocb;
io_end->result = ret; io_end->result = ret;
} }
ext4_put_io_end_defer(io_end);
ext4_add_complete_io(io_end);
} }
/* /*
...@@ -3138,6 +3147,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ...@@ -3138,6 +3147,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_t *get_block_func = NULL; get_block_t *get_block_func = NULL;
int dio_flags = 0; int dio_flags = 0;
loff_t final_size = offset + count; loff_t final_size = offset + count;
ext4_io_end_t *io_end = NULL;
/* Use the old path for reads and writes beyond i_size. */ /* Use the old path for reads and writes beyond i_size. */
if (rw != WRITE || final_size > inode->i_size) if (rw != WRITE || final_size > inode->i_size)
...@@ -3176,13 +3186,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ...@@ -3176,13 +3186,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
iocb->private = NULL; iocb->private = NULL;
ext4_inode_aio_set(inode, NULL); ext4_inode_aio_set(inode, NULL);
if (!is_sync_kiocb(iocb)) { if (!is_sync_kiocb(iocb)) {
ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_end) { if (!io_end) {
ret = -ENOMEM; ret = -ENOMEM;
goto retake_lock; goto retake_lock;
} }
io_end->flag |= EXT4_IO_END_DIRECT; io_end->flag |= EXT4_IO_END_DIRECT;
iocb->private = io_end; /*
* Grab reference for DIO. Will be dropped in ext4_end_io_dio()
*/
iocb->private = ext4_get_io_end(io_end);
/* /*
* we save the io structure for current async direct * we save the io structure for current async direct
* IO, so that later ext4_map_blocks() could flag the * IO, so that later ext4_map_blocks() could flag the
...@@ -3206,26 +3219,35 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ...@@ -3206,26 +3219,35 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL, NULL,
dio_flags); dio_flags);
if (iocb->private) /*
* Put our reference to io_end. This can free the io_end structure e.g.
* in sync IO case or in case of error. It can even perform extent
* conversion if all bios we submitted finished before we got here.
* Note that in that case iocb->private can be already set to NULL
* here.
*/
if (io_end) {
ext4_inode_aio_set(inode, NULL); ext4_inode_aio_set(inode, NULL);
ext4_put_io_end(io_end);
/* /*
* The io_end structure takes a reference to the inode, that * When no IO was submitted ext4_end_io_dio() was not
* structure needs to be destroyed and the reference to the * called so we have to put iocb's reference.
* inode need to be dropped, when IO is complete, even with 0
* byte write, or failed.
*
* In the successful AIO DIO case, the io_end structure will
* be destroyed and the reference to the inode will be dropped
* after the end_io call back function is called.
*
* In the case there is 0 byte write, or error case, since VFS
* direct IO won't invoke the end_io call back function, we
* need to free the end_io structure here.
*/ */
if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
ext4_free_io_end(iocb->private); WARN_ON(iocb->private != io_end);
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
WARN_ON(io_end->iocb);
/*
* Generic code already did inode_dio_done() so we
* have to clear EXT4_IO_END_DIRECT to not do it for
* the second time.
*/
io_end->flag = 0;
ext4_put_io_end(io_end);
iocb->private = NULL; iocb->private = NULL;
} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, }
}
if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) { EXT4_STATE_DIO_UNWRITTEN)) {
int err; int err;
/* /*
......
...@@ -62,15 +62,28 @@ void ext4_ioend_shutdown(struct inode *inode) ...@@ -62,15 +62,28 @@ void ext4_ioend_shutdown(struct inode *inode)
cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
} }
void ext4_free_io_end(ext4_io_end_t *io) static void ext4_release_io_end(ext4_io_end_t *io_end)
{ {
BUG_ON(!io); BUG_ON(!list_empty(&io_end->list));
BUG_ON(!list_empty(&io->list)); BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
wake_up_all(ext4_ioend_wq(io_end->inode));
if (io_end->flag & EXT4_IO_END_DIRECT)
inode_dio_done(io_end->inode);
if (io_end->iocb)
aio_complete(io_end->iocb, io_end->result, 0);
kmem_cache_free(io_end_cachep, io_end);
}
static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
wake_up_all(ext4_ioend_wq(io->inode)); /* Wake up anyone waiting on unwritten extent conversion */
kmem_cache_free(io_end_cachep, io); if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(inode));
} }
/* check a range of space and convert unwritten extents to written. */ /* check a range of space and convert unwritten extents to written. */
...@@ -93,13 +106,8 @@ static int ext4_end_io(ext4_io_end_t *io) ...@@ -93,13 +106,8 @@ static int ext4_end_io(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)", "(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret); inode->i_ino, offset, size, ret);
} }
/* Wake up anyone waiting on unwritten extent conversion */ ext4_clear_io_unwritten_flag(io);
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) ext4_release_io_end(io);
wake_up_all(ext4_ioend_wq(inode));
if (io->flag & EXT4_IO_END_DIRECT)
inode_dio_done(inode);
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
return ret; return ret;
} }
...@@ -130,7 +138,7 @@ static void dump_completed_IO(struct inode *inode) ...@@ -130,7 +138,7 @@ static void dump_completed_IO(struct inode *inode)
} }
/* Add the io_end to per-inode completed end_io list. */ /* Add the io_end to per-inode completed end_io list. */
void ext4_add_complete_io(ext4_io_end_t *io_end) static void ext4_add_complete_io(ext4_io_end_t *io_end)
{ {
struct ext4_inode_info *ei = EXT4_I(io_end->inode); struct ext4_inode_info *ei = EXT4_I(io_end->inode);
struct workqueue_struct *wq; struct workqueue_struct *wq;
...@@ -167,8 +175,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode) ...@@ -167,8 +175,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
err = ext4_end_io(io); err = ext4_end_io(io);
if (unlikely(!ret && err)) if (unlikely(!ret && err))
ret = err; ret = err;
io->flag &= ~EXT4_IO_END_UNWRITTEN;
ext4_free_io_end(io);
} }
return ret; return ret;
} }
...@@ -200,10 +206,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) ...@@ -200,10 +206,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
atomic_inc(&EXT4_I(inode)->i_ioend_count); atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode; io->inode = inode;
INIT_LIST_HEAD(&io->list); INIT_LIST_HEAD(&io->list);
atomic_set(&io->count, 1);
} }
return io; return io;
} }
/*
 * Drop a reference to @io_end without doing extent conversion in this
 * context. When the last reference is dropped: if the io_end covers an
 * unwritten extent with a non-zero size, it is queued on the per-inode
 * completed-IO list so a worker performs the conversion later; otherwise
 * it is released immediately via ext4_release_io_end().
 *
 * Safe to call from bio end_io (interrupt) context, since it never calls
 * ext4_convert_unwritten_extents() directly.
 */
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
	if (atomic_dec_and_test(&io_end->count)) {
		/* Nothing to convert (not unwritten, or zero-length)? Free now. */
		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
			ext4_release_io_end(io_end);
			return;
		}
		/* Defer unwritten-extent conversion to the workqueue. */
		ext4_add_complete_io(io_end);
	}
}
/*
 * Drop a reference to @io_end, performing unwritten-extent conversion
 * synchronously if this was the last reference and the io_end is flagged
 * EXT4_IO_END_UNWRITTEN. The io_end is then released.
 *
 * Must be called from process context (may block in
 * ext4_convert_unwritten_extents()); use ext4_put_io_end_defer() from
 * atomic/interrupt context instead.
 *
 * Returns 0 on success or the error from extent conversion.
 */
int ext4_put_io_end(ext4_io_end_t *io_end)
{
	int err = 0;

	if (atomic_dec_and_test(&io_end->count)) {
		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
			/* Convert the extent in this context, then clear the
			 * flag and wake any waiters on the conversion. */
			err = ext4_convert_unwritten_extents(io_end->inode,
						io_end->offset, io_end->size);
			ext4_clear_io_unwritten_flag(io_end);
		}
		ext4_release_io_end(io_end);
	}
	return err;
}
/*
 * Take an additional reference to @io_end and return it, allowing the
 * structure to be shared (e.g. by several bios writing one extent).
 * Each ext4_get_io_end() must be balanced by ext4_put_io_end() or
 * ext4_put_io_end_defer().
 */
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
	atomic_inc(&io_end->count);
	return io_end;
}
/* /*
* Print an buffer I/O error compatible with the fs/buffer.c. This * Print an buffer I/O error compatible with the fs/buffer.c. This
* provides compatibility with dmesg scrapers that look for a specific * provides compatibility with dmesg scrapers that look for a specific
...@@ -286,12 +325,7 @@ static void ext4_end_bio(struct bio *bio, int error) ...@@ -286,12 +325,7 @@ static void ext4_end_bio(struct bio *bio, int error)
bi_sector >> (inode->i_blkbits - 9)); bi_sector >> (inode->i_blkbits - 9));
} }
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { ext4_put_io_end_defer(io_end);
ext4_free_io_end(io_end);
return;
}
ext4_add_complete_io(io_end);
} }
void ext4_io_submit(struct ext4_io_submit *io) void ext4_io_submit(struct ext4_io_submit *io)
...@@ -305,40 +339,37 @@ void ext4_io_submit(struct ext4_io_submit *io) ...@@ -305,40 +339,37 @@ void ext4_io_submit(struct ext4_io_submit *io)
bio_put(io->io_bio); bio_put(io->io_bio);
} }
io->io_bio = NULL; io->io_bio = NULL;
io->io_op = 0; }
void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc)
{
io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
io->io_bio = NULL;
io->io_end = NULL; io->io_end = NULL;
} }
static int io_submit_init(struct ext4_io_submit *io, static int io_submit_init_bio(struct ext4_io_submit *io,
struct inode *inode,
struct writeback_control *wbc,
struct buffer_head *bh) struct buffer_head *bh)
{ {
ext4_io_end_t *io_end;
struct page *page = bh->b_page;
int nvecs = bio_get_nr_vecs(bh->b_bdev); int nvecs = bio_get_nr_vecs(bh->b_bdev);
struct bio *bio; struct bio *bio;
io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_end)
return -ENOMEM;
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev; bio->bi_bdev = bh->b_bdev;
bio->bi_private = io->io_end = io_end;
bio->bi_end_io = ext4_end_bio; bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); if (!io->io_end->size)
io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
+ bh_offset(bh);
io->io_bio = bio; io->io_bio = bio;
io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
io->io_next_block = bh->b_blocknr; io->io_next_block = bh->b_blocknr;
return 0; return 0;
} }
static int io_submit_add_bh(struct ext4_io_submit *io, static int io_submit_add_bh(struct ext4_io_submit *io,
struct inode *inode, struct inode *inode,
struct writeback_control *wbc,
struct buffer_head *bh) struct buffer_head *bh)
{ {
ext4_io_end_t *io_end; ext4_io_end_t *io_end;
...@@ -349,18 +380,18 @@ static int io_submit_add_bh(struct ext4_io_submit *io, ...@@ -349,18 +380,18 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
ext4_io_submit(io); ext4_io_submit(io);
} }
if (io->io_bio == NULL) { if (io->io_bio == NULL) {
ret = io_submit_init(io, inode, wbc, bh); ret = io_submit_init_bio(io, bh);
if (ret) if (ret)
return ret; return ret;
} }
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
io_end = io->io_end; io_end = io->io_end;
if (test_clear_buffer_uninit(bh)) if (test_clear_buffer_uninit(bh))
ext4_set_io_unwritten_flag(inode, io_end); ext4_set_io_unwritten_flag(inode, io_end);
io->io_end->size += bh->b_size; io_end->size += bh->b_size;
io->io_next_block++; io->io_next_block++;
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
return 0; return 0;
} }
...@@ -432,7 +463,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ...@@ -432,7 +463,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
do { do {
if (!buffer_async_write(bh)) if (!buffer_async_write(bh))
continue; continue;
ret = io_submit_add_bh(io, inode, wbc, bh); ret = io_submit_add_bh(io, inode, bh);
if (ret) { if (ret) {
/* /*
* We only get here on ENOMEM. Not much else * We only get here on ENOMEM. Not much else
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment