Commit 9f44fdc5 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: Fix time encoding with extra epoch bits
  ext4: Add a stub for mpage_da_data in the trace header
  jbd2: Use tracepoints for history file
  ext4: Use tracepoints for mb_history trace file
  ext4, jbd2: Drop unneeded printks at mount and unmount time
  ext4: Handle nested ext4_journal_start/stop calls without a journal
  ext4: Make sure ext4_dirty_inode() updates the inode in no journal mode
  ext4: Avoid updating the inode table bh twice in no journal mode
  ext4: EXT4_IOC_MOVE_EXT: Check for different original and donor inodes first
  ext4: async direct IO for holes and fallocate support
  ext4: Use end_io callback to avoid direct I/O fallback to buffered I/O
  ext4: Split uninitialized extents for direct I/O
  ext4: release reserved quota when block reservation for delalloc retry
  ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks
  ext4: Fix hueristic which avoids group preallocation for closed files
  ext4: Use ext4_msg() for ext4_da_writepage() errors
  ext4: Update documentation about quota mount options
parents 4c8f1cb2 c1fccc06
......@@ -282,9 +282,16 @@ stripe=n Number of filesystem blocks that mballoc will try
to use for allocation size and alignment. For RAID5/6
systems this should be the number of data
disks * RAID chunk size in file system blocks.
delalloc (*) Deferring block allocation until write-out time.
nodelalloc Disable delayed allocation. Blocks are allocation
when data is copied from user to page cache.
delalloc (*) Defer block allocation until just before ext4
writes out the block(s) in question. This
allows ext4 to better allocation decisions
more efficiently.
nodelalloc Disable delayed allocation. Blocks are allocated
when the data is copied from userspace to the
page cache, either via the write(2) system call
or when an mmap'ed page which was previously
unallocated is written for the first time.
max_batch_time=usec Maximum amount of time ext4 should wait for
additional filesystem operations to be batch
......
......@@ -1113,7 +1113,6 @@ Table 1-12: Files in /proc/fs/ext4/<devname>
..............................................................................
File Content
mb_groups details of multiblock allocator buddy cache of free blocks
mb_history multiblock allocation history
..............................................................................
......
......@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
/* data type for block group number */
typedef unsigned int ext4_group_t;
/*
* Flags used in mballoc's allocation_context flags field.
*
* Also used to show what's going on for debugging purposes when the
* flag field is exported via the traceport interface
*/
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 0x0001
......@@ -127,6 +133,16 @@ struct mpage_da_data {
int pages_written;
int retval;
};
#define DIO_AIO_UNWRITTEN 0x1
typedef struct ext4_io_end {
struct list_head list; /* per-file finished AIO list */
struct inode *inode; /* file being written to */
unsigned int flag; /* unwritten or not */
int error; /* I/O error code */
ext4_lblk_t offset; /* offset in the file */
size_t size; /* size of the extent */
struct work_struct work; /* data work queue */
} ext4_io_end_t;
/*
* Special inodes numbers
......@@ -347,7 +363,16 @@ struct ext4_new_group_data {
/* Call ext4_da_update_reserve_space() after successfully
allocating the blocks */
#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
/* caller is from the direct IO path, request to creation of an
unitialized extents if not allocated, split the uninitialized
extent if blocks has been preallocated already*/
#define EXT4_GET_BLOCKS_DIO 0x0010
#define EXT4_GET_BLOCKS_CONVERT 0x0020
#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
/* Convert extent to initialized after direct IO complete */
#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
EXT4_GET_BLOCKS_DIO_CREATE_EXT)
/*
* ioctl commands
......@@ -500,8 +525,8 @@ struct move_extent {
static inline __le32 ext4_encode_extra_time(struct timespec *time)
{
return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
time->tv_sec >> 32 : 0) |
((time->tv_nsec << 2) & EXT4_NSEC_MASK));
(time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
}
static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
......@@ -509,7 +534,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
if (sizeof(time->tv_sec) > 4)
time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
<< 32;
time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
......@@ -672,6 +697,11 @@ struct ext4_inode_info {
__u16 i_extra_isize;
spinlock_t i_block_reservation_lock;
/* completed async DIOs that might need unwritten extents handling */
struct list_head i_aio_dio_complete_list;
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
};
/*
......@@ -942,18 +972,11 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_writeback_mb_bump;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
/* history to debug policy */
struct ext4_mb_history *s_mb_history;
int s_mb_history_cur;
int s_mb_history_max;
int s_mb_history_num;
spinlock_t s_mb_history_lock;
int s_mb_history_filter;
/* stats for buddy allocator */
spinlock_t s_mb_pa_lock;
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
......@@ -980,6 +1003,9 @@ struct ext4_sb_info {
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
/* workqueue for dio unwritten */
struct workqueue_struct *dio_unwritten_wq;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
......@@ -1397,7 +1423,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t ext4_get_reserved_space(struct inode *inode);
extern int flush_aio_dio_completed_IO(struct inode *inode);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
......@@ -1699,6 +1725,8 @@ extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
loff_t len);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int flags);
......
......@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
{
ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
}
extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
......@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *);
extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
ext_prepare_callback, void *);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
......
......@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
/* Note: Do not use this for NULL handles. This is only to determine if
* a properly allocated handle is using a journal or not. */
static inline int ext4_handle_valid(handle_t *handle)
{
if (handle == EXT4_NOJOURNAL_HANDLE)
if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
return 0;
return 1;
}
......
......@@ -723,7 +723,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
* insert new index [@logical;@ptr] into the block at @curp;
* check where to insert: before @curp or after @curp
*/
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
struct ext4_ext_path *curp,
int logical, ext4_fsblk_t ptr)
{
......@@ -1586,7 +1586,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *newext)
struct ext4_extent *newext, int flag)
{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
......@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
BUG_ON(path[depth].p_hdr == NULL);
/* try to insert block into found extent and return */
if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
&& ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
......@@ -1722,7 +1723,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
merge:
/* try to merge extents to the right */
ext4_ext_try_to_merge(inode, path, nearex);
if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
ext4_ext_try_to_merge(inode, path, nearex);
/* try to merge extents to the left */
......@@ -2378,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
*/
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
printk(KERN_INFO "EXT4-fs: file extents enabled");
#ifdef AGGRESSIVE_TEST
printk(", aggressive tests");
......@@ -2389,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
printk(", stats");
#endif
printk("\n");
#endif
#ifdef EXTENTS_STATS
spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
EXT4_SB(sb)->s_ext_min = 1 << 30;
......@@ -2490,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
}
#define EXT4_EXT_ZERO_LEN 7
/*
* This function is called by ext4_ext_get_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
......@@ -2583,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex3->ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(ex3, newblock);
ex3->ee_len = cpu_to_le16(allocated);
err = ext4_ext_insert_extent(handle, inode, path, ex3);
err = ext4_ext_insert_extent(handle, inode, path,
ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
......@@ -2639,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex3, newblock + max_blocks);
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
......@@ -2757,7 +2761,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
err = ext4_ext_dirty(handle, inode, path + depth);
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex);
err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
......@@ -2784,6 +2788,324 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
return err;
}
/*
* This function is called by ext4_ext_get_blocks() from
* ext4_get_blocks_dio_write() when DIO to write
* to an uninitialized extent.
*
* Writing to an uninitized extent may result in splitting the uninitialized
* extent into multiple /intialized unintialized extents (up to three)
* There are three possibilities:
* a> There is no split required: Entire extent should be uninitialized
* b> Splits in two extents: Write is happening at either end of the extent
* c> Splits in three extents: Somone is writing in middle of the extent
*
* One of more index blocks maybe needed if the extent tree grow after
* the unintialized extent split. To prevent ENOSPC occur at the IO
* complete, we need to split the uninitialized extent before DIO submit
* the IO. The uninitilized extent called at this time will be split
* into three uninitialized extent(at most). After IO complete, the part
* being filled will be convert to initialized by the end_io callback function
* via ext4_convert_unwritten_extents().
*/
static int ext4_split_unwritten_extents(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t iblock,
unsigned int max_blocks,
int flags)
{
struct ext4_extent *ex, newex, orig_ex;
struct ext4_extent *ex1 = NULL;
struct ext4_extent *ex2 = NULL;
struct ext4_extent *ex3 = NULL;
struct ext4_extent_header *eh;
ext4_lblk_t ee_block;
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
int ret = 0;
ext_debug("ext4_split_unwritten_extents: inode %lu,"
"iblock %llu, max_blocks %u\n", inode->i_ino,
(unsigned long long)iblock, max_blocks);
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (iblock - ee_block);
newblock = iblock - ee_block + ext_pblock(ex);
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
/*
* if the entire unintialized extent length less than
* the size of extent to write, there is no need to split
* uninitialized extent
*/
if (allocated <= max_blocks)
return ret;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
/* ex1: ee_block to iblock - 1 : uninitialized */
if (iblock > ee_block) {
ex1 = ex;
ex1->ee_len = cpu_to_le16(iblock - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
/*
* for sanity, update the length of the ex2 extent before
* we insert ex3, if ex1 is NULL. This is to avoid temporary
* overlap of blocks.
*/
if (!ex1 && allocated > max_blocks)
ex2->ee_len = cpu_to_le16(max_blocks);
/* ex3: to ee_block + ee_len : uninitialised */
if (allocated > max_blocks) {
unsigned int newdepth;
ex3 = &newex;
ex3->ee_block = cpu_to_le32(iblock + max_blocks);
ext4_ext_store_pblock(ex3, newblock + max_blocks);
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */
/* blocks available from iblock */
return allocated;
} else if (err)
goto fix_extent_len;
/*
* The depth, and hence eh & ex might change
* as part of the insert above.
*/
newdepth = ext_depth(inode);
/*
* update the extent length after successful insert of the
* split extent
*/
orig_ex.ee_len = cpu_to_le16(ee_len -
ext4_ext_get_actual_len(ex3));
depth = newdepth;
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode, iblock, path);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
}
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
if (ex2 != &newex)
ex2 = ex;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
allocated = max_blocks;
}
/*
* If there was a change of depth as part of the
* insertion of ex3 above, we need to update the length
* of the ex1 extent again here
*/
if (ex1 && ex1 != ex) {
ex1 = ex;
ex1->ee_len = cpu_to_le16(iblock - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
/*
* ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
* uninitialised still.
*/
ex2->ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(ex2, newblock);
ex2->ee_len = cpu_to_le16(allocated);
ext4_ext_mark_uninitialized(ex2);
if (ex2 != ex)
goto insert;
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + depth);
ext_debug("out here\n");
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zero out the first half */
return allocated;
} else if (err)
goto fix_extent_len;
out:
ext4_ext_show_leaf(inode, path);
return err ? err : allocated;
fix_extent_len:
ex->ee_block = orig_ex.ee_block;
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_mark_uninitialized(ex);
ext4_ext_dirty(handle, inode, path + depth);
return err;
}
static int ext4_convert_unwritten_extents_dio(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path)
{
struct ext4_extent *ex;
struct ext4_extent_header *eh;
int depth;
int err = 0;
int ret = 0;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
/* first mark the extent as initialized */
ext4_ext_mark_initialized(ex);
/*
* We have to see if it can be merged with the extent
* on the left.
*/
if (ex > EXT_FIRST_EXTENT(eh)) {
/*
* To merge left, pass "ex - 1" to try_to_merge(),
* since it merges towards right _only_.
*/
ret = ext4_ext_try_to_merge(inode, path, ex - 1);
if (ret) {
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
goto out;
depth = ext_depth(inode);
ex--;
}
}
/*
* Try to Merge towards right.
*/
ret = ext4_ext_try_to_merge(inode, path, ex);
if (ret) {
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
goto out;
depth = ext_depth(inode);
}
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + depth);
out:
ext4_ext_show_leaf(inode, path);
return err;
}
static int
ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock, unsigned int max_blocks,
struct ext4_ext_path *path, int flags,
unsigned int allocated, struct buffer_head *bh_result,
ext4_fsblk_t newblock)
{
int ret = 0;
int err = 0;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
"block %llu, max_blocks %u, flags %d, allocated %u",
inode->i_ino, (unsigned long long)iblock, max_blocks,
flags, allocated);
ext4_ext_show_leaf(inode, path);
/* DIO get_block() before submit the IO, split the extent */
if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
ret = ext4_split_unwritten_extents(handle,
inode, path, iblock,
max_blocks, flags);
/* flag the io_end struct that we need convert when IO done */
if (io)
io->flag = DIO_AIO_UNWRITTEN;
goto out;
}
/* DIO end_io complete, convert the filled extent to written */
if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
ret = ext4_convert_unwritten_extents_dio(handle, inode,
path);
goto out2;
}
/* buffered IO case */
/*
* repeat fallocate creation request
* we already have an unwritten extent
*/
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
goto map_out;
/* buffered READ or buffered write_begin() lookup */
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
/*
* We have blocks reserved already. We
* return allocated blocks so that delalloc
* won't do block reservation for us. But
* the buffer head will be unmapped so that
* a read from the block returns 0s.
*/
set_buffer_unwritten(bh_result);
goto out1;
}
/* buffered write, writepage time, convert*/
ret = ext4_ext_convert_to_initialized(handle, inode,
path, iblock,
max_blocks);
out:
if (ret <= 0) {
err = ret;
goto out2;
} else
allocated = ret;
set_buffer_new(bh_result);
map_out:
set_buffer_mapped(bh_result);
out1:
if (allocated > max_blocks)
allocated = max_blocks;
ext4_ext_show_leaf(inode, path);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
out2:
if (path) {
ext4_ext_drop_refs(path);
kfree(path);
}
return err ? err : allocated;
}
/*
* Block allocation/map/preallocation routine for extents based files
*
......@@ -2814,6 +3136,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%u requested for inode %lu\n",
......@@ -2889,33 +3212,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
EXT4_EXT_CACHE_EXTENT);
goto out;
}
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
goto out;
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
if (allocated > max_blocks)
allocated = max_blocks;
/*
* We have blocks reserved already. We
* return allocated blocks so that delalloc
* won't do block reservation for us. But
* the buffer head will be unmapped so that
* a read from the block returns 0s.
*/
set_buffer_unwritten(bh_result);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
goto out2;
}
ret = ext4_ext_convert_to_initialized(handle, inode,
path, iblock,
max_blocks);
if (ret <= 0) {
err = ret;
goto out2;
} else
allocated = ret;
goto outnew;
ret = ext4_ext_handle_uninitialized_extents(handle,
inode, iblock, max_blocks, path,
flags, allocated, bh_result, newblock);
return ret;
}
}
......@@ -2986,9 +3286,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(ar.len);
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
/* Mark uninitialized */
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
err = ext4_ext_insert_extent(handle, inode, path, &newex);
/*
* io_end structure was created for every async
* direct IO write to the middle of the file.
* To avoid unecessary convertion for every aio dio rewrite
* to the mid of file, here we flag the IO that is really
* need the convertion.
*
*/
if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
io->flag = DIO_AIO_UNWRITTEN;
}
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
/* free data blocks we just allocated */
/* not a good idea to call discard here directly,
......@@ -3002,7 +3314,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
outnew:
set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
......@@ -3200,6 +3511,63 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
return ret > 0 ? ret2 : ret;
}
/*
* This function convert a range of blocks to written extents
* The caller of this function will pass the start offset and the size.
* all unwritten extents within this range will be converted to
* written extents.
*
* This function is called from the direct IO end io call back
* function, to convert the fallocated extents after IO is completed.
*/
int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
loff_t len)
{
handle_t *handle;
ext4_lblk_t block;
unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
struct buffer_head map_bh;
unsigned int credits, blkbits = inode->i_blkbits;
block = offset >> blkbits;
/*
* We can't just convert len to max_blocks because
* If blocksize = 4096 offset = 3072 and len = 2048
*/
max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- block;
/*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, max_blocks);
while (ret >= 0 && ret < max_blocks) {
block = block + ret;
max_blocks = max_blocks - ret;
handle = ext4_journal_start(inode, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
}
map_bh.b_state = 0;
ret = ext4_get_blocks(handle, inode, block,
max_blocks, &map_bh,
EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
if (ret <= 0) {
WARN_ON(ret <= 0);
printk(KERN_ERR "%s: ext4_ext_get_blocks "
"returned error inode#%lu, block=%u, "
"max_blocks=%u", __func__,
inode->i_ino, block, max_blocks);
}
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
if (ret <= 0 || ret2 )
break;
}
return ret > 0 ? ret2 : ret;
}
/*
* Callback function called for each extent to gather FIEMAP information.
*/
......
......@@ -44,6 +44,8 @@
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
*
* i_mutex lock is held when entering and exiting this function
*/
int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
......@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
trace_ext4_sync_file(file, dentry, datasync);
ret = flush_aio_dio_completed_IO(inode);
if (ret < 0)
goto out;
/*
* data=writeback:
* The caller's filemap_fdatawrite()/wait will sync the data.
......
......@@ -37,6 +37,7 @@
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include "ext4_jbd2.h"
#include "xattr.h"
......@@ -1144,6 +1145,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
return 0;
}
/*
* Return the number of dirty pages in the given inode starting at
* page frame idx.
*/
static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
unsigned int max_pages)
{
struct address_space *mapping = inode->i_mapping;
pgoff_t index;
struct pagevec pvec;
pgoff_t num = 0;
int i, nr_pages, done = 0;
if (max_pages == 0)
return 0;
pagevec_init(&pvec, 0);
while (!done) {
index = idx;
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY,
(pgoff_t)PAGEVEC_SIZE);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
struct buffer_head *bh, *head;
lock_page(page);
if (unlikely(page->mapping != mapping) ||
!PageDirty(page) ||
PageWriteback(page) ||
page->index != idx) {
done = 1;
unlock_page(page);
break;
}
head = page_buffers(page);
bh = head;
do {
if (!buffer_delay(bh) &&
!buffer_unwritten(bh)) {
done = 1;
break;
}
} while ((bh = bh->b_this_page) != head);
unlock_page(page);
if (done)
break;
idx++;
num++;
if (num >= max_pages)
break;
}
pagevec_release(&pvec);
}
return num;
}
/*
* The ext4_get_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
......@@ -1175,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
clear_buffer_mapped(bh);
clear_buffer_unwritten(bh);
ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
"logical block %lu\n", inode->i_ino, flags, max_blocks,
(unsigned long)block);
/*
* Try to see if we can get the block without requesting a new
* file system block.
......@@ -1796,11 +1858,11 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
if (ext4_claim_free_blocks(sbi, total)) {
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
vfs_dq_release_reservation_block(inode, total);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
}
vfs_dq_release_reservation_block(inode, total);
return -ENOSPC;
}
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
......@@ -2092,18 +2154,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
printk(KERN_EMERG "Total free blocks count %lld\n",
ext4_count_free_blocks(inode->i_sb));
printk(KERN_EMERG "Free/Dirty block details\n");
printk(KERN_EMERG "free_blocks=%lld\n",
(long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
printk(KERN_EMERG "dirty_blocks=%lld\n",
(long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
printk(KERN_EMERG "Block reservation details\n");
printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
EXT4_I(inode)->i_reserved_data_blocks);
printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
EXT4_I(inode)->i_reserved_meta_blocks);
printk(KERN_CRIT "Total free blocks count %lld\n",
ext4_count_free_blocks(inode->i_sb));
printk(KERN_CRIT "Free/Dirty block details\n");
printk(KERN_CRIT "free_blocks=%lld\n",
(long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
printk(KERN_CRIT "dirty_blocks=%lld\n",
(long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
printk(KERN_CRIT "Block reservation details\n");
printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
EXT4_I(inode)->i_reserved_data_blocks);
printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
EXT4_I(inode)->i_reserved_meta_blocks);
return;
}
......@@ -2189,14 +2251,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* writepage and writepages will again try to write
* the same.
*/
printk(KERN_EMERG "%s block allocation failed for inode %lu "
"at logical offset %llu with max blocks "
"%zd with error %d\n",
__func__, mpd->inode->i_ino,
(unsigned long long)next,
mpd->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_EMERG "This should not happen.!! "
"Data will be lost\n");
ext4_msg(mpd->inode->i_sb, KERN_CRIT,
"delayed block allocation failed for inode %lu at "
"logical offset %llu with max blocks %zd with "
"error %d\n", mpd->inode->i_ino,
(unsigned long long) next,
mpd->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_CRIT "This should not happen!! "
"Data will be lost\n");
if (err == -ENOSPC) {
ext4_print_free_blocks(mpd->inode);
}
......@@ -2743,8 +2805,10 @@ static int ext4_da_writepages(struct address_space *mapping,
int no_nrwrite_index_update;
int pages_written = 0;
long pages_skipped;
unsigned int max_pages;
int range_cyclic, cycled = 1, io_done = 0;
int needed_blocks, ret = 0, nr_to_writebump = 0;
int needed_blocks, ret = 0;
long desired_nr_to_write, nr_to_writebump = 0;
loff_t range_start = wbc->range_start;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
......@@ -2771,16 +2835,6 @@ static int ext4_da_writepages(struct address_space *mapping,
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
/*
* Make sure nr_to_write is >= sbi->s_mb_stream_request
* This make sure small files blocks are allocated in
* single attempt. This ensure that small files
* get less fragmented.
*/
if (wbc->nr_to_write < sbi->s_mb_stream_request) {
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request;
}
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
......@@ -2795,6 +2849,36 @@ static int ext4_da_writepages(struct address_space *mapping,
} else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
/*
* This works around two forms of stupidity. The first is in
* the writeback code, which caps the maximum number of pages
* written to be 1024 pages. This is wrong on multiple
* levels; different architectues have a different page size,
* which changes the maximum amount of data which gets
* written. Secondly, 4 megabytes is way too small. XFS
* forces this value to be 16 megabytes by multiplying
* nr_to_write parameter by four, and then relies on its
* allocator to allocate larger extents to make them
* contiguous. Unfortunately this brings us to the second
* stupidity, which is that ext4's mballoc code only allocates
* at most 2048 blocks. So we force contiguous writes up to
* the number of dirty blocks in the inode, or
* sbi->max_writeback_mb_bump whichever is smaller.
*/
max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
if (!range_cyclic && range_whole)
desired_nr_to_write = wbc->nr_to_write * 8;
else
desired_nr_to_write = ext4_num_dirty_pages(inode, index,
max_pages);
if (desired_nr_to_write > max_pages)
desired_nr_to_write = max_pages;
if (wbc->nr_to_write < desired_nr_to_write) {
nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
wbc->nr_to_write = desired_nr_to_write;
}
mpd.wbc = wbc;
mpd.inode = mapping->host;
......@@ -2822,10 +2906,9 @@ static int ext4_da_writepages(struct address_space *mapping,
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
printk(KERN_CRIT "%s: jbd2_start: "
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d\n", __func__,
wbc->nr_to_write, inode->i_ino, ret);
dump_stack();
goto out_writepages;
}
......@@ -2897,9 +2980,10 @@ static int ext4_da_writepages(struct address_space *mapping,
goto retry;
}
if (pages_skipped != wbc->pages_skipped)
printk(KERN_EMERG "This should not happen leaving %s "
"with nr_to_write = %ld ret = %d\n",
__func__, wbc->nr_to_write, ret);
ext4_msg(inode->i_sb, KERN_CRIT,
"This should not happen leaving %s "
"with nr_to_write = %ld ret = %d\n",
__func__, wbc->nr_to_write, ret);
/* Update index */
index += pages_written;
......@@ -2914,7 +2998,8 @@ static int ext4_da_writepages(struct address_space *mapping,
out_writepages:
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
wbc->nr_to_write -= nr_to_writebump;
if (wbc->nr_to_write > nr_to_writebump)
wbc->nr_to_write -= nr_to_writebump;
wbc->range_start = range_start;
trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
return ret;
......@@ -3272,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
/*
* O_DIRECT for ext3 (or indirect map) based files
*
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
......@@ -3280,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* crashes then stale disk data _may_ be exposed inside the file. But current
* VFS code falls back into buffered path in that case so we are safe.
*/
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
......@@ -3354,6 +3441,359 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
return ret;
}
/* Maximum number of blocks we map for direct IO at once. */
static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
handle_t *handle = NULL;
int ret = 0;
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
int dio_credits;
ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
inode->i_ino, create);
/*
* DIO VFS code passes create = 0 flag for write to
* the middle of file. It does this to avoid block
* allocation for holes, to prevent expose stale data
* out when there is parallel buffered read (which does
* not hold the i_mutex lock) while direct IO write has
* not completed. DIO request on holes finally falls back
* to buffered IO for this reason.
*
* For ext4 extent based file, since we support fallocate,
* new allocated extent as uninitialized, for holes, we
* could fallocate blocks for holes, thus parallel
* buffered IO read will zero out the page when read on
* a hole while parallel DIO write to the hole has not completed.
*
* when we come here, we know it's a direct IO write to
* to the middle of file (<i_size)
* so it's safe to override the create flag from VFS.
*/
create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
if (max_blocks > DIO_MAX_BLOCKS)
max_blocks = DIO_MAX_BLOCKS;
dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
handle = ext4_journal_start(inode, dio_credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
create);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
}
ext4_journal_stop(handle);
out:
return ret;
}
static void ext4_free_io_end(ext4_io_end_t *io)
{
BUG_ON(!io);
iput(io->inode);
kfree(io);
}
static void dump_aio_dio_list(struct inode * inode)
{
#ifdef EXT4_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
return;
}
ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
after = cur->next;
io1 = container_of(after, ext4_io_end_t, list);
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
io, inode->i_ino, io0, io1);
}
#endif
}
/*
* check a range of space and convert unwritten extents to written.
*/
static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
size_t size = io->size;
int ret = 0;
ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
if (list_empty(&io->list))
return ret;
if (io->flag != DIO_AIO_UNWRITTEN)
return ret;
if (offset + size <= i_size_read(inode))
ret = ext4_convert_unwritten_extents(inode, offset, size);
if (ret < 0) {
printk(KERN_EMERG "%s: failed to convert unwritten"
"extents to written extents, error is %d"
" io is still on inode %lu aio dio list\n",
__func__, ret, inode->i_ino);
return ret;
}
/* clear the DIO AIO unwritten flag */
io->flag = 0;
return ret;
}
/*
* work on completed aio dio IO, to convert unwritten extents to extents
*/
static void ext4_end_aio_dio_work(struct work_struct *work)
{
ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
struct inode *inode = io->inode;
int ret = 0;
mutex_lock(&inode->i_mutex);
ret = ext4_end_aio_dio_nolock(io);
if (ret >= 0) {
if (!list_empty(&io->list))
list_del_init(&io->list);
ext4_free_io_end(io);
}
mutex_unlock(&inode->i_mutex);
}
/*
* This function is called from ext4_sync_file().
*
* When AIO DIO IO is completed, the work to convert unwritten
* extents to written is queued on workqueue but may not get immediately
* scheduled. When fsync is called, we need to ensure the
* conversion is complete before fsync returns.
* The inode keeps track of a list of completed AIO from DIO path
* that might needs to do the conversion. This function walks through
* the list and convert the related unwritten extents to written.
*/
int flush_aio_dio_completed_IO(struct inode *inode)
{
ext4_io_end_t *io;
int ret = 0;
int ret2 = 0;
if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
return ret;
dump_aio_dio_list(inode);
while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
ext4_io_end_t, list);
/*
* Calling ext4_end_aio_dio_nolock() to convert completed
* IO to written.
*
* When ext4_sync_file() is called, run_queue() may already
* about to flush the work corresponding to this io structure.
* It will be upset if it founds the io structure related
* to the work-to-be schedule is freed.
*
* Thus we need to keep the io structure still valid here after
* convertion finished. The io structure has a flag to
* avoid double converting from both fsync and background work
* queue work.
*/
ret = ext4_end_aio_dio_nolock(io);
if (ret < 0)
ret2 = ret;
else
list_del_init(&io->list);
}
return (ret2 < 0) ? ret2 : 0;
}
static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
{
ext4_io_end_t *io = NULL;
io = kmalloc(sizeof(*io), GFP_NOFS);
if (io) {
igrab(inode);
io->inode = inode;
io->flag = 0;
io->offset = 0;
io->size = 0;
io->error = 0;
INIT_WORK(&io->work, ext4_end_aio_dio_work);
INIT_LIST_HEAD(&io->list);
}
return io;
}
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
ext4_io_end_t *io_end = iocb->private;
struct workqueue_struct *wq;
ext_debug("ext4_end_io_dio(): io_end 0x%p"
"for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
iocb->private, io_end->inode->i_ino, iocb, offset,
size);
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
return;
/* if not aio dio with unwritten extents, just free io and return */
if (io_end->flag != DIO_AIO_UNWRITTEN){
ext4_free_io_end(io_end);
iocb->private = NULL;
return;
}
io_end->offset = offset;
io_end->size = size;
wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
/* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);
/* Add the io_end to per-inode completed aio dio list*/
list_add_tail(&io_end->list,
&EXT4_I(io_end->inode)->i_aio_dio_complete_list);
iocb->private = NULL;
}
/*
* For ext4 extent files, ext4 will do direct-io write to holes,
* preallocated extents, and those write extend the file, no need to
* fall back to buffered IO.
*
* For holes, we fallocate those blocks, mark them as unintialized
* If those blocks were preallocated, we mark sure they are splited, but
* still keep the range to write as unintialized.
*
* The unwrritten extents will be converted to written when DIO is completed.
* For async direct IO, since the IO may still pending when return, we
* set up an end_io call back function, which will do the convertion
* when async direct IO completed.
*
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
*
*/
static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
size_t count = iov_length(iov, nr_segs);
loff_t final_size = offset + count;
if (rw == WRITE && final_size <= inode->i_size) {
/*
* We could direct write to holes and fallocate.
*
* Allocated blocks to fill the hole are marked as uninitialized
* to prevent paralel buffered read to expose the stale data
* before DIO complete the data IO.
*
* As to previously fallocated extents, ext4 get_block
* will just simply mark the buffer mapped but still
* keep the extents uninitialized.
*
* for non AIO case, we will convert those unwritten extents
* to written after return back from blockdev_direct_IO.
*
* for async DIO, the conversion needs to be defered when
* the IO is completed. The ext4 end_io callback function
* will be called to take care of the conversion work.
* Here for async case, we allocate an io_end structure to
* hook to the iocb.
*/
iocb->private = NULL;
EXT4_I(inode)->cur_aio_dio = NULL;
if (!is_sync_kiocb(iocb)) {
iocb->private = ext4_init_io_end(inode);
if (!iocb->private)
return -ENOMEM;
/*
* we save the io structure for current async
* direct IO, so that later ext4_get_blocks()
* could flag the io structure whether there
* is a unwritten extents needs to be converted
* when IO is completed.
*/
EXT4_I(inode)->cur_aio_dio = iocb->private;
}
ret = blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block_dio_write,
ext4_end_io_dio);
if (iocb->private)
EXT4_I(inode)->cur_aio_dio = NULL;
/*
* The io_end structure takes a reference to the inode,
* that structure needs to be destroyed and the
* reference to the inode need to be dropped, when IO is
* complete, even with 0 byte write, or failed.
*
* In the successful AIO DIO case, the io_end structure will be
* desctroyed and the reference to the inode will be dropped
* after the end_io call back function is called.
*
* In the case there is 0 byte write, or error case, since
* VFS direct IO won't invoke the end_io call back function,
* we need to free the end_io structure here.
*/
if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
ext4_free_io_end(iocb->private);
iocb->private = NULL;
} else if (ret > 0)
/*
* for non AIO case, since the IO is already
* completed, we could do the convertion right here
*/
ret = ext4_convert_unwritten_extents(inode,
offset, ret);
return ret;
}
/* for write the the end of file case, we fall back to old way */
return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
}
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
}
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
......@@ -4551,8 +4991,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
static int ext4_do_update_inode(handle_t *handle,
struct inode *inode,
struct ext4_iloc *iloc,
int do_sync)
struct ext4_iloc *iloc)
{
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
......@@ -4653,22 +5092,10 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
}
/*
* If we're not using a journal and we were called from
* ext4_write_inode() to sync the inode (making do_sync true),
* we can just use sync_dirty_buffer() directly to do our dirty
* work. Testing s_journal here is a bit redundant but it's
* worth it to avoid potential future trouble.
*/
if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
BUFFER_TRACE(bh, "call sync_dirty_buffer");
sync_dirty_buffer(bh);
} else {
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, inode, bh);
if (!err)
err = rc;
}
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, inode, bh);
if (!err)
err = rc;
ei->i_state &= ~EXT4_STATE_NEW;
out_brelse:
......@@ -4736,8 +5163,16 @@ int ext4_write_inode(struct inode *inode, int wait)
err = ext4_get_inode_loc(inode, &iloc);
if (err)
return err;
err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
inode, &iloc, wait);
if (wait)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
ext4_error(inode->i_sb, __func__,
"IO error syncing inode, "
"inode=%lu, block=%llu",
inode->i_ino,
(unsigned long long)iloc.bh->b_blocknr);
err = -EIO;
}
}
return err;
}
......@@ -5033,7 +5468,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
get_bh(iloc->bh);
/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
err = ext4_do_update_inode(handle, inode, iloc, 0);
err = ext4_do_update_inode(handle, inode, iloc);
put_bh(iloc->bh);
return err;
}
......@@ -5180,24 +5615,13 @@ void ext4_dirty_inode(struct inode *inode)
handle_t *current_handle = ext4_journal_current_handle();
handle_t *handle;
if (!ext4_handle_valid(current_handle)) {
ext4_mark_inode_dirty(current_handle, inode);
return;
}
handle = ext4_journal_start(inode, 2);
if (IS_ERR(handle))
goto out;
if (current_handle &&
current_handle->h_transaction != handle->h_transaction) {
/* This task has a transaction open against a different fs */
printk(KERN_EMERG "%s: transactions do not match!\n",
__func__);
} else {
jbd_debug(5, "marking dirty. outer handle=%p\n",
current_handle);
ext4_mark_inode_dirty(handle, inode);
}
jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out:
return;
......
......@@ -2096,207 +2096,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
return err;
}
#ifdef EXT4_MB_HISTORY
struct ext4_mb_proc_session {
struct ext4_mb_history *history;
struct super_block *sb;
int start;
int max;
};
static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
struct ext4_mb_history *hs,
int first)
{
if (hs == s->history + s->max)
hs = s->history;
if (!first && hs == s->history + s->start)
return NULL;
while (hs->orig.fe_len == 0) {
hs++;
if (hs == s->history + s->max)
hs = s->history;
if (hs == s->history + s->start)
return NULL;
}
return hs;
}
static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
{
struct ext4_mb_proc_session *s = seq->private;
struct ext4_mb_history *hs;
int l = *pos;
if (l == 0)
return SEQ_START_TOKEN;
hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
if (!hs)
return NULL;
while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
return hs;
}
static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
loff_t *pos)
{
struct ext4_mb_proc_session *s = seq->private;
struct ext4_mb_history *hs = v;
++*pos;
if (v == SEQ_START_TOKEN)
return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
else
return ext4_mb_history_skip_empty(s, ++hs, 0);
}
static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
{
char buf[25], buf2[25], buf3[25], *fmt;
struct ext4_mb_history *hs = v;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
"%-5s %-2s %-6s %-5s %-5s %-6s\n",
"pid", "inode", "original", "goal", "result", "found",
"grps", "cr", "flags", "merge", "tail", "broken");
return 0;
}
if (hs->op == EXT4_MB_HISTORY_ALLOC) {
fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
"0x%04x %-5s %-5u %-6u\n";
sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len,
hs->result.fe_logical);
sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
hs->orig.fe_start, hs->orig.fe_len,
hs->orig.fe_logical);
sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
hs->goal.fe_start, hs->goal.fe_len,
hs->goal.fe_logical);
seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
hs->found, hs->groups, hs->cr, hs->flags,
hs->merged ? "M" : "", hs->tail,
hs->buddy ? 1 << hs->buddy : 0);
} else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
fmt = "%-5u %-8u %-23s %-23s %-23s\n";
sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len,
hs->result.fe_logical);
sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
hs->orig.fe_start, hs->orig.fe_len,
hs->orig.fe_logical);
seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
} else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len);
seq_printf(seq, "%-5u %-8u %-23s discard\n",
hs->pid, hs->ino, buf2);
} else if (hs->op == EXT4_MB_HISTORY_FREE) {
sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len);
seq_printf(seq, "%-5u %-8u %-23s free\n",
hs->pid, hs->ino, buf2);
}
return 0;
}
static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
{
}
static const struct seq_operations ext4_mb_seq_history_ops = {
.start = ext4_mb_seq_history_start,
.next = ext4_mb_seq_history_next,
.stop = ext4_mb_seq_history_stop,
.show = ext4_mb_seq_history_show,
};
static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
{
struct super_block *sb = PDE(inode)->data;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_mb_proc_session *s;
int rc;
int size;
if (unlikely(sbi->s_mb_history == NULL))
return -ENOMEM;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL)
return -ENOMEM;
s->sb = sb;
size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
s->history = kmalloc(size, GFP_KERNEL);
if (s->history == NULL) {
kfree(s);
return -ENOMEM;
}
spin_lock(&sbi->s_mb_history_lock);
memcpy(s->history, sbi->s_mb_history, size);
s->max = sbi->s_mb_history_max;
s->start = sbi->s_mb_history_cur % s->max;
spin_unlock(&sbi->s_mb_history_lock);
rc = seq_open(file, &ext4_mb_seq_history_ops);
if (rc == 0) {
struct seq_file *m = (struct seq_file *)file->private_data;
m->private = s;
} else {
kfree(s->history);
kfree(s);
}
return rc;
}
static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = (struct seq_file *)file->private_data;
struct ext4_mb_proc_session *s = seq->private;
kfree(s->history);
kfree(s);
return seq_release(inode, file);
}
static ssize_t ext4_mb_seq_history_write(struct file *file,
const char __user *buffer,
size_t count, loff_t *ppos)
{
struct seq_file *seq = (struct seq_file *)file->private_data;
struct ext4_mb_proc_session *s = seq->private;
struct super_block *sb = s->sb;
char str[32];
int value;
if (count >= sizeof(str)) {
printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
"mb_history", (int)sizeof(str));
return -EOVERFLOW;
}
if (copy_from_user(str, buffer, count))
return -EFAULT;
value = simple_strtol(str, NULL, 0);
if (value < 0)
return -ERANGE;
EXT4_SB(sb)->s_mb_history_filter = value;
return count;
}
static const struct file_operations ext4_mb_seq_history_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_history_open,
.read = seq_read,
.write = ext4_mb_seq_history_write,
.llseek = seq_lseek,
.release = ext4_mb_seq_history_release,
};
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
struct super_block *sb = seq->private;
......@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
.release = seq_release,
};
static void ext4_mb_history_release(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sbi->s_proc != NULL) {
remove_proc_entry("mb_groups", sbi->s_proc);
if (sbi->s_mb_history_max)
remove_proc_entry("mb_history", sbi->s_proc);
}
kfree(sbi->s_mb_history);
}
static void ext4_mb_history_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int i;
if (sbi->s_proc != NULL) {
if (sbi->s_mb_history_max)
proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_history_fops, sb);
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
}
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
/* if we can't allocate history, then we simple won't use it */
}
static noinline_for_stack void
ext4_mb_store_history(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_mb_history h;
if (sbi->s_mb_history == NULL)
return;
if (!(ac->ac_op & sbi->s_mb_history_filter))
return;
h.op = ac->ac_op;
h.pid = current->pid;
h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
h.orig = ac->ac_o_ex;
h.result = ac->ac_b_ex;
h.flags = ac->ac_flags;
h.found = ac->ac_found;
h.groups = ac->ac_groups_scanned;
h.cr = ac->ac_criteria;
h.tail = ac->ac_tail;
h.buddy = ac->ac_buddy;
h.merged = 0;
if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
h.merged = 1;
h.goal = ac->ac_g_ex;
h.result = ac->ac_f_ex;
}
spin_lock(&sbi->s_mb_history_lock);
memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
sbi->s_mb_history_cur = 0;
spin_unlock(&sbi->s_mb_history_lock);
}
#else
#define ext4_mb_history_release(sb)
#define ext4_mb_history_init(sb)
#endif
/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
......@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
......@@ -2708,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
spin_lock_init(&lg->lg_prealloc_lock);
}
ext4_mb_history_init(sb);
if (sbi->s_proc)
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
if (sbi->s_journal)
sbi->s_journal->j_commit_callback = release_blocks_on_commit;
printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}
......@@ -2790,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
ext4_mb_history_release(sb);
if (sbi->s_proc)
remove_proc_entry("mb_groups", sbi->s_proc);
return 0;
}
......@@ -3276,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
atomic_inc(&sbi->s_bal_breaks);
}
ext4_mb_store_history(ac);
if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
trace_ext4_mballoc_alloc(ac);
else
trace_ext4_mballoc_prealloc(ac);
}
/*
......@@ -3776,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (ac) {
ac->ac_sb = sb;
ac->ac_inode = pa->pa_inode;
ac->ac_op = EXT4_MB_HISTORY_DISCARD;
}
while (bit < end) {
......@@ -3796,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = next - bit;
ac->ac_b_ex.fe_logical = 0;
ext4_mb_store_history(ac);
trace_ext4_mballoc_discard(ac);
}
trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
......@@ -3831,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ext4_group_t group;
ext4_grpblk_t bit;
if (ac)
ac->ac_op = EXT4_MB_HISTORY_DISCARD;
trace_ext4_mb_release_group_pa(ac, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
......@@ -3848,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = pa->pa_len;
ac->ac_b_ex.fe_logical = 0;
ext4_mb_store_history(ac);
trace_ext4_mballoc_discard(ac);
}
return 0;
......@@ -4189,7 +3911,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
size = max(size, isize);
if ((size == isize) &&
!ext4_fs_is_busy(sbi) &&
......@@ -4199,6 +3920,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
}
/* don't use group allocation for large files */
size = max(size, isize);
if (size >= sbi->s_mb_stream_request) {
ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
......@@ -4739,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (ac) {
ac->ac_op = EXT4_MB_HISTORY_FREE;
ac->ac_inode = inode;
ac->ac_sb = sb;
}
......@@ -4806,7 +4527,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
ac->ac_b_ex.fe_group = block_group;
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = count;
ext4_mb_store_history(ac);
trace_ext4_mballoc_free(ac);
}
err = ext4_mb_load_buddy(sb, block_group, &e4b);
......
......@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
#define mb_debug(n, fmt, a...)
#endif
/*
* with EXT4_MB_HISTORY mballoc stores last N allocations in memory
* and you can monitor it in /proc/fs/ext4/<dev>/mb_history
*/
#define EXT4_MB_HISTORY
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
#define EXT4_MB_HISTORY_FREE 8 /* free */
#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
EXT4_MB_HISTORY_PREALLOC)
/*
* How long mballoc can look for a best extent (in found extents)
......@@ -84,7 +74,7 @@ extern u8 mb_enable_debug;
* with 'ext4_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
#define MB_DEFAULT_STATS 1
#define MB_DEFAULT_STATS 0
/*
* files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
......@@ -217,22 +207,6 @@ struct ext4_allocation_context {
#define AC_STATUS_FOUND 2
#define AC_STATUS_BREAK 3
struct ext4_mb_history {
struct ext4_free_extent orig; /* orig allocation */
struct ext4_free_extent goal; /* goal allocation */
struct ext4_free_extent result; /* result allocation */
unsigned pid;
unsigned ino;
__u16 found; /* how many extents have been found */
__u16 groups; /* how many groups have been scanned */
__u16 tail; /* what tail broke some buddy */
__u16 buddy; /* buddy the tail ^^^ broke */
__u16 flags;
__u8 cr:3; /* which phase the result extent was found at */
__u8 op:4;
__u8 merged:1;
};
struct ext4_buddy {
struct page *bd_buddy_page;
void *bd_buddy;
......@@ -247,13 +221,6 @@ struct ext4_buddy {
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
#ifndef EXT4_MB_HISTORY
static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
{
return;
}
#endif
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
......
......@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
goto err_out;
}
}
retval = ext4_ext_insert_extent(handle, inode, path, &newext);
retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
err_out:
if (path) {
ext4_ext_drop_refs(path);
......
......@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
orig_path, new_ext))
orig_path, new_ext, 0))
goto out;
}
......@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
orig_path, end_ext))
orig_path, end_ext, 0))
goto out;
}
out:
......@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
/* orig and donor should be different file */
if (orig_inode->i_ino == donor_inode->i_ino) {
ext4_debug("ext4 move extent: The argument files should not "
"be same file [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
/* Ext4 move extent supports only extent based file */
if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
ext4_debug("ext4 move extent: orig file is not extents "
......@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
int block_len_in_page;
int uninit;
/* orig and donor should be different file */
if (orig_inode->i_ino == donor_inode->i_ino) {
ext4_debug("ext4 move extent: The argument files should not "
"be same file [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
/* protect orig and donor against a truncate */
ret1 = mext_inode_double_lock(orig_inode, donor_inode);
if (ret1 < 0)
......
......@@ -2076,7 +2076,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0;
if (!ext4_handle_valid(handle))
/* ext4_handle_valid() assumes a valid handle_t pointer */
if (handle && !ext4_handle_valid(handle))
return 0;
mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
......
......@@ -50,13 +50,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
static int default_mb_history_length = 1000;
module_param_named(default_mb_history_length, default_mb_history_length,
int, 0644);
MODULE_PARM_DESC(default_mb_history_length,
"Default number of entries saved for mb_history");
struct proc_dir_entry *ext4_proc_root;
static struct kset *ext4_kset;
......@@ -189,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
/* Just increment the non-pointer handle value */
static handle_t *ext4_get_nojournal(void)
{
handle_t *handle = current->journal_info;
unsigned long ref_cnt = (unsigned long)handle;
BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
ref_cnt++;
handle = (handle_t *)ref_cnt;
current->journal_info = handle;
return handle;
}
/* Decrement the non-pointer handle value */
static void ext4_put_nojournal(handle_t *handle)
{
unsigned long ref_cnt = (unsigned long)handle;
BUG_ON(ref_cnt == 0);
ref_cnt--;
handle = (handle_t *)ref_cnt;
current->journal_info = handle;
}
/*
* Wrappers for jbd2_journal_start/end.
*
......@@ -215,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
}
return jbd2_journal_start(journal, nblocks);
}
/*
* We're not journaling, return the appropriate indication.
*/
current->journal_info = EXT4_NOJOURNAL_HANDLE;
return current->journal_info;
return ext4_get_nojournal();
}
/*
......@@ -235,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
int rc;
if (!ext4_handle_valid(handle)) {
/*
* Do this here since we don't call jbd2_journal_stop() in
* no-journal mode.
*/
current->journal_info = NULL;
ext4_put_nojournal(handle);
return 0;
}
sb = handle->h_transaction->t_journal->j_private;
......@@ -580,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
struct ext4_super_block *es = sbi->s_es;
int i, err;
flush_workqueue(sbi->dio_unwritten_wq);
destroy_workqueue(sbi->dio_unwritten_wq);
lock_super(sb);
lock_kernel();
if (sb->s_dirt)
......@@ -684,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_allocated_meta_blocks = 0;
ei->i_delalloc_reserved_flag = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
ei->cur_aio_dio = NULL;
return &ei->vfs_inode;
}
......@@ -1052,7 +1072,7 @@ enum {
Opt_journal_update, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
......@@ -1099,7 +1119,6 @@ static const match_table_t tokens = {
{Opt_data_writeback, "data=writeback"},
{Opt_data_err_abort, "data_err=abort"},
{Opt_data_err_ignore, "data_err=ignore"},
{Opt_mb_history_length, "mb_history_length=%u"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
......@@ -1340,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_data_err_ignore:
clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
break;
case Opt_mb_history_length:
if (match_int(&args[0], &option))
return 0;
if (option < 0)
return 0;
sbi->s_mb_history_max = option;
break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
qtype = USRQUOTA;
......@@ -1646,13 +1658,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
EXT4_INODES_PER_GROUP(sb),
sbi->s_mount_opt);
if (EXT4_SB(sb)->s_journal) {
ext4_msg(sb, KERN_INFO, "%s journal on %s",
EXT4_SB(sb)->s_journal->j_inode ? "internal" :
"external", EXT4_SB(sb)->s_journal->j_devname);
} else {
ext4_msg(sb, KERN_INFO, "no journal");
}
return res;
}
......@@ -2197,6 +2202,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
......@@ -2210,6 +2216,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
ATTR_LIST(max_writeback_mb_bump),
NULL,
};
......@@ -2413,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
sbi->s_mb_history_max = default_mb_history_length;
set_opt(sbi->s_mount_opt, BARRIER);
......@@ -2679,6 +2685,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_max_writeback_mb_bump = 128;
/*
* set up enough so that it can read an inode
......@@ -2798,6 +2805,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
clear_opt(sbi->s_mount_opt, NOBH);
}
}
EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
if (!EXT4_SB(sb)->dio_unwritten_wq) {
printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
goto failed_mount_wq;
}
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
......@@ -2849,12 +2862,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"available");
}
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
if (test_opt(sb, DELALLOC) &&
(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
"requested data journaling mode");
clear_opt(sbi->s_mount_opt, DELALLOC);
} else if (test_opt(sb, DELALLOC))
ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
}
err = ext4_setup_system_zone(sb);
if (err) {
......@@ -2910,6 +2923,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
failed_mount_wq:
ext4_release_system_zone(sb);
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
......@@ -3164,9 +3179,7 @@ static int ext4_load_journal(struct super_block *sb,
return -EINVAL;
}
if (journal->j_flags & JBD2_BARRIER)
ext4_msg(sb, KERN_INFO, "barriers enabled");
else
if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
......@@ -3361,11 +3374,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
tid_t target;
struct ext4_sb_info *sbi = EXT4_SB(sb);
trace_ext4_sync_fs(sb, wait);
if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
flush_workqueue(sbi->dio_unwritten_wq);
if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
jbd2_log_wait_commit(sbi->s_journal, target);
}
return ret;
}
......
......@@ -643,6 +643,7 @@ int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
{
struct transaction_chp_stats_s *stats;
transaction_t *transaction;
journal_t *journal;
int ret = 0;
......@@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
/* OK, that was the last buffer for the transaction: we can now
safely remove this transaction from the log */
stats = &transaction->t_chp_stats;
if (stats->cs_chp_time)
stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
jiffies);
trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
transaction->t_tid, stats);
__jbd2_journal_drop_transaction(journal, transaction);
kfree(transaction);
......
......@@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (commit_transaction->t_synchronous_commit)
write_op = WRITE_SYNC_PLUG;
trace_jbd2_commit_locking(journal, commit_transaction);
stats.u.run.rs_wait = commit_transaction->t_max_wait;
stats.u.run.rs_locked = jiffies;
stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
stats.u.run.rs_locked);
stats.run.rs_wait = commit_transaction->t_max_wait;
stats.run.rs_locked = jiffies;
stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
stats.run.rs_locked);
spin_lock(&commit_transaction->t_handle_lock);
while (commit_transaction->t_updates) {
......@@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_switch_revoke_table(journal);
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.u.run.rs_flushing = jiffies;
stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
stats.u.run.rs_flushing);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
stats.run.rs_flushing);
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
......@@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
spin_unlock(&journal->j_state_lock);
trace_jbd2_commit_logging(journal, commit_transaction);
stats.u.run.rs_logging = jiffies;
stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
stats.u.run.rs_logging);
stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
stats.u.run.rs_blocks_logged = 0;
stats.run.rs_logging = jiffies;
stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
stats.run.rs_logging);
stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
stats.run.rs_blocks_logged = 0;
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
......@@ -695,7 +695,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
submit_bh(write_op, bh);
}
cond_resched();
stats.u.run.rs_blocks_logged += bufs;
stats.run.rs_blocks_logged += bufs;
/* Force a new descriptor to be generated next
time round the loop. */
......@@ -988,33 +988,30 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(commit_transaction->t_state == T_COMMIT);
commit_transaction->t_start = jiffies;
stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
commit_transaction->t_start);
stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
commit_transaction->t_start);
/*
* File the transaction for history
* File the transaction statistics
*/
stats.ts_type = JBD2_STATS_RUN;
stats.ts_tid = commit_transaction->t_tid;
stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
spin_lock(&journal->j_history_lock);
memcpy(journal->j_history + journal->j_history_cur, &stats,
sizeof(stats));
if (++journal->j_history_cur == journal->j_history_max)
journal->j_history_cur = 0;
stats.run.rs_handle_count = commit_transaction->t_handle_count;
trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
commit_transaction->t_tid, &stats.run);
/*
* Calculate overall stats
*/
spin_lock(&journal->j_history_lock);
journal->j_stats.ts_tid++;
journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
journal->j_stats.run.rs_wait += stats.run.rs_wait;
journal->j_stats.run.rs_running += stats.run.rs_running;
journal->j_stats.run.rs_locked += stats.run.rs_locked;
journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
journal->j_stats.run.rs_logging += stats.run.rs_logging;
journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
spin_unlock(&journal->j_history_lock);
commit_transaction->t_state = T_FINISHED;
......
......@@ -136,10 +136,6 @@ static int kjournald2(void *arg)
journal->j_task = current;
wake_up(&journal->j_wait_done_commit);
printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
"commit interval %ld seconds\n", current->pid,
journal->j_devname, journal->j_commit_interval / HZ);
/*
* And now, wait forever for commit wakeup events.
*/
......@@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal)
{
struct task_struct *t;
t = kthread_run(kjournald2, journal, "kjournald2");
t = kthread_run(kjournald2, journal, "jbd2/%s",
journal->j_devname);
if (IS_ERR(t))
return PTR_ERR(t);
......@@ -679,153 +676,6 @@ struct jbd2_stats_proc_session {
int max;
};
static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
struct transaction_stats_s *ts,
int first)
{
if (ts == s->stats + s->max)
ts = s->stats;
if (!first && ts == s->stats + s->start)
return NULL;
while (ts->ts_type == 0) {
ts++;
if (ts == s->stats + s->max)
ts = s->stats;
if (ts == s->stats + s->start)
return NULL;
}
return ts;
}
static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
{
struct jbd2_stats_proc_session *s = seq->private;
struct transaction_stats_s *ts;
int l = *pos;
if (l == 0)
return SEQ_START_TOKEN;
ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
if (!ts)
return NULL;
l--;
while (l) {
ts = jbd2_history_skip_empty(s, ++ts, 0);
if (!ts)
break;
l--;
}
return ts;
}
static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct jbd2_stats_proc_session *s = seq->private;
struct transaction_stats_s *ts = v;
++*pos;
if (v == SEQ_START_TOKEN)
return jbd2_history_skip_empty(s, s->stats + s->start, 1);
else
return jbd2_history_skip_empty(s, ++ts, 0);
}
static int jbd2_seq_history_show(struct seq_file *seq, void *v)
{
struct transaction_stats_s *ts = v;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
"%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
"wait", "run", "lock", "flush", "log", "hndls",
"block", "inlog", "ctime", "write", "drop",
"close");
return 0;
}
if (ts->ts_type == JBD2_STATS_RUN)
seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
"%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
jiffies_to_msecs(ts->u.run.rs_wait),
jiffies_to_msecs(ts->u.run.rs_running),
jiffies_to_msecs(ts->u.run.rs_locked),
jiffies_to_msecs(ts->u.run.rs_flushing),
jiffies_to_msecs(ts->u.run.rs_logging),
ts->u.run.rs_handle_count,
ts->u.run.rs_blocks,
ts->u.run.rs_blocks_logged);
else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
"C", ts->ts_tid, " ",
jiffies_to_msecs(ts->u.chp.cs_chp_time),
ts->u.chp.cs_written, ts->u.chp.cs_dropped,
ts->u.chp.cs_forced_to_close);
else
J_ASSERT(0);
return 0;
}
static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
{
}
static const struct seq_operations jbd2_seq_history_ops = {
.start = jbd2_seq_history_start,
.next = jbd2_seq_history_next,
.stop = jbd2_seq_history_stop,
.show = jbd2_seq_history_show,
};
static int jbd2_seq_history_open(struct inode *inode, struct file *file)
{
journal_t *journal = PDE(inode)->data;
struct jbd2_stats_proc_session *s;
int rc, size;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL)
return -ENOMEM;
size = sizeof(struct transaction_stats_s) * journal->j_history_max;
s->stats = kmalloc(size, GFP_KERNEL);
if (s->stats == NULL) {
kfree(s);
return -ENOMEM;
}
spin_lock(&journal->j_history_lock);
memcpy(s->stats, journal->j_history, size);
s->max = journal->j_history_max;
s->start = journal->j_history_cur % s->max;
spin_unlock(&journal->j_history_lock);
rc = seq_open(file, &jbd2_seq_history_ops);
if (rc == 0) {
struct seq_file *m = file->private_data;
m->private = s;
} else {
kfree(s->stats);
kfree(s);
}
return rc;
}
static int jbd2_seq_history_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
struct jbd2_stats_proc_session *s = seq->private;
kfree(s->stats);
kfree(s);
return seq_release(inode, file);
}
static struct file_operations jbd2_seq_history_fops = {
.owner = THIS_MODULE,
.open = jbd2_seq_history_open,
.read = seq_read,
.llseek = seq_lseek,
.release = jbd2_seq_history_release,
};
static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
{
return *pos ? NULL : SEQ_START_TOKEN;
......@@ -842,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
if (v != SEQ_START_TOKEN)
return 0;
seq_printf(seq, "%lu transaction, each upto %u blocks\n",
seq_printf(seq, "%lu transaction, each up to %u blocks\n",
s->stats->ts_tid,
s->journal->j_max_transaction_buffers);
if (s->stats->ts_tid == 0)
return 0;
seq_printf(seq, "average: \n %ums waiting for transaction\n",
jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid));
jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
seq_printf(seq, " %ums running transaction\n",
jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid));
jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
seq_printf(seq, " %ums transaction was being locked\n",
jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid));
jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
seq_printf(seq, " %ums flushing data (in ordered mode)\n",
jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
seq_printf(seq, " %ums logging transaction\n",
jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
seq_printf(seq, " %lluus average transaction commit time\n",
div_u64(s->journal->j_average_commit_time, 1000));
seq_printf(seq, " %lu handles per transaction\n",
s->stats->u.run.rs_handle_count / s->stats->ts_tid);
s->stats->run.rs_handle_count / s->stats->ts_tid);
seq_printf(seq, " %lu blocks per transaction\n",
s->stats->u.run.rs_blocks / s->stats->ts_tid);
s->stats->run.rs_blocks / s->stats->ts_tid);
seq_printf(seq, " %lu logged blocks per transaction\n",
s->stats->u.run.rs_blocks_logged / s->stats->ts_tid);
s->stats->run.rs_blocks_logged / s->stats->ts_tid);
return 0;
}
......@@ -934,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal)
{
journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
if (journal->j_proc_entry) {
proc_create_data("history", S_IRUGO, journal->j_proc_entry,
&jbd2_seq_history_fops, journal);
proc_create_data("info", S_IRUGO, journal->j_proc_entry,
&jbd2_seq_info_fops, journal);
}
......@@ -944,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
static void jbd2_stats_proc_exit(journal_t *journal)
{
remove_proc_entry("info", journal->j_proc_entry);
remove_proc_entry("history", journal->j_proc_entry);
remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}
static void journal_init_stats(journal_t *journal)
{
int size;
if (!proc_jbd2_stats)
return;
journal->j_history_max = 100;
size = sizeof(struct transaction_stats_s) * journal->j_history_max;
journal->j_history = kzalloc(size, GFP_KERNEL);
if (!journal->j_history) {
journal->j_history_max = 0;
return;
}
spin_lock_init(&journal->j_history_lock);
}
/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
......@@ -1009,7 +839,7 @@ static journal_t * journal_init_common (void)
goto fail;
}
journal_init_stats(journal);
spin_lock_init(&journal->j_history_lock);
return journal;
fail:
......@@ -1115,7 +945,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
while ((p = strchr(p, '/')))
*p = '!';
p = journal->j_devname + strlen(journal->j_devname);
sprintf(p, ":%lu", journal->j_inode->i_ino);
sprintf(p, "-%lu", journal->j_inode->i_ino);
jbd_debug(1,
"journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
journal, inode->i_sb->s_id, inode->i_ino,
......
......@@ -464,9 +464,9 @@ struct handle_s
*/
struct transaction_chp_stats_s {
unsigned long cs_chp_time;
unsigned long cs_forced_to_close;
unsigned long cs_written;
unsigned long cs_dropped;
__u32 cs_forced_to_close;
__u32 cs_written;
__u32 cs_dropped;
};
/* The transaction_t type is the guts of the journaling mechanism. It
......@@ -668,23 +668,16 @@ struct transaction_run_stats_s {
unsigned long rs_flushing;
unsigned long rs_logging;
unsigned long rs_handle_count;
unsigned long rs_blocks;
unsigned long rs_blocks_logged;
__u32 rs_handle_count;
__u32 rs_blocks;
__u32 rs_blocks_logged;
};
struct transaction_stats_s {
int ts_type;
unsigned long ts_tid;
union {
struct transaction_run_stats_s run;
struct transaction_chp_stats_s chp;
} u;
struct transaction_run_stats_s run;
};
#define JBD2_STATS_RUN 1
#define JBD2_STATS_CHECKPOINT 2
static inline unsigned long
jbd2_time_diff(unsigned long start, unsigned long end)
{
......@@ -988,12 +981,6 @@ struct journal_s
/*
* Journal statistics
*/
struct transaction_stats_s *j_history;
int j_history_max;
int j_history_cur;
/*
* Protect the transactions statistics history
*/
spinlock_t j_history_lock;
struct proc_dir_entry *j_proc_entry;
struct transaction_stats_s j_stats;
......
......@@ -11,6 +11,7 @@ struct ext4_allocation_context;
struct ext4_allocation_request;
struct ext4_prealloc_space;
struct ext4_inode_info;
struct mpage_da_data;
#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
......@@ -236,6 +237,7 @@ TRACE_EVENT(ext4_da_writepages,
__field( char, for_kupdate )
__field( char, for_reclaim )
__field( char, range_cyclic )
__field( pgoff_t, writeback_index )
),
TP_fast_assign(
......@@ -249,15 +251,17 @@ TRACE_EVENT(ext4_da_writepages,
__entry->for_kupdate = wbc->for_kupdate;
__entry->for_reclaim = wbc->for_reclaim;
__entry->range_cyclic = wbc->range_cyclic;
__entry->writeback_index = inode->i_mapping->writeback_index;
),
TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d",
TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu",
jbd2_dev_to_name(__entry->dev),
(unsigned long) __entry->ino, __entry->nr_to_write,
__entry->pages_skipped, __entry->range_start,
__entry->range_end, __entry->nonblocking,
__entry->for_kupdate, __entry->for_reclaim,
__entry->range_cyclic)
__entry->range_cyclic,
(unsigned long) __entry->writeback_index)
);
TRACE_EVENT(ext4_da_write_pages,
......@@ -309,6 +313,7 @@ TRACE_EVENT(ext4_da_writepages_result,
__field( char, encountered_congestion )
__field( char, more_io )
__field( char, no_nrwrite_index_update )
__field( pgoff_t, writeback_index )
),
TP_fast_assign(
......@@ -320,14 +325,16 @@ TRACE_EVENT(ext4_da_writepages_result,
__entry->encountered_congestion = wbc->encountered_congestion;
__entry->more_io = wbc->more_io;
__entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update;
__entry->writeback_index = inode->i_mapping->writeback_index;
),
TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d",
TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu",
jbd2_dev_to_name(__entry->dev),
(unsigned long) __entry->ino, __entry->ret,
__entry->pages_written, __entry->pages_skipped,
__entry->encountered_congestion, __entry->more_io,
__entry->no_nrwrite_index_update)
__entry->no_nrwrite_index_update,
(unsigned long) __entry->writeback_index)
);
TRACE_EVENT(ext4_da_write_begin,
......@@ -737,6 +744,169 @@ TRACE_EVENT(ext4_alloc_da_blocks,
__entry->data_blocks, __entry->meta_blocks)
);
TRACE_EVENT(ext4_mballoc_alloc,
TP_PROTO(struct ext4_allocation_context *ac),
TP_ARGS(ac),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u16, found )
__field( __u16, groups )
__field( __u16, buddy )
__field( __u16, flags )
__field( __u16, tail )
__field( __u8, cr )
__field( __u32, orig_logical )
__field( int, orig_start )
__field( __u32, orig_group )
__field( int, orig_len )
__field( __u32, goal_logical )
__field( int, goal_start )
__field( __u32, goal_group )
__field( int, goal_len )
__field( __u32, result_logical )
__field( int, result_start )
__field( __u32, result_group )
__field( int, result_len )
),
TP_fast_assign(
__entry->dev = ac->ac_inode->i_sb->s_dev;
__entry->ino = ac->ac_inode->i_ino;
__entry->found = ac->ac_found;
__entry->flags = ac->ac_flags;
__entry->groups = ac->ac_groups_scanned;
__entry->buddy = ac->ac_buddy;
__entry->tail = ac->ac_tail;
__entry->cr = ac->ac_criteria;
__entry->orig_logical = ac->ac_o_ex.fe_logical;
__entry->orig_start = ac->ac_o_ex.fe_start;
__entry->orig_group = ac->ac_o_ex.fe_group;
__entry->orig_len = ac->ac_o_ex.fe_len;
__entry->goal_logical = ac->ac_g_ex.fe_logical;
__entry->goal_start = ac->ac_g_ex.fe_start;
__entry->goal_group = ac->ac_g_ex.fe_group;
__entry->goal_len = ac->ac_g_ex.fe_len;
__entry->result_logical = ac->ac_f_ex.fe_logical;
__entry->result_start = ac->ac_f_ex.fe_start;
__entry->result_group = ac->ac_f_ex.fe_group;
__entry->result_len = ac->ac_f_ex.fe_len;
),
TP_printk("dev %s inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
"result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x "
"tail %u broken %u",
jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
__entry->orig_group, __entry->orig_start,
__entry->orig_len, __entry->orig_logical,
__entry->goal_group, __entry->goal_start,
__entry->goal_len, __entry->goal_logical,
__entry->result_group, __entry->result_start,
__entry->result_len, __entry->result_logical,
__entry->found, __entry->groups, __entry->cr,
__entry->flags, __entry->tail,
__entry->buddy ? 1 << __entry->buddy : 0)
);
TRACE_EVENT(ext4_mballoc_prealloc,
TP_PROTO(struct ext4_allocation_context *ac),
TP_ARGS(ac),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u32, orig_logical )
__field( int, orig_start )
__field( __u32, orig_group )
__field( int, orig_len )
__field( __u32, result_logical )
__field( int, result_start )
__field( __u32, result_group )
__field( int, result_len )
),
TP_fast_assign(
__entry->dev = ac->ac_inode->i_sb->s_dev;
__entry->ino = ac->ac_inode->i_ino;
__entry->orig_logical = ac->ac_o_ex.fe_logical;
__entry->orig_start = ac->ac_o_ex.fe_start;
__entry->orig_group = ac->ac_o_ex.fe_group;
__entry->orig_len = ac->ac_o_ex.fe_len;
__entry->result_logical = ac->ac_b_ex.fe_logical;
__entry->result_start = ac->ac_b_ex.fe_start;
__entry->result_group = ac->ac_b_ex.fe_group;
__entry->result_len = ac->ac_b_ex.fe_len;
),
TP_printk("dev %s inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
__entry->orig_group, __entry->orig_start,
__entry->orig_len, __entry->orig_logical,
__entry->result_group, __entry->result_start,
__entry->result_len, __entry->result_logical)
);
TRACE_EVENT(ext4_mballoc_discard,
TP_PROTO(struct ext4_allocation_context *ac),
TP_ARGS(ac),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u32, result_logical )
__field( int, result_start )
__field( __u32, result_group )
__field( int, result_len )
),
TP_fast_assign(
__entry->dev = ac->ac_inode->i_sb->s_dev;
__entry->ino = ac->ac_inode->i_ino;
__entry->result_logical = ac->ac_b_ex.fe_logical;
__entry->result_start = ac->ac_b_ex.fe_start;
__entry->result_group = ac->ac_b_ex.fe_group;
__entry->result_len = ac->ac_b_ex.fe_len;
),
TP_printk("dev %s inode %lu extent %u/%d/%u@%u ",
jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
__entry->result_group, __entry->result_start,
__entry->result_len, __entry->result_logical)
);
TRACE_EVENT(ext4_mballoc_free,
TP_PROTO(struct ext4_allocation_context *ac),
TP_ARGS(ac),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u32, result_logical )
__field( int, result_start )
__field( __u32, result_group )
__field( int, result_len )
),
TP_fast_assign(
__entry->dev = ac->ac_inode->i_sb->s_dev;
__entry->ino = ac->ac_inode->i_ino;
__entry->result_logical = ac->ac_b_ex.fe_logical;
__entry->result_start = ac->ac_b_ex.fe_start;
__entry->result_group = ac->ac_b_ex.fe_group;
__entry->result_len = ac->ac_b_ex.fe_len;
),
TP_printk("dev %s inode %lu extent %u/%d/%u@%u ",
jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
__entry->result_group, __entry->result_start,
__entry->result_len, __entry->result_logical)
);
#endif /* _TRACE_EXT4_H */
/* This part must be outside protection */
......
......@@ -7,6 +7,9 @@
#include <linux/jbd2.h>
#include <linux/tracepoint.h>
struct transaction_chp_stats_s;
struct transaction_run_stats_s;
TRACE_EVENT(jbd2_checkpoint,
TP_PROTO(journal_t *journal, int result),
......@@ -162,6 +165,81 @@ TRACE_EVENT(jbd2_submit_inode_data,
jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino)
);
TRACE_EVENT(jbd2_run_stats,
TP_PROTO(dev_t dev, unsigned long tid,
struct transaction_run_stats_s *stats),
TP_ARGS(dev, tid, stats),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( unsigned long, tid )
__field( unsigned long, wait )
__field( unsigned long, running )
__field( unsigned long, locked )
__field( unsigned long, flushing )
__field( unsigned long, logging )
__field( __u32, handle_count )
__field( __u32, blocks )
__field( __u32, blocks_logged )
),
TP_fast_assign(
__entry->dev = dev;
__entry->tid = tid;
__entry->wait = stats->rs_wait;
__entry->running = stats->rs_running;
__entry->locked = stats->rs_locked;
__entry->flushing = stats->rs_flushing;
__entry->logging = stats->rs_logging;
__entry->handle_count = stats->rs_handle_count;
__entry->blocks = stats->rs_blocks;
__entry->blocks_logged = stats->rs_blocks_logged;
),
TP_printk("dev %s tid %lu wait %u running %u locked %u flushing %u "
"logging %u handle_count %u blocks %u blocks_logged %u",
jbd2_dev_to_name(__entry->dev), __entry->tid,
jiffies_to_msecs(__entry->wait),
jiffies_to_msecs(__entry->running),
jiffies_to_msecs(__entry->locked),
jiffies_to_msecs(__entry->flushing),
jiffies_to_msecs(__entry->logging),
__entry->handle_count, __entry->blocks,
__entry->blocks_logged)
);
TRACE_EVENT(jbd2_checkpoint_stats,
TP_PROTO(dev_t dev, unsigned long tid,
struct transaction_chp_stats_s *stats),
TP_ARGS(dev, tid, stats),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( unsigned long, tid )
__field( unsigned long, chp_time )
__field( __u32, forced_to_close )
__field( __u32, written )
__field( __u32, dropped )
),
TP_fast_assign(
__entry->dev = dev;
__entry->tid = tid;
__entry->chp_time = stats->cs_chp_time;
__entry->forced_to_close= stats->cs_forced_to_close;
__entry->written = stats->cs_written;
__entry->dropped = stats->cs_dropped;
),
TP_printk("dev %s tid %lu chp_time %u forced_to_close %u "
"written %u dropped %u",
jbd2_dev_to_name(__entry->dev), __entry->tid,
jiffies_to_msecs(__entry->chp_time),
__entry->forced_to_close, __entry->written, __entry->dropped)
);
#endif /* _TRACE_JBD2_H */
/* This part must be outside protection */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment