Commit ea6db58f authored by Linus Torvalds

Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2

* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (27 commits)
  ocfs2: Cache extent records
  ocfs2: Remember rw lock level during direct io
  ocfs2: Fix up i_blocks calculation to know about holes
  ocfs2: Fix extent lookup to return true size of holes
  ocfs2: Read from an unwritten extent returns zeros
  ocfs2: make room for unwritten extents flag
  ocfs2: Use own splice write actor
  ocfs2: Use do_sync_mapping_range() in ocfs2_zero_tail_for_truncate()
  [PATCH] Turn do_sync_file_range() into do_sync_mapping_range()
  ocfs2: zero tail of sparse files on truncate
  ocfs2: Teach ocfs2_get_block() about holes
  ocfs2: remove ocfs2_prepare_write() and ocfs2_commit_write()
  ocfs2: teach ocfs2_file_aio_write() about sparse files
  ocfs2: Turn off shared writeable mmap for local files systems with holes.
  ocfs2: abstract out allocation locking
  ocfs2: teach extend/truncate about sparse files
  ocfs2: temporarily remove extent map caching
  ocfs2: sparse b-tree support
  ocfs2: small cleanup of ocfs2_request_delete()
  ocfs2: remove unused code
  ...
parents c58b8e4a 83418978
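The changes below rework ocfs2's extent handling for sparse files: leaf extent records stop carrying a single 32-bit e_clusters count and instead use a 16-bit e_leaf_clusters plus flag space, while interior records keep a 32-bit e_int_clusters, which is why the alloc.c hunks below touch every place that used e_clusters. As a rough illustrative sketch of that record layout (field names follow the diff below; this is not quoted verbatim from ocfs2_fs.h):

struct ocfs2_extent_rec {
	__le32	e_cpos;			/* first logical cluster covered */
	union {
		__le32	e_int_clusters;		/* interior nodes: clusters in subtree */
		struct {
			__le16	e_leaf_clusters;	/* leaves: clusters mapped here */
			__u8	e_reserved1;
			__u8	e_flags;		/* room for the unwritten extent flag */
		};
	};
	__le64	e_blkno;		/* first physical block */
};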
...@@ -27,6 +27,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
...@@ -34,6 +35,7 @@
#include "ocfs2.h"
#include "alloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
...@@ -47,62 +49,242 @@
#include "buffer_head_io.h"
static int ocfs2_extent_contig(struct inode *inode,
struct ocfs2_extent_rec *ext,
u64 blkno);
static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
int wanted,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head *bhs[]);
static int ocfs2_add_branch(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
struct buffer_head *eb_bh,
struct buffer_head *last_eb_bh,
struct ocfs2_alloc_context *meta_ac);
static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **ret_new_eb_bh);
static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 blkno,
u32 new_clusters);
static int ocfs2_find_branch_target(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
struct buffer_head **target_bh);
static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
struct inode *inode,
struct ocfs2_dinode *fe,
unsigned int new_i_clusters,
struct buffer_head *old_last_eb,
struct buffer_head **new_last_eb);
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
/*
 * Structures which describe a path through a btree, and functions to
 * manipulate them.
 *
 * The idea here is to be as generic as possible with the tree
 * manipulation code.
 */
struct ocfs2_path_item {
struct buffer_head *bh;
struct ocfs2_extent_list *el;
};
#define OCFS2_MAX_PATH_DEPTH 5
struct ocfs2_path {
int p_tree_depth;
struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
};
#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
#define path_num_items(_path) ((_path)->p_tree_depth + 1)
/*
 * Reset the actual path elements so that we can re-use the structure
 * to build another path. Generally, this involves freeing the buffer
 * heads.
 */
static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
{
int i, start = 0, depth = 0;
struct ocfs2_path_item *node;
if (keep_root)
start = 1;
for(i = start; i < path_num_items(path); i++) {
node = &path->p_node[i];
brelse(node->bh);
node->bh = NULL;
node->el = NULL;
}
/*
* Tree depth may change during truncate, or insert. If we're
* keeping the root extent list, then make sure that our path
* structure reflects the proper depth.
*/
if (keep_root)
depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
path->p_tree_depth = depth;
}
static void ocfs2_free_path(struct ocfs2_path *path)
{
if (path) {
ocfs2_reinit_path(path, 0);
kfree(path);
}
}
/*
* Make the *dest path the same as src and re-initialize src path to
* have a root only.
*/
static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
int i;
BUG_ON(path_root_bh(dest) != path_root_bh(src));
for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
brelse(dest->p_node[i].bh);
dest->p_node[i].bh = src->p_node[i].bh;
dest->p_node[i].el = src->p_node[i].el;
src->p_node[i].bh = NULL;
src->p_node[i].el = NULL;
}
}
/*
* Insert an extent block at given index.
*
* This will not take an additional reference on eb_bh.
*/
static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
struct buffer_head *eb_bh)
{
struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
/*
* Right now, no root bh is an extent block, so this helps
* catch code errors with dinode trees. The assertion can be
* safely removed if we ever need to insert extent block
* structures at the root.
*/
BUG_ON(index == 0);
path->p_node[index].bh = eb_bh;
path->p_node[index].el = &eb->h_list;
}
static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
struct ocfs2_extent_list *root_el)
{
struct ocfs2_path *path;
BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
path = kzalloc(sizeof(*path), GFP_NOFS);
if (path) {
path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
get_bh(root_bh);
path_root_bh(path) = root_bh;
path_root_el(path) = root_el;
}
return path;
}
/*
* Allocate and initialize a new path based on a disk inode tree.
*/
static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
{
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_extent_list *el = &di->id2.i_list;
return ocfs2_new_path(di_bh, el);
}
/*
* Convenience function to journal all components in a path.
*/
static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
struct ocfs2_path *path)
{
int i, ret = 0;
if (!path)
goto out;
for(i = 0; i < path_num_items(path); i++) {
ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
}
out:
return ret;
}
enum ocfs2_contig_type {
CONTIG_NONE = 0,
CONTIG_LEFT,
CONTIG_RIGHT
};
/*
* NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
* ocfs2_extent_contig only work properly against leaf nodes!
*/
static int ocfs2_extent_contig(struct inode *inode,
struct ocfs2_extent_rec *ext,
u64 blkno)
{
return blkno == (le64_to_cpu(ext->e_blkno) +
ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(ext->e_clusters)));
}
static int ocfs2_block_extent_contig(struct super_block *sb,
struct ocfs2_extent_rec *ext,
u64 blkno)
{
u64 blk_end = le64_to_cpu(ext->e_blkno);
blk_end += ocfs2_clusters_to_blocks(sb,
le16_to_cpu(ext->e_leaf_clusters));
return blkno == blk_end;
}
static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
struct ocfs2_extent_rec *right)
{
u32 left_range;
left_range = le32_to_cpu(left->e_cpos) +
le16_to_cpu(left->e_leaf_clusters);
return (left_range == le32_to_cpu(right->e_cpos));
}
static enum ocfs2_contig_type
ocfs2_extent_contig(struct inode *inode,
struct ocfs2_extent_rec *ext,
struct ocfs2_extent_rec *insert_rec)
{
u64 blkno = le64_to_cpu(insert_rec->e_blkno);
if (ocfs2_extents_adjacent(ext, insert_rec) &&
ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
return CONTIG_RIGHT;
blkno = le64_to_cpu(ext->e_blkno);
if (ocfs2_extents_adjacent(insert_rec, ext) &&
ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
return CONTIG_LEFT;
return CONTIG_NONE;
}
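/*
 * Illustrative sketch, not part of this commit: how the helpers above
 * classify a new record against an existing one. The function name and
 * the concrete numbers are hypothetical; only ocfs2_extent_contig() and
 * the extent record fields come from the code above.
 */
static enum ocfs2_contig_type example_contig_check(struct inode *inode)
{
	struct ocfs2_extent_rec existing, incoming;

	memset(&existing, 0, sizeof(existing));
	memset(&incoming, 0, sizeof(incoming));

	/* 'existing' maps logical clusters [100, 108) starting at block 5000. */
	existing.e_cpos = cpu_to_le32(100);
	existing.e_blkno = cpu_to_le64(5000);
	existing.e_leaf_clusters = cpu_to_le16(8);

	/* 'incoming' maps [108, 112) and starts at the block just past it. */
	incoming.e_cpos = cpu_to_le32(108);
	incoming.e_blkno = cpu_to_le64(5000 +
			ocfs2_clusters_to_blocks(inode->i_sb, 8));
	incoming.e_leaf_clusters = cpu_to_le16(4);

	/* Logically and physically adjacent on the right: CONTIG_RIGHT. */
	return ocfs2_extent_contig(inode, &existing, &incoming);
}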
/*
* NOTE: We can have pretty much any combination of contiguousness and
* appending.
*
* The usefulness of APPEND_TAIL is more in that it lets us know that
* we'll have to update the path to that leaf.
*/
enum ocfs2_append_type {
APPEND_NONE = 0,
APPEND_TAIL,
};
struct ocfs2_insert_type {
enum ocfs2_append_type ins_appending;
enum ocfs2_contig_type ins_contig;
int ins_contig_index;
int ins_free_records;
int ins_tree_depth;
};
/*
 * How many free extents have we got before we need more meta data?
 */
...@@ -241,6 +423,28 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
return status;
}
/*
* Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
*
* Returns the sum of the rightmost extent rec logical offset and
* cluster count.
*
* ocfs2_add_branch() uses this to determine what logical cluster
* value should be populated into the leftmost new branch records.
*
* ocfs2_shift_tree_depth() uses this to determine the # clusters
* value for the new topmost tree record.
*/
static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
{
int i;
i = le16_to_cpu(el->l_next_free_rec) - 1;
return le32_to_cpu(el->l_recs[i].e_cpos) +
ocfs2_rec_clusters(el, &el->l_recs[i]);
}
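/*
 * Illustrative sketch, not part of this commit: with a single rightmost
 * record covering clusters [100, 108), the helper above returns
 * 100 + 8 = 108, the first logical cluster past everything currently in
 * the tree. The example function and values are hypothetical, and it
 * assumes l_recs[] is the trailing array on struct ocfs2_extent_list,
 * so the record is backed here by recs[].
 */
static u32 example_sum_rightmost(void)
{
	struct {
		struct ocfs2_extent_list el;
		struct ocfs2_extent_rec recs[1];
	} leaf;

	memset(&leaf, 0, sizeof(leaf));
	leaf.el.l_count = cpu_to_le16(1);
	leaf.el.l_next_free_rec = cpu_to_le16(1);
	leaf.el.l_recs[0].e_cpos = cpu_to_le32(100);
	leaf.el.l_recs[0].e_leaf_clusters = cpu_to_le16(8);

	return ocfs2_sum_rightmost_rec(&leaf.el);	/* 108 */
}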
/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the dinode
* to start at, if we don't want to start the branch at the dinode * to start at, if we don't want to start the branch at the dinode
...@@ -250,7 +454,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, ...@@ -250,7 +454,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
* for the new last extent block. * for the new last extent block.
* *
* the new branch will be 'empty' in the sense that every block will * the new branch will be 'empty' in the sense that every block will
* contain a single record with e_clusters == 0. * contain a single record with cluster count == 0.
*/ */
static int ocfs2_add_branch(struct ocfs2_super *osb,
handle_t *handle,
...@@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *eb_el;
struct ocfs2_extent_list *el;
u32 new_cpos;
mlog_entry_void();
...@@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
goto bail;
}
eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
 * linked with the rest of the tree.
 * conversly, new_eb_bhs[0] is the new bottommost leaf.
...@@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
eb->h_next_leaf_blk = 0;
eb_el->l_tree_depth = cpu_to_le16(i);
eb_el->l_next_free_rec = cpu_to_le16(1);
eb_el->l_recs[0].e_cpos = fe->i_clusters;
/*
 * This actually counts as an empty extent as
 * c_clusters == 0
 */
eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
/*
 * eb_el isn't always an interior node, but even leaf
 * nodes want a zero'd flags and reserved field so
 * this gets the whole 32 bits regardless of use.
 */
eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
if (!eb_el->l_tree_depth)
new_last_eb_blk = le64_to_cpu(eb->h_blkno);
...@@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 * either be on the fe, or the extent block passed in. */
i = le16_to_cpu(el->l_next_free_rec);
el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
el->l_recs[i].e_cpos = fe->i_clusters;
el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
el->l_recs[i].e_clusters = 0;
el->l_recs[i].e_int_clusters = 0;
le16_add_cpu(&el->l_next_free_rec, 1);
/* fe needs a new last extent block pointer, as does the
...@@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
struct buffer_head **ret_new_eb_bh)
{
int status, i;
u32 new_clusters;
struct buffer_head *new_eb_bh = NULL;
struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
...@@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
/* copy the fe data into the new extent block */
eb_el->l_tree_depth = fe_el->l_tree_depth;
eb_el->l_next_free_rec = fe_el->l_next_free_rec;
for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
}
for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
eb_el->l_recs[i] = fe_el->l_recs[i];
status = ocfs2_journal_dirty(handle, new_eb_bh);
if (status < 0) {
...@@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
goto bail;
}
new_clusters = ocfs2_sum_rightmost_rec(eb_el);
/* update fe now */
le16_add_cpu(&fe_el->l_tree_depth, 1);
fe_el->l_recs[0].e_cpos = 0;
fe_el->l_recs[0].e_blkno = eb->h_blkno;
fe_el->l_recs[0].e_clusters = fe->i_clusters;
fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
fe_el->l_recs[i].e_cpos = 0;
fe_el->l_recs[i].e_clusters = 0;
fe_el->l_recs[i].e_blkno = 0;
}
for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
fe_el->l_next_free_rec = cpu_to_le16(1);
/* If this is our 1st tree depth shift, then last_eb_blk
...@@ -514,199 +728,6 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
return status;
}
/*
* Expects the tree to already have room in the rightmost leaf for the
* extent. Updates all the extent blocks (and the dinode) on the way
* down.
*/
static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 start_blk,
u32 new_clusters)
{
int status, i, num_bhs = 0;
u64 next_blkno;
u16 next_free;
struct buffer_head **eb_bhs = NULL;
struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
mlog_entry_void();
status = ocfs2_journal_access(handle, inode, fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
fe = (struct ocfs2_dinode *) fe_bh->b_data;
el = &fe->id2.i_list;
if (el->l_tree_depth) {
/* This is another operation where we want to be
* careful about our tree updates. An error here means
* none of the previous changes we made should roll
* forward. As a result, we have to record the buffers
* for this part of the tree in an array and reserve a
* journal write to them before making any changes. */
num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!eb_bhs) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
i = 0;
while(el->l_tree_depth) {
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
ocfs2_error(inode->i_sb,
"Dinode %llu has a bad extent list",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
status = -EIO;
goto bail;
}
next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
BUG_ON(i >= num_bhs);
status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
OCFS2_BH_CACHED, inode);
if (status < 0) {
mlog_errno(status);
goto bail;
}
eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
eb);
status = -EIO;
goto bail;
}
status = ocfs2_journal_access(handle, inode, eb_bhs[i],
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
el = &eb->h_list;
i++;
/* When we leave this loop, eb_bhs[num_bhs - 1] will
* hold the bottom-most leaf extent block. */
}
BUG_ON(el->l_tree_depth);
el = &fe->id2.i_list;
/* If we have tree depth, then the fe update is
* trivial, and we want to switch el out for the
* bottom-most leaf in order to update it with the
* actual extent data below. */
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
ocfs2_error(inode->i_sb,
"Dinode %llu has a bad extent list",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
status = -EIO;
goto bail;
}
le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
new_clusters);
/* (num_bhs - 1) to avoid the leaf */
for(i = 0; i < (num_bhs - 1); i++) {
eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
el = &eb->h_list;
/* finally, make our actual change to the
* intermediate extent blocks. */
next_free = le16_to_cpu(el->l_next_free_rec);
le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
new_clusters);
status = ocfs2_journal_dirty(handle, eb_bhs[i]);
if (status < 0)
mlog_errno(status);
}
BUG_ON(i != (num_bhs - 1));
/* note that the leaf block wasn't touched in
* the loop above */
eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
el = &eb->h_list;
BUG_ON(el->l_tree_depth);
}
/* yay, we can finally add the actual extent now! */
i = le16_to_cpu(el->l_next_free_rec) - 1;
if (le16_to_cpu(el->l_next_free_rec) &&
ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
} else if (le16_to_cpu(el->l_next_free_rec) &&
(le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
/* having an empty extent at eof is legal. */
if (el->l_recs[i].e_cpos != fe->i_clusters) {
ocfs2_error(inode->i_sb,
"Dinode %llu trailing extent is bad: "
"cpos (%u) != number of clusters (%u)",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
le32_to_cpu(el->l_recs[i].e_cpos),
le32_to_cpu(fe->i_clusters));
status = -EIO;
goto bail;
}
el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
} else {
/* No contiguous record, or no empty record at eof, so
* we add a new one. */
BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
le16_to_cpu(el->l_count));
i = le16_to_cpu(el->l_next_free_rec);
el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
el->l_recs[i].e_cpos = fe->i_clusters;
le16_add_cpu(&el->l_next_free_rec, 1);
}
/*
* extent_map errors are not fatal, so they are ignored outside
* of flushing the thing.
*/
status = ocfs2_extent_map_append(inode, &el->l_recs[i],
new_clusters);
if (status) {
mlog_errno(status);
ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
}
status = ocfs2_journal_dirty(handle, fe_bh);
if (status < 0)
mlog_errno(status);
if (fe->id2.i_list.l_tree_depth) {
status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
if (status < 0)
mlog_errno(status);
}
status = 0;
bail:
if (eb_bhs) {
for (i = 0; i < num_bhs; i++)
if (eb_bhs[i])
brelse(eb_bhs[i]);
kfree(eb_bhs);
}
mlog_exit(status);
return status;
}
/* /*
* Should only be called when there is no space left in any of the * Should only be called when there is no space left in any of the
* leaf nodes. What we want to do is find the lowest tree depth * leaf nodes. What we want to do is find the lowest tree depth
...@@ -807,53 +828,1548 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
return status;
}
/* the caller needs to update fe->i_clusters */
int ocfs2_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 start_blk,
u32 new_clusters,
struct ocfs2_alloc_context *meta_ac)
/*
 * This is only valid for leaf nodes, which are the only ones that can
 * have empty extents anyway.
 */
static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
{
return !rec->e_leaf_clusters;
}
/*
* This function will discard the rightmost extent record.
*/
static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
{
int next_free = le16_to_cpu(el->l_next_free_rec);
int count = le16_to_cpu(el->l_count);
unsigned int num_bytes;
BUG_ON(!next_free);
/* This will cause us to go off the end of our extent list. */
BUG_ON(next_free >= count);
num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
}
static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
struct ocfs2_extent_rec *insert_rec)
{
int i, insert_index, next_free, has_empty, num_bytes;
u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
struct ocfs2_extent_rec *rec;
next_free = le16_to_cpu(el->l_next_free_rec);
has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
BUG_ON(!next_free);
/* The tree code before us didn't allow enough room in the leaf. */
if (el->l_next_free_rec == el->l_count && !has_empty)
BUG();
/*
* The easiest way to approach this is to just remove the
* empty extent and temporarily decrement next_free.
*/
if (has_empty) {
/*
* If next_free was 1 (only an empty extent), this
* loop won't execute, which is fine. We still want
* the decrement above to happen.
*/
for(i = 0; i < (next_free - 1); i++)
el->l_recs[i] = el->l_recs[i+1];
next_free--;
}
/*
* Figure out what the new record index should be.
*/
for(i = 0; i < next_free; i++) {
rec = &el->l_recs[i];
if (insert_cpos < le32_to_cpu(rec->e_cpos))
break;
}
insert_index = i;
mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
BUG_ON(insert_index < 0);
BUG_ON(insert_index >= le16_to_cpu(el->l_count));
BUG_ON(insert_index > next_free);
/*
* No need to memmove if we're just adding to the tail.
*/
if (insert_index != next_free) {
BUG_ON(next_free >= le16_to_cpu(el->l_count));
num_bytes = next_free - insert_index;
num_bytes *= sizeof(struct ocfs2_extent_rec);
memmove(&el->l_recs[insert_index + 1],
&el->l_recs[insert_index],
num_bytes);
}
/*
* Either we had an empty extent, and need to re-increment or
* there was no empty extent on a non full rightmost leaf node,
* in which case we still need to increment.
*/
next_free++;
el->l_next_free_rec = cpu_to_le16(next_free);
/*
* Make sure none of the math above just messed up our tree.
*/
BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
el->l_recs[insert_index] = *insert_rec;
}
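/*
 * Illustrative sketch, not part of this commit: what ocfs2_rotate_leaf()
 * does to a small leaf. The example function and numbers are
 * hypothetical; it assumes l_recs[] is the trailing array on
 * struct ocfs2_extent_list, so the records are backed by recs[] here.
 */
static void example_rotate_leaf(void)
{
	struct {
		struct ocfs2_extent_list el;
		struct ocfs2_extent_rec recs[3];
	} leaf;
	struct ocfs2_extent_list *el = &leaf.el;
	struct ocfs2_extent_rec ins;

	memset(&leaf, 0, sizeof(leaf));
	memset(&ins, 0, sizeof(ins));

	el->l_count = cpu_to_le16(3);
	el->l_next_free_rec = cpu_to_le16(2);
	/* The leaf currently holds an empty record and [8, +4 clusters). */
	el->l_recs[1].e_cpos = cpu_to_le32(8);
	el->l_recs[1].e_leaf_clusters = cpu_to_le16(4);

	/*
	 * Insert [4, +4 clusters): the empty slot is consumed, the new
	 * record lands at index 0 and the existing one shifts right,
	 * leaving [4, +4) [8, +4) with l_next_free_rec back at 2.
	 */
	ins.e_cpos = cpu_to_le32(4);
	ins.e_leaf_clusters = cpu_to_le16(4);
	ocfs2_rotate_leaf(el, &ins);
}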
/*
 * Create an empty extent record.
*
* l_next_free_rec may be updated.
*
* If an empty extent already exists do nothing.
*/
static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
{
int next_free = le16_to_cpu(el->l_next_free_rec);
BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
if (next_free == 0)
goto set_and_inc;
if (ocfs2_is_empty_extent(&el->l_recs[0]))
return;
mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
"Asked to create an empty extent in a full list:\n"
"count = %u, tree depth = %u",
le16_to_cpu(el->l_count),
le16_to_cpu(el->l_tree_depth));
ocfs2_shift_records_right(el);
set_and_inc:
le16_add_cpu(&el->l_next_free_rec, 1);
memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
}
/*
* For a rotation which involves two leaf nodes, the "root node" is
* the lowest level tree node which contains a path to both leafs. This
* resulting set of information can be used to form a complete "subtree"
*
* This function is passed two full paths from the dinode down to a
 * pair of adjacent leaves. Its task is to figure out which path
* index contains the subtree root - this can be the root index itself
* in a worst-case rotation.
*
* The array index of the subtree root is passed back.
*/
static int ocfs2_find_subtree_root(struct inode *inode,
struct ocfs2_path *left,
struct ocfs2_path *right)
{
int i = 0;
/*
* Check that the caller passed in two paths from the same tree.
*/
BUG_ON(path_root_bh(left) != path_root_bh(right));
do {
i++;
/*
* The caller didn't pass two adjacent paths.
*/
mlog_bug_on_msg(i > left->p_tree_depth,
"Inode %lu, left depth %u, right depth %u\n"
"left leaf blk %llu, right leaf blk %llu\n",
inode->i_ino, left->p_tree_depth,
right->p_tree_depth,
(unsigned long long)path_leaf_bh(left)->b_blocknr,
(unsigned long long)path_leaf_bh(right)->b_blocknr);
} while (left->p_node[i].bh->b_blocknr ==
right->p_node[i].bh->b_blocknr);
return i - 1;
}
typedef void (path_insert_t)(void *, struct buffer_head *);
/*
* Traverse a btree path in search of cpos, starting at root_el.
*
* This code can be called with a cpos larger than the tree, in which
* case it will return the rightmost path.
*/
static int __ocfs2_find_path(struct inode *inode,
struct ocfs2_extent_list *root_el, u32 cpos,
path_insert_t *func, void *data)
{
int i, ret = 0;
u32 range;
u64 blkno;
struct buffer_head *bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
el = root_el;
while (el->l_tree_depth) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(inode->i_sb,
"Inode %llu has empty extent list at "
"depth %u\n",
(unsigned long long)oi->ip_blkno,
le16_to_cpu(el->l_tree_depth));
ret = -EROFS;
goto out;
}
for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
rec = &el->l_recs[i];
/*
* In the case that cpos is off the allocation
* tree, this should just wind up returning the
* rightmost record.
*/
range = le32_to_cpu(rec->e_cpos) +
ocfs2_rec_clusters(el, rec);
if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
break;
}
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (blkno == 0) {
ocfs2_error(inode->i_sb,
"Inode %llu has bad blkno in extent list "
"at depth %u (index %d)\n",
(unsigned long long)oi->ip_blkno,
le16_to_cpu(el->l_tree_depth), i);
ret = -EROFS;
goto out;
}
brelse(bh);
bh = NULL;
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
&bh, OCFS2_BH_CACHED, inode);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) bh->b_data;
el = &eb->h_list;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
ret = -EIO;
goto out;
}
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
ocfs2_error(inode->i_sb,
"Inode %llu has bad count in extent list "
"at block %llu (next free=%u, count=%u)\n",
(unsigned long long)oi->ip_blkno,
(unsigned long long)bh->b_blocknr,
le16_to_cpu(el->l_next_free_rec),
le16_to_cpu(el->l_count));
ret = -EROFS;
goto out;
}
if (func)
func(data, bh);
}
out:
/*
* Catch any trailing bh that the loop didn't handle.
*/
brelse(bh);
return ret;
}
/*
* Given an initialized path (that is, it has a valid root extent
* list), this function will traverse the btree in search of the path
* which would contain cpos.
*
* The path traveled is recorded in the path structure.
*
* Note that this will not do any comparisons on leaf node extent
* records, so it will work fine in the case that we just added a tree
* branch.
*/
struct find_path_data {
int index;
struct ocfs2_path *path;
};
static void find_path_ins(void *data, struct buffer_head *bh)
{
struct find_path_data *fp = data;
get_bh(bh);
ocfs2_path_insert_eb(fp->path, fp->index, bh);
fp->index++;
}
static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
u32 cpos)
{
struct find_path_data data;
data.index = 1;
data.path = path;
return __ocfs2_find_path(inode, path_root_el(path), cpos,
find_path_ins, &data);
}
static void find_leaf_ins(void *data, struct buffer_head *bh)
{
struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
struct ocfs2_extent_list *el = &eb->h_list;
struct buffer_head **ret = data;
/* We want to retain only the leaf block. */
if (le16_to_cpu(el->l_tree_depth) == 0) {
get_bh(bh);
*ret = bh;
}
}
/*
* Find the leaf block in the tree which would contain cpos. No
* checking of the actual leaf is done.
*
* Some paths want to call this instead of allocating a path structure
* and calling ocfs2_find_path().
*
* This function doesn't handle non btree extent lists.
*/
int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
u32 cpos, struct buffer_head **leaf_bh)
{
int ret;
struct buffer_head *bh = NULL;
ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
if (ret) {
mlog_errno(ret);
goto out;
}
*leaf_bh = bh;
out:
return ret;
}
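/*
 * Illustrative usage sketch, not part of this commit: the typical
 * lifecycle of the path helpers above - build a path from the dinode,
 * walk it down to the leaf covering a logical cluster, look at the leaf,
 * then free the path. The function name is hypothetical and error
 * handling is trimmed for brevity.
 */
static int example_walk_to_leaf(struct inode *inode,
				struct buffer_head *di_bh, u32 cpos)
{
	int ret;
	struct ocfs2_path *path;
	struct ocfs2_extent_list *leaf_el;

	path = ocfs2_new_inode_path(di_bh);
	if (!path)
		return -ENOMEM;

	ret = ocfs2_find_path(inode, path, cpos);
	if (ret == 0) {
		leaf_el = path_leaf_el(path);
		mlog(0, "leaf for cpos %u has %u used records\n", cpos,
		     le16_to_cpu(leaf_el->l_next_free_rec));
	}

	ocfs2_free_path(path);
	return ret;
}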
/*
* Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
*
* Basically, we've moved stuff around at the bottom of the tree and
* we need to fix up the extent records above the changes to reflect
* the new changes.
*
* left_rec: the record on the left.
* left_child_el: is the child list pointed to by left_rec
* right_rec: the record to the right of left_rec
* right_child_el: is the child list pointed to by right_rec
*
* By definition, this only works on interior nodes.
*/
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
struct ocfs2_extent_list *left_child_el,
struct ocfs2_extent_rec *right_rec,
struct ocfs2_extent_list *right_child_el)
{
u32 left_clusters, right_end;
/*
* Interior nodes never have holes. Their cpos is the cpos of
* the leftmost record in their child list. Their cluster
* count covers the full theoretical range of their child list
* - the range between their cpos and the cpos of the record
* immediately to their right.
*/
left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
left_clusters -= le32_to_cpu(left_rec->e_cpos);
left_rec->e_int_clusters = cpu_to_le32(left_clusters);
/*
* Calculate the rightmost cluster count boundary before
* moving cpos - we will need to adjust clusters after
* updating e_cpos to keep the same highest cluster count.
*/
right_end = le32_to_cpu(right_rec->e_cpos);
right_end += le32_to_cpu(right_rec->e_int_clusters);
right_rec->e_cpos = left_rec->e_cpos;
le32_add_cpu(&right_rec->e_cpos, left_clusters);
right_end -= le32_to_cpu(right_rec->e_cpos);
right_rec->e_int_clusters = cpu_to_le32(right_end);
}
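/*
 * Worked example (illustrative, not part of this commit): say the left
 * record starts at cpos 100, the right record covered [200, 300) before
 * the rotation, and the right child's first live record now begins at
 * cpos 150. Then:
 *
 *   left_clusters            = 150 - 100 = 50, so left_rec covers [100, 150)
 *   right_end                = 200 + 100 = 300
 *   right_rec.e_cpos         = 100 + 50  = 150
 *   right_rec.e_int_clusters = 300 - 150 = 150, so right_rec covers [150, 300)
 *
 * The two parent records still tile [100, 300) with no gap or overlap.
 */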
/*
* Adjust the adjacent root node records involved in a
* rotation. left_el_blkno is passed in as a key so that we can easily
 * find its index in the root list.
*/
static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
struct ocfs2_extent_list *left_el,
struct ocfs2_extent_list *right_el,
u64 left_el_blkno)
{
int i;
BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
le16_to_cpu(left_el->l_tree_depth));
for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
break;
}
/*
* The path walking code should have never returned a root and
* two paths which are not adjacent.
*/
BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
&root_el->l_recs[i + 1], right_el);
}
/*
* We've changed a leaf block (in right_path) and need to reflect that
* change back up the subtree.
*
* This happens in multiple places:
* - When we've moved an extent record from the left path leaf to the right
* path leaf to make room for an empty extent in the left path leaf.
* - When our insert into the right path leaf is at the leftmost edge
 *   and requires an update of the path immediately to its left. This
* can occur at the end of some types of rotation and appending inserts.
*/
static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
int subtree_index)
{
int ret, i, idx;
struct ocfs2_extent_list *el, *left_el, *right_el;
struct ocfs2_extent_rec *left_rec, *right_rec;
struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
/*
* Update the counts and position values within all the
* interior nodes to reflect the leaf rotation we just did.
*
* The root node is handled below the loop.
*
* We begin the loop with right_el and left_el pointing to the
* leaf lists and work our way up.
*
* NOTE: within this loop, left_el and right_el always refer
* to the *child* lists.
*/
left_el = path_leaf_el(left_path);
right_el = path_leaf_el(right_path);
for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
mlog(0, "Adjust records at index %u\n", i);
/*
* One nice property of knowing that all of these
* nodes are below the root is that we only deal with
* the leftmost right node record and the rightmost
* left node record.
*/
el = left_path->p_node[i].el;
idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
left_rec = &el->l_recs[idx];
el = right_path->p_node[i].el;
right_rec = &el->l_recs[0];
ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
right_el);
ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
if (ret)
mlog_errno(ret);
ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
if (ret)
mlog_errno(ret);
/*
* Setup our list pointers now so that the current
* parents become children in the next iteration.
*/
left_el = left_path->p_node[i].el;
right_el = right_path->p_node[i].el;
}
/*
* At the root node, adjust the two adjacent records which
* begin our path to the leaves.
*/
el = left_path->p_node[subtree_index].el;
left_el = left_path->p_node[subtree_index + 1].el;
right_el = right_path->p_node[subtree_index + 1].el;
ocfs2_adjust_root_records(el, left_el, right_el,
left_path->p_node[subtree_index + 1].bh->b_blocknr);
root_bh = left_path->p_node[subtree_index].bh;
ret = ocfs2_journal_dirty(handle, root_bh);
if (ret)
mlog_errno(ret);
}
static int ocfs2_rotate_subtree_right(struct inode *inode,
handle_t *handle,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
int subtree_index)
{
int ret, i;
struct buffer_head *right_leaf_bh;
struct buffer_head *left_leaf_bh = NULL;
struct buffer_head *root_bh;
struct ocfs2_extent_list *right_el, *left_el;
struct ocfs2_extent_rec move_rec;
left_leaf_bh = path_leaf_bh(left_path);
left_el = path_leaf_el(left_path);
if (left_el->l_next_free_rec != left_el->l_count) {
ocfs2_error(inode->i_sb,
"Inode %llu has non-full interior leaf node %llu"
"(next free = %u)",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)left_leaf_bh->b_blocknr,
le16_to_cpu(left_el->l_next_free_rec));
return -EROFS;
}
/*
* This extent block may already have an empty record, so we
* return early if so.
*/
if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
return 0;
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
ret = ocfs2_journal_access(handle, inode, root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
ret = ocfs2_journal_access(handle, inode,
right_path->p_node[i].bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access(handle, inode,
left_path->p_node[i].bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
}
right_leaf_bh = path_leaf_bh(right_path);
right_el = path_leaf_el(right_path);
/* This is a code error, not a disk corruption. */
mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
"because rightmost leaf block %llu is empty\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)right_leaf_bh->b_blocknr);
ocfs2_create_empty_extent(right_el);
ret = ocfs2_journal_dirty(handle, right_leaf_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
/* Do the copy now. */
i = le16_to_cpu(left_el->l_next_free_rec) - 1;
move_rec = left_el->l_recs[i];
right_el->l_recs[0] = move_rec;
/*
* Clear out the record we just copied and shift everything
* over, leaving an empty extent in the left leaf.
*
* We temporarily subtract from next_free_rec so that the
* shift will lose the tail record (which is now defunct).
*/
le16_add_cpu(&left_el->l_next_free_rec, -1);
ocfs2_shift_records_right(left_el);
memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
le16_add_cpu(&left_el->l_next_free_rec, 1);
ret = ocfs2_journal_dirty(handle, left_leaf_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
subtree_index);
out:
return ret;
}
/*
* Given a full path, determine what cpos value would return us a path
* containing the leaf immediately to the left of the current one.
*
* Will return zero if the path passed in is already the leftmost path.
*/
static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
struct ocfs2_path *path, u32 *cpos)
{
int i, j, ret = 0;
u64 blkno;
struct ocfs2_extent_list *el;
BUG_ON(path->p_tree_depth == 0);
*cpos = 0;
blkno = path_leaf_bh(path)->b_blocknr;
/* Start at the tree node just above the leaf and work our way up. */
i = path->p_tree_depth - 1;
while (i >= 0) {
el = path->p_node[i].el;
/*
* Find the extent record just before the one in our
* path.
*/
for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
if (j == 0) {
if (i == 0) {
/*
* We've determined that the
* path specified is already
* the leftmost one - return a
* cpos of zero.
*/
goto out;
}
/*
* The leftmost record points to our
* leaf - we need to travel up the
* tree one level.
*/
goto next_node;
}
*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
*cpos = *cpos + ocfs2_rec_clusters(el,
&el->l_recs[j - 1]);
*cpos = *cpos - 1;
goto out;
}
}
/*
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
ocfs2_error(sb,
"Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
next_node:
blkno = path->p_node[i].bh->b_blocknr;
i--;
}
out:
return ret;
}
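/*
 * Worked example (illustrative, not part of this commit): if the parent
 * record pointing at our leaf sits at index j and the record at index
 * j - 1 covers clusters [64, 96), the cpos handed back is
 * 64 + 32 - 1 = 95. Any cpos inside that neighbouring record would do;
 * the caller only uses it to find its way to the left leaf.
 */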
static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
struct ocfs2_path *path)
{
int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
if (handle->h_buffer_credits < credits)
return ocfs2_extend_trans(handle, credits);
return 0;
}
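/*
 * Worked example (illustrative, not part of this commit): with a tree of
 * depth 3 and the subtree root found at depth 1, one subtree rotation
 * dirties (3 - 1) * 2 + 1 = 5 blocks (both partial paths below the
 * subtree root plus the subtree root itself), so the handle is extended
 * to at least 5 credits before rotating.
 */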
/*
* Trap the case where we're inserting into the theoretical range past
* the _actual_ left leaf range. Otherwise, we'll rotate a record
* whose cpos is less than ours into the right leaf.
*
* It's only necessary to look at the rightmost record of the left
* leaf because the logic that calls us should ensure that the
* theoretical ranges in the path components above the leaves are
* correct.
*/
static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
u32 insert_cpos)
{
struct ocfs2_extent_list *left_el;
struct ocfs2_extent_rec *rec;
int next_free;
left_el = path_leaf_el(left_path);
next_free = le16_to_cpu(left_el->l_next_free_rec);
rec = &left_el->l_recs[next_free - 1];
if (insert_cpos > le32_to_cpu(rec->e_cpos))
return 1;
return 0;
}
/*
* Rotate all the records in a btree right one record, starting at insert_cpos.
*
* The path to the rightmost leaf should be passed in.
*
* The array is assumed to be large enough to hold an entire path (tree depth).
*
 * Upon successful return from this function:
*
* - The 'right_path' array will contain a path to the leaf block
* whose range contains e_cpos.
* - That leaf block will have a single empty extent in list index 0.
* - In the case that the rotation requires a post-insert update,
* *ret_left_path will contain a valid path which can be passed to
* ocfs2_insert_path().
*/
static int ocfs2_rotate_tree_right(struct inode *inode,
handle_t *handle,
u32 insert_cpos,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
{
int ret, start;
u32 cpos;
struct ocfs2_path *left_path = NULL;
*ret_left_path = NULL;
left_path = ocfs2_new_path(path_root_bh(right_path),
path_root_el(right_path));
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
/*
* What we want to do here is:
*
* 1) Start with the rightmost path.
*
* 2) Determine a path to the leaf block directly to the left
* of that leaf.
*
* 3) Determine the 'subtree root' - the lowest level tree node
* which contains a path to both leaves.
*
* 4) Rotate the subtree.
*
* 5) Find the next subtree by considering the left path to be
* the new right path.
*
* The check at the top of this while loop also accepts
* insert_cpos == cpos because cpos is only a _theoretical_
* value to get us the left path - insert_cpos might very well
* be filling that hole.
*
* Stop at a cpos of '0' because we either started at the
* leftmost branch (i.e., a tree with one branch and a
* rotation inside of it), or we've gone as far as we can in
* rotating subtrees.
*/
while (cpos && insert_cpos <= cpos) {
mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
insert_cpos, cpos);
ret = ocfs2_find_path(inode, left_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
mlog_bug_on_msg(path_leaf_bh(left_path) ==
path_leaf_bh(right_path),
"Inode %lu: error during insert of %u "
"(left path cpos %u) results in two identical "
"paths ending at %llu\n",
inode->i_ino, insert_cpos, cpos,
(unsigned long long)
path_leaf_bh(left_path)->b_blocknr);
if (ocfs2_rotate_requires_path_adjustment(left_path,
insert_cpos)) {
mlog(0, "Path adjustment required\n");
/*
* We've rotated the tree as much as we
* should. The rest is up to
* ocfs2_insert_path() to complete, after the
* record insertion. We indicate this
* situation by returning the left path.
*
* The reason we don't adjust the records here
* before the record insert is that an error
* later might break the rule where a parent
* record e_cpos will reflect the actual
* e_cpos of the 1st nonempty record of the
* child list.
*/
*ret_left_path = left_path;
goto out_ret_path;
}
start = ocfs2_find_subtree_root(inode, left_path, right_path);
mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
start,
(unsigned long long) right_path->p_node[start].bh->b_blocknr,
right_path->p_tree_depth);
ret = ocfs2_extend_rotate_transaction(handle, start,
right_path);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
right_path, start);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* There is no need to re-read the next right path
* as we know that it'll be our current left
* path. Optimize by copying values instead.
*/
ocfs2_mv_path(right_path, left_path);
ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
&cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
}
out:
ocfs2_free_path(left_path);
out_ret_path:
return ret;
}
/*
* Do the final bits of extent record insertion at the target leaf
* list. If this leaf is part of an allocation tree, it is assumed
* that the tree above has been prepared.
*/
static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
struct ocfs2_extent_list *el,
struct ocfs2_insert_type *insert,
struct inode *inode)
{
int i = insert->ins_contig_index;
unsigned int range;
struct ocfs2_extent_rec *rec;
BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
/*
* Contiguous insert - either left or right.
*/
if (insert->ins_contig != CONTIG_NONE) {
rec = &el->l_recs[i];
if (insert->ins_contig == CONTIG_LEFT) {
rec->e_blkno = insert_rec->e_blkno;
rec->e_cpos = insert_rec->e_cpos;
}
le16_add_cpu(&rec->e_leaf_clusters,
le16_to_cpu(insert_rec->e_leaf_clusters));
return;
}
/*
* Handle insert into an empty leaf.
*/
if (le16_to_cpu(el->l_next_free_rec) == 0 ||
((le16_to_cpu(el->l_next_free_rec) == 1) &&
ocfs2_is_empty_extent(&el->l_recs[0]))) {
el->l_recs[0] = *insert_rec;
el->l_next_free_rec = cpu_to_le16(1);
return;
}
/*
* Appending insert.
*/
if (insert->ins_appending == APPEND_TAIL) {
i = le16_to_cpu(el->l_next_free_rec) - 1;
rec = &el->l_recs[i];
range = le32_to_cpu(rec->e_cpos)
+ le16_to_cpu(rec->e_leaf_clusters);
BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
le16_to_cpu(el->l_count),
"inode %lu, depth %u, count %u, next free %u, "
"rec.cpos %u, rec.clusters %u, "
"insert.cpos %u, insert.clusters %u\n",
inode->i_ino,
le16_to_cpu(el->l_tree_depth),
le16_to_cpu(el->l_count),
le16_to_cpu(el->l_next_free_rec),
le32_to_cpu(el->l_recs[i].e_cpos),
le16_to_cpu(el->l_recs[i].e_leaf_clusters),
le32_to_cpu(insert_rec->e_cpos),
le16_to_cpu(insert_rec->e_leaf_clusters));
i++;
el->l_recs[i] = *insert_rec;
le16_add_cpu(&el->l_next_free_rec, 1);
return;
}
/*
* Ok, we have to rotate.
*
* At this point, it is safe to assume that inserting into an
* empty leaf and appending to a leaf have both been handled
* above.
*
* This leaf needs to have space, either by the empty 1st
* extent record, or by virtue of an l_next_rec < l_count.
*/
ocfs2_rotate_leaf(el, insert_rec);
}
static inline void ocfs2_update_dinode_clusters(struct inode *inode,
struct ocfs2_dinode *di,
u32 clusters)
{
le32_add_cpu(&di->i_clusters, clusters);
spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
spin_unlock(&OCFS2_I(inode)->ip_lock);
}
static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
{
int ret, i, next_free;
struct buffer_head *bh;
struct ocfs2_extent_list *el;
struct ocfs2_path *left_path = NULL;
*ret_left_path = NULL;
/*
* This shouldn't happen for non-trees. The extent rec cluster
* count manipulation below only works for interior nodes.
*/
BUG_ON(right_path->p_tree_depth == 0);
/*
* If our appending insert is at the leftmost edge of a leaf,
* then we might need to update the rightmost records of the
* neighboring path.
*/
el = path_leaf_el(right_path);
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0 ||
(next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
u32 left_cpos;
ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
&left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
mlog(0, "Append may need a left path update. cpos: %u, "
"left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
left_cpos);
/*
* No need to worry if the append is already in the
* leftmost leaf.
*/
if (left_cpos) {
left_path = ocfs2_new_path(path_root_bh(right_path),
path_root_el(right_path));
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ret = ocfs2_find_path(inode, left_path, left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* ocfs2_insert_path() will pass the left_path to the
* journal for us.
*/
}
}
ret = ocfs2_journal_access_path(inode, handle, right_path);
if (ret) {
mlog_errno(ret);
goto out;
}
el = path_root_el(right_path);
bh = path_root_bh(right_path);
i = 0;
while (1) {
struct ocfs2_extent_rec *rec;
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
ocfs2_error(inode->i_sb,
"Dinode %llu has a bad extent list",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
ret = -EIO;
goto out;
}
rec = &el->l_recs[next_free - 1];
rec->e_int_clusters = insert_rec->e_cpos;
le32_add_cpu(&rec->e_int_clusters,
le16_to_cpu(insert_rec->e_leaf_clusters));
le32_add_cpu(&rec->e_int_clusters,
-le32_to_cpu(rec->e_cpos));
ret = ocfs2_journal_dirty(handle, bh);
if (ret)
mlog_errno(ret);
/* Don't touch the leaf node */
if (++i >= right_path->p_tree_depth)
break;
bh = right_path->p_node[i].bh;
el = right_path->p_node[i].el;
}
*ret_left_path = left_path;
ret = 0;
out:
if (ret != 0)
ocfs2_free_path(left_path);
return ret;
}
/*
* This function only does inserts on an allocation b-tree. For dinode
* lists, ocfs2_insert_at_leaf() is called directly.
*
* right_path is the path we want to do the actual insert
* in. left_path should only be passed in if we need to update that
* portion of the tree after an edge insert.
*/
static int ocfs2_insert_path(struct inode *inode,
handle_t *handle,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_insert_type *insert)
{
int ret, subtree_index;
struct buffer_head *leaf_bh = path_leaf_bh(right_path);
struct ocfs2_extent_list *el;
/*
* Pass both paths to the journal. The majority of inserts
* will be touching all components anyway.
*/
ret = ocfs2_journal_access_path(inode, handle, right_path);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
if (left_path) {
int credits = handle->h_buffer_credits;
/*
* There's a chance that left_path got passed back to
* us without being accounted for in the
* journal. Extend our transaction here to be sure we
* can change those blocks.
*/
credits += left_path->p_tree_depth;
ret = ocfs2_extend_trans(handle, credits);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access_path(inode, handle, left_path);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
}
el = path_leaf_el(right_path);
ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
ret = ocfs2_journal_dirty(handle, leaf_bh);
if (ret)
mlog_errno(ret);
if (left_path) {
/*
* The rotate code has indicated that we need to fix
* up portions of the tree after the insert.
*
* XXX: Should we extend the transaction here?
*/
subtree_index = ocfs2_find_subtree_root(inode, left_path,
right_path);
ocfs2_complete_edge_insert(inode, handle, left_path,
right_path, subtree_index);
}
ret = 0;
out:
return ret;
}
static int ocfs2_do_insert_extent(struct inode *inode,
handle_t *handle,
struct buffer_head *di_bh,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_insert_type *type)
{
int ret, rotate = 0;
u32 cpos;
struct ocfs2_path *right_path = NULL;
struct ocfs2_path *left_path = NULL;
struct ocfs2_dinode *di;
struct ocfs2_extent_list *el;
di = (struct ocfs2_dinode *) di_bh->b_data;
el = &di->id2.i_list;
ret = ocfs2_journal_access(handle, inode, di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
if (le16_to_cpu(el->l_tree_depth) == 0) {
ocfs2_insert_at_leaf(insert_rec, el, type, inode);
goto out_update_clusters;
}
right_path = ocfs2_new_inode_path(di_bh);
if (!right_path) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
/*
* Determine the path to start with. Rotations need the
* rightmost path, everything else can go directly to the
* target leaf.
*/
cpos = le32_to_cpu(insert_rec->e_cpos);
if (type->ins_appending == APPEND_NONE &&
type->ins_contig == CONTIG_NONE) {
rotate = 1;
cpos = UINT_MAX;
}
ret = ocfs2_find_path(inode, right_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* Rotations and appends need special treatment - they modify
* parts of the tree's above them.
*
 * Both might pass back a path immediately to the left of the
 * one being inserted to. This will cause
* ocfs2_insert_path() to modify the rightmost records of
* left_path to account for an edge insert.
*
* XXX: When modifying this code, keep in mind that an insert
* can wind up skipping both of these two special cases...
*/
if (rotate) {
ret = ocfs2_rotate_tree_right(inode, handle,
le32_to_cpu(insert_rec->e_cpos),
right_path, &left_path);
if (ret) {
mlog_errno(ret);
goto out;
}
} else if (type->ins_appending == APPEND_TAIL
&& type->ins_contig != CONTIG_LEFT) {
ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
right_path, &left_path);
if (ret) {
mlog_errno(ret);
goto out;
}
}
ret = ocfs2_insert_path(inode, handle, left_path, right_path,
insert_rec, type);
if (ret) {
mlog_errno(ret);
goto out;
}
out_update_clusters:
ocfs2_update_dinode_clusters(inode, di,
le16_to_cpu(insert_rec->e_leaf_clusters));
ret = ocfs2_journal_dirty(handle, di_bh);
if (ret)
mlog_errno(ret);
out:
ocfs2_free_path(left_path);
ocfs2_free_path(right_path);
return ret;
}
static void ocfs2_figure_contig_type(struct inode *inode,
struct ocfs2_insert_type *insert,
struct ocfs2_extent_list *el,
struct ocfs2_extent_rec *insert_rec)
{
int status, i, shift;
struct buffer_head *last_eb_bh = NULL;
struct buffer_head *bh = NULL;
struct ocfs2_dinode *fe;
int i;
enum ocfs2_contig_type contig_type = CONTIG_NONE;
BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
insert_rec);
if (contig_type != CONTIG_NONE) {
insert->ins_contig_index = i;
break;
}
}
insert->ins_contig = contig_type;
}
/*
 * This should only be called against the rightmost leaf extent list.
*
* ocfs2_figure_appending_type() will figure out whether we'll have to
* insert at the tail of the rightmost leaf.
*
 * This should also work against the dinode list for trees with 0
* depth. If we consider the dinode list to be the rightmost leaf node
* then the logic here makes sense.
*/
static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
struct ocfs2_extent_list *el,
struct ocfs2_extent_rec *insert_rec)
{
int i;
u32 cpos = le32_to_cpu(insert_rec->e_cpos);
struct ocfs2_extent_rec *rec;
insert->ins_appending = APPEND_NONE;
BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
if (!el->l_next_free_rec)
goto set_tail_append;
if (ocfs2_is_empty_extent(&el->l_recs[0])) {
/* Were all records empty? */
if (le16_to_cpu(el->l_next_free_rec) == 1)
goto set_tail_append;
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
rec = &el->l_recs[i];
if (cpos >=
(le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
goto set_tail_append;
return;
set_tail_append:
insert->ins_appending = APPEND_TAIL;
}
/*
 * Helper function called at the beginning of an insert.
*
* This computes a few things that are commonly used in the process of
* inserting into the btree:
* - Whether the new extent is contiguous with an existing one.
* - The current tree depth.
* - Whether the insert is an appending one.
* - The total # of free records in the tree.
*
* All of the information is stored on the ocfs2_insert_type
* structure.
*/
static int ocfs2_figure_insert_type(struct inode *inode,
struct buffer_head *di_bh,
struct buffer_head **last_eb_bh,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_insert_type *insert)
{
int ret;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_path *path = NULL;
struct buffer_head *bh = NULL;
mlog_entry_void();
mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
new_clusters, (unsigned long long)start_blk,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
el = &di->id2.i_list;
insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
if (el->l_tree_depth) {
/*
 * If we have tree depth, we read in the
* rightmost extent block ahead of time as
* ocfs2_figure_insert_type() and ocfs2_add_branch()
* may want it later.
*/
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
le64_to_cpu(di->i_last_eb_blk), &bh,
OCFS2_BH_CACHED, inode);
if (ret) {
mlog_exit(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) bh->b_data;
el = &eb->h_list;
}
fe = (struct ocfs2_dinode *) fe_bh->b_data;
el = &fe->id2.i_list;
/*
 * Unless we have a contiguous insert, we'll need to know if
* there is room left in our allocation tree for another
* extent record.
*
* XXX: This test is simplistic, we can search for empty
* extent records too.
*/
insert->ins_free_records = le16_to_cpu(el->l_count) -
le16_to_cpu(el->l_next_free_rec);
if (el->l_tree_depth) {
/* jump to end of tree */
status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
&last_eb_bh, OCFS2_BH_CACHED, inode);
if (!insert->ins_tree_depth) {
ocfs2_figure_contig_type(inode, insert, el, insert_rec);
ocfs2_figure_appending_type(insert, el, insert_rec);
return 0;
}
path = ocfs2_new_inode_path(di_bh);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
/*
* In the case that we're inserting past what the tree
* currently accounts for, ocfs2_find_path() will return for
* us the rightmost tree path. This is accounted for below in
* the appending code.
*/
ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
if (ret) {
mlog_errno(ret);
goto out;
}
el = path_leaf_el(path);
/*
* Now that we have the path, there are two things we want to determine:
* 1) Contiguousness (also set contig_index if this is so)
*
* 2) Are we doing an append? We can trivially break this up
* into two types of appends: simple record append, or a
* rotate inside the tail leaf.
*/
ocfs2_figure_contig_type(inode, insert, el, insert_rec);
/*
* The insert code isn't quite ready to deal with all cases of
* left contiguousness. Specifically, if it's an insert into
* the 1st record in a leaf, it will require the adjustment of
* cluster count on the last record of the path directly to its
* left. For now, just catch that case and fool the layers
* above us. This works just fine for tree_depth == 0, which
* is why we allow that above.
*/
if (insert->ins_contig == CONTIG_LEFT &&
insert->ins_contig_index == 0)
insert->ins_contig = CONTIG_NONE;
/*
* Ok, so we can simply compare against last_eb to figure out
* whether the path doesn't exist. This will only happen in
* the case that we're doing a tail append, so maybe we can
* take advantage of that information somehow.
*/
if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
/*
* Ok, ocfs2_find_path() returned us the rightmost
* tree path. This might be an appending insert. There are
* two cases:
* 1) We're doing a true append at the tail:
* -This might even be off the end of the leaf
* 2) We're "appending" by rotating in the tail
*/
ocfs2_figure_appending_type(insert, el, insert_rec);
}
out:
ocfs2_free_path(path);
if (ret == 0)
*last_eb_bh = bh;
else
brelse(bh);
return ret;
}
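/*
 * Illustrative, standalone sketch (not part of this patch): it shows how
 * the fields computed above would typically drive the "do we need to grow
 * the tree?" decision in the caller.  The toy_* names mirror
 * ocfs2_insert_type but the enum values and helper are hypothetical
 * simplifications.
 */
#include <stdio.h>

enum toy_contig { TOY_CONTIG_NONE, TOY_CONTIG_LEFT, TOY_CONTIG_RIGHT };

struct toy_insert_type {
	enum toy_contig ins_contig;
	int ins_free_records;
	int ins_tree_depth;
};

/* A contiguous insert reuses an existing record; otherwise we need a free
 * slot, and failing that the tree must gain a branch or another level. */
static int toy_needs_tree_growth(const struct toy_insert_type *it)
{
	return it->ins_contig == TOY_CONTIG_NONE && it->ins_free_records == 0;
}

int main(void)
{
	struct toy_insert_type a = { TOY_CONTIG_RIGHT, 0, 1 };
	struct toy_insert_type b = { TOY_CONTIG_NONE, 0, 1 };

	printf("contiguous insert grows tree?      %d\n", toy_needs_tree_growth(&a));
	printf("non-contiguous, full list grows?   %d\n", toy_needs_tree_growth(&b));
	return 0;
}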
/*
* Insert an extent into an inode btree.
*
* The caller needs to update fe->i_clusters
*/
int ocfs2_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u32 cpos,
u64 start_blk,
u32 new_clusters,
struct ocfs2_alloc_context *meta_ac)
{
int status, shift;
struct buffer_head *last_eb_bh = NULL;
struct buffer_head *bh = NULL;
struct ocfs2_insert_type insert = {0, };
struct ocfs2_extent_rec rec;
mlog(0, "add %u clusters at position %u to inode %llu\n",
new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
(OCFS2_I(inode)->ip_clusters != cpos),
"Device %s, asking for sparse allocation: inode %llu, "
"cpos %u, clusters %u\n",
osb->dev_str,
(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
OCFS2_I(inode)->ip_clusters);
memset(&rec, 0, sizeof(rec));
rec.e_cpos = cpu_to_le32(cpos);
rec.e_blkno = cpu_to_le64(start_blk);
rec.e_leaf_clusters = cpu_to_le16(new_clusters);
status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
&insert);
if (status < 0) {
mlog_errno(status);
goto bail;
}
eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
el = &eb->h_list;
}
/* Can we allocate without adding/shifting tree bits? */ mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
i = le16_to_cpu(el->l_next_free_rec) - 1; "Insert.contig_index: %d, Insert.free_records: %d, "
if (le16_to_cpu(el->l_next_free_rec) == 0 "Insert.tree_depth: %d\n",
|| (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
|| le32_to_cpu(el->l_recs[i].e_clusters) == 0 insert.ins_free_records, insert.ins_tree_depth);
|| ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
goto out_add;
mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " /*
"tree now.\n"); * Avoid growing the tree unless we're out of records and the
* insert type requires one.
*/
if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
goto out_add;
shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
if (shift < 0) {
...@@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
* and didn't find room for any more extents - we need to add * and didn't find room for any more extents - we need to add
* another tree level */ * another tree level */
if (shift) { if (shift) {
/* if we hit a leaf, we'd better be empty :) */
BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
le16_to_cpu(el->l_count));
BUG_ON(bh); BUG_ON(bh);
mlog(0, "ocfs2_allocate_extent: need to shift tree depth " mlog(0, "need to shift tree depth "
"(current = %u)\n", "(current = %d)\n", insert.ins_tree_depth);
le16_to_cpu(fe->id2.i_list.l_tree_depth));
/* ocfs2_shift_tree_depth will return us a buffer with /* ocfs2_shift_tree_depth will return us a buffer with
* the new extent block (so we can pass that to * the new extent block (so we can pass that to
...@@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
} }
insert.ins_tree_depth++;
/* Special case: we have room now if we shifted from /* Special case: we have room now if we shifted from
* tree_depth 0 */ * tree_depth 0 */
if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) if (insert.ins_tree_depth == 1)
goto out_add; goto out_add;
} }
/* call ocfs2_add_branch to add the final part of the tree with /* call ocfs2_add_branch to add the final part of the tree with
* the new data. */ * the new data. */
mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); mlog(0, "add branch. bh = %p\n", bh);
status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
meta_ac); meta_ac);
if (status < 0) { if (status < 0) {
...@@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
} }
out_add: out_add:
/* Finally, we can add clusters. */ /* Finally, we can add clusters. This might rotate the tree for us. */
status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
start_blk, new_clusters);
if (status < 0) if (status < 0)
mlog_errno(status); mlog_errno(status);
else
ocfs2_extent_map_insert_rec(inode, &rec);
bail: bail:
if (bh) if (bh)
...@@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
* block will be deleted, and if it will, what the new last extent * block will be deleted, and if it will, what the new last extent
* block will be so we can update his h_next_leaf_blk field, as well * block will be so we can update his h_next_leaf_blk field, as well
* as the dinodes i_last_eb_blk */ * as the dinodes i_last_eb_blk */
static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, static int ocfs2_find_new_last_ext_blk(struct inode *inode,
struct inode *inode, unsigned int clusters_to_del,
struct ocfs2_dinode *fe, struct ocfs2_path *path,
u32 new_i_clusters,
struct buffer_head *old_last_eb,
struct buffer_head **new_last_eb) struct buffer_head **new_last_eb)
{ {
int i, status = 0; int next_free, ret = 0;
u64 block = 0; u32 cpos;
struct ocfs2_extent_rec *rec;
struct ocfs2_extent_block *eb; struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el; struct ocfs2_extent_list *el;
struct buffer_head *bh = NULL; struct buffer_head *bh = NULL;
*new_last_eb = NULL; *new_last_eb = NULL;
if (!OCFS2_IS_VALID_DINODE(fe)) {
OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
status = -EIO;
goto bail;
}
/* we have no tree, so of course, no last_eb. */ /* we have no tree, so of course, no last_eb. */
if (!fe->id2.i_list.l_tree_depth) if (!path->p_tree_depth)
goto bail; goto out;
/* trunc to zero special case - this makes tree_depth = 0 /* trunc to zero special case - this makes tree_depth = 0
* regardless of what it is. */ * regardless of what it is. */
if (!new_i_clusters) if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
goto bail; goto out;
eb = (struct ocfs2_extent_block *) old_last_eb->b_data; el = path_leaf_el(path);
el = &(eb->h_list);
BUG_ON(!el->l_next_free_rec); BUG_ON(!el->l_next_free_rec);
/* Make sure that this guy will actually be empty after we /*
* clear away the data. */ * Make sure that this extent list will actually be empty
if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) * after we clear away the data. We can shortcut out if
goto bail; * there's more than one non-empty extent in the
* list. Otherwise, a check of the remaining extent is
* necessary.
*/
next_free = le16_to_cpu(el->l_next_free_rec);
rec = NULL;
if (ocfs2_is_empty_extent(&el->l_recs[0])) {
if (next_free > 2)
goto out;
/* We may have a valid extent in index 1, check it. */
if (next_free == 2)
rec = &el->l_recs[1];
/*
* Fall through - no more nonempty extents, so we want
* to delete this leaf.
*/
} else {
if (next_free > 1)
goto out;
rec = &el->l_recs[0];
}
if (rec) {
/*
* Check if we'll only be trimming off the end of this
* cluster.
*/
if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
goto out;
}
ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) bh->b_data;
el = &eb->h_list;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
ret = -EROFS;
goto out;
}
*new_last_eb = bh;
get_bh(*new_last_eb);
mlog(0, "returning block %llu, (cpos: %u)\n",
(unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
out:
brelse(bh);
return ret;
}
/*
* Trim some clusters off the rightmost edge of a tree. Only called
* during truncate.
*
* The caller needs to:
* - start journaling of each path component.
* - compute and fully set up any new last ext block
*/
static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
handle_t *handle, struct ocfs2_truncate_context *tc,
u32 clusters_to_del, u64 *delete_start)
{
int ret, i, index = path->p_tree_depth;
u32 new_edge = 0;
u64 deleted_eb = 0;
struct buffer_head *bh;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
*delete_start = 0;
while (index >= 0) {
bh = path->p_node[index].bh;
el = path->p_node[index].el;
mlog(0, "traveling tree (index = %d, block = %llu)\n",
index, (unsigned long long)bh->b_blocknr);
BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
if (index !=
(path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
ocfs2_error(inode->i_sb,
"Inode %lu has invalid ext. block %llu",
inode->i_ino,
(unsigned long long)bh->b_blocknr);
ret = -EROFS;
goto out;
}
find_tail_record:
i = le16_to_cpu(el->l_next_free_rec) - 1;
rec = &el->l_recs[i];
mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
"next = %u\n", i, le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec),
(unsigned long long)le64_to_cpu(rec->e_blkno),
le16_to_cpu(el->l_next_free_rec));
BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
if (le16_to_cpu(el->l_tree_depth) == 0) {
/*
* If the leaf block contains a single empty
* extent and no records, we can just remove
* the block.
*/
if (i == 0 && ocfs2_is_empty_extent(rec)) {
memset(rec, 0,
sizeof(struct ocfs2_extent_rec));
el->l_next_free_rec = cpu_to_le16(0);
goto delete;
}
/*
* Remove any empty extents by shifting things
* left. That should make life much easier on
* the code below. This condition is rare
* enough that we shouldn't see a performance
* hit.
*/
if (ocfs2_is_empty_extent(&el->l_recs[0])) {
le16_add_cpu(&el->l_next_free_rec, -1);
for(i = 0;
i < le16_to_cpu(el->l_next_free_rec); i++)
el->l_recs[i] = el->l_recs[i + 1];
memset(&el->l_recs[i], 0,
sizeof(struct ocfs2_extent_rec));
/*
* We've modified our extent list. The
* simplest way to handle this change
* is to begin the search from the
* start again.
*/
goto find_tail_record;
}
le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
/*
* We'll use "new_edge" on our way back up the
* tree to know what our rightmost cpos is.
*/
new_edge = le16_to_cpu(rec->e_leaf_clusters);
new_edge += le32_to_cpu(rec->e_cpos);
/*
* The caller will use this to delete data blocks.
*/
*delete_start = le64_to_cpu(rec->e_blkno)
+ ocfs2_clusters_to_blocks(inode->i_sb,
le16_to_cpu(rec->e_leaf_clusters));
/*
* If it's now empty, remove this record.
*/
if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
memset(rec, 0,
sizeof(struct ocfs2_extent_rec));
le16_add_cpu(&el->l_next_free_rec, -1);
}
} else {
if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
memset(rec, 0,
sizeof(struct ocfs2_extent_rec));
le16_add_cpu(&el->l_next_free_rec, -1);
goto delete;
}
/* Can this actually happen? */
if (le16_to_cpu(el->l_next_free_rec) == 0)
goto delete;
/*
* We never actually deleted any clusters
* because our leaf was empty. There's no
* reason to adjust the rightmost edge then.
*/
if (new_edge == 0)
goto delete;
rec->e_int_clusters = cpu_to_le32(new_edge);
le32_add_cpu(&rec->e_int_clusters,
-le32_to_cpu(rec->e_cpos));
/*
* A deleted child record should have been
* caught above.
*/
BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
}
delete:
ret = ocfs2_journal_dirty(handle, bh);
if (ret) {
mlog_errno(ret);
goto out;
}
mlog(0, "extent list container %llu, after: record %d: "
"(%u, %u, %llu), next = %u.\n",
(unsigned long long)bh->b_blocknr, i,
le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
(unsigned long long)le64_to_cpu(rec->e_blkno),
le16_to_cpu(el->l_next_free_rec));
/*
* We must be careful to only attempt delete of an
* extent block (and not the root inode block).
*/
if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
struct ocfs2_extent_block *eb =
(struct ocfs2_extent_block *)bh->b_data;
/*
* Save this for use when processing the
* parent block.
*/
deleted_eb = le64_to_cpu(eb->h_blkno);
mlog(0, "deleting this extent block.\n");
ocfs2_remove_from_cache(inode, bh);
BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
/*
* This code only understands how to
* lock the suballocator in slot 0,
* which is fine because allocation is
* only ever done out of that
* suballocator too. A future version
* might change that however, so avoid
* a free if we don't know how to
* handle it. This way an fs incompat
* bit will not be necessary.
*/
ret = ocfs2_free_extent_block(handle,
tc->tc_ext_alloc_inode,
tc->tc_ext_alloc_bh,
eb);
/* Ok, at this point, we know that last_eb will definitely /* An error here is not fatal. */
* change, so lets traverse the tree and find the second to if (ret < 0)
* last extent block. */ mlog_errno(ret);
el = &(fe->id2.i_list);
/* go down the tree, */
do {
for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
if (le32_to_cpu(el->l_recs[i].e_cpos) <
new_i_clusters) {
block = le64_to_cpu(el->l_recs[i].e_blkno);
break;
}
} }
BUG_ON(i < 0); } else {
deleted_eb = 0;
if (bh) {
brelse(bh);
bh = NULL;
} }
status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, index--;
inode);
if (status < 0) {
mlog_errno(status);
goto bail;
}
eb = (struct ocfs2_extent_block *) bh->b_data;
el = &eb->h_list;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
status = -EIO;
goto bail;
} }
} while (el->l_tree_depth);
*new_last_eb = bh;
get_bh(*new_last_eb);
mlog(0, "returning block %llu\n",
(unsigned long long)le64_to_cpu(eb->h_blkno));
bail:
if (bh)
brelse(bh);
return status; ret = 0;
out:
return ret;
}
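/*
 * Illustrative, standalone sketch (not part of this patch): it models the
 * leaf-level arithmetic ocfs2_trim_tree() performs on the tail record.
 * The toy_* types and the clusters-to-blocks conversion are hypothetical
 * simplifications of the on-disk structures.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_rec {
	uint32_t cpos;		/* logical cluster offset */
	uint16_t clusters;	/* length in clusters */
	uint64_t blkno;		/* first physical block of the record */
};

/* Trim 'to_del' clusters off the end of 'rec'; return the first data block
 * that the caller would hand to the truncate log for freeing. */
static uint64_t toy_trim_tail(struct toy_rec *rec, uint32_t to_del,
			      unsigned int blocks_per_cluster)
{
	rec->clusters -= to_del;
	return rec->blkno + (uint64_t)rec->clusters * blocks_per_cluster;
}

int main(void)
{
	struct toy_rec tail = { .cpos = 100, .clusters = 8, .blkno = 4096 };
	uint64_t delete_start = toy_trim_tail(&tail, 3, 4); /* 4 blocks/cluster */

	/* The new tail covers clusters [100, 105); blocks from 4116 on are freed. */
	printf("tail now %u clusters, delete from block %llu\n",
	       tail.clusters, (unsigned long long)delete_start);
	return 0;
}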
static int ocfs2_do_truncate(struct ocfs2_super *osb, static int ocfs2_do_truncate(struct ocfs2_super *osb,
unsigned int clusters_to_del, unsigned int clusters_to_del,
struct inode *inode, struct inode *inode,
struct buffer_head *fe_bh, struct buffer_head *fe_bh,
struct buffer_head *old_last_eb_bh,
handle_t *handle, handle_t *handle,
struct ocfs2_truncate_context *tc) struct ocfs2_truncate_context *tc,
struct ocfs2_path *path)
{ {
int status, i, depth; int status;
struct ocfs2_dinode *fe; struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_block *last_eb = NULL; struct ocfs2_extent_block *last_eb = NULL;
struct ocfs2_extent_list *el; struct ocfs2_extent_list *el;
struct buffer_head *eb_bh = NULL;
struct buffer_head *last_eb_bh = NULL; struct buffer_head *last_eb_bh = NULL;
u64 next_eb = 0;
u64 delete_blk = 0; u64 delete_blk = 0;
fe = (struct ocfs2_dinode *) fe_bh->b_data; fe = (struct ocfs2_dinode *) fe_bh->b_data;
status = ocfs2_find_new_last_ext_blk(osb, status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
inode, path, &last_eb_bh);
fe,
le32_to_cpu(fe->i_clusters) -
clusters_to_del,
old_last_eb_bh,
&last_eb_bh);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
} }
if (last_eb_bh)
last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
status = ocfs2_journal_access(handle, inode, fe_bh, /*
* Each component will be touched, so we might as well journal
* here to avoid having to handle errors later.
*/
status = ocfs2_journal_access_path(inode, handle, path);
if (status < 0) {
mlog_errno(status);
goto bail;
}
if (last_eb_bh) {
status = ocfs2_journal_access(handle, inode, last_eb_bh,
OCFS2_JOURNAL_ACCESS_WRITE); OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
} }
last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
}
el = &(fe->id2.i_list); el = &(fe->id2.i_list);
/*
* Lower levels depend on this never happening, but it's best
* to check it up here before changing the tree.
*/
if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
ocfs2_error(inode->i_sb,
"Inode %lu has an empty extent record, depth %u\n",
inode->i_ino, le16_to_cpu(el->l_tree_depth));
status = -EROFS;
goto bail;
}
spin_lock(&OCFS2_I(inode)->ip_lock); spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
clusters_to_del; clusters_to_del;
spin_unlock(&OCFS2_I(inode)->ip_lock); spin_unlock(&OCFS2_I(inode)->ip_lock);
le32_add_cpu(&fe->i_clusters, -clusters_to_del); le32_add_cpu(&fe->i_clusters, -clusters_to_del);
fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
i = le16_to_cpu(el->l_next_free_rec) - 1;
BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); status = ocfs2_trim_tree(inode, path, handle, tc,
le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); clusters_to_del, &delete_blk);
/* tree depth zero, we can just delete the clusters, otherwise if (status) {
* we need to record the offset of the next level extent block mlog_errno(status);
* as we may overwrite it. */ goto bail;
if (!el->l_tree_depth)
delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+ ocfs2_clusters_to_blocks(osb->sb,
le32_to_cpu(el->l_recs[i].e_clusters));
else
next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
if (!el->l_recs[i].e_clusters) {
/* if we deleted the whole extent record, then clear
* out the other fields and update the extent
* list. For depth > 0 trees, we've already recorded
* the extent block in 'next_eb' */
el->l_recs[i].e_cpos = 0;
el->l_recs[i].e_blkno = 0;
BUG_ON(!el->l_next_free_rec);
le16_add_cpu(&el->l_next_free_rec, -1);
} }
depth = le16_to_cpu(el->l_tree_depth); if (le32_to_cpu(fe->i_clusters) == 0) {
if (!fe->i_clusters) {
/* trunc to zero is a special case. */ /* trunc to zero is a special case. */
el->l_tree_depth = 0; el->l_tree_depth = 0;
fe->i_last_eb_blk = 0; fe->i_last_eb_blk = 0;
...@@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
/* If there will be a new last extent block, then by /* If there will be a new last extent block, then by
* definition, there cannot be any leaves to the right of * definition, there cannot be any leaves to the right of
* him. */ * him. */
status = ocfs2_journal_access(handle, inode, last_eb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
last_eb->h_next_leaf_blk = 0; last_eb->h_next_leaf_blk = 0;
status = ocfs2_journal_dirty(handle, last_eb_bh); status = ocfs2_journal_dirty(handle, last_eb_bh);
if (status < 0) { if (status < 0) {
...@@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
} }
} }
/* if our tree depth > 0, update all the tree blocks below us. */ if (delete_blk) {
while (depth) { status = ocfs2_truncate_log_append(osb, handle, delete_blk,
mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n", clusters_to_del);
depth, (unsigned long long)next_eb);
status = ocfs2_read_block(osb, next_eb, &eb_bh,
OCFS2_BH_CACHED, inode);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
} }
eb = (struct ocfs2_extent_block *)eb_bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
status = -EIO;
goto bail;
} }
el = &(eb->h_list); status = 0;
bail:
status = ocfs2_journal_access(handle, inode, eb_bh, mlog_exit(status);
OCFS2_JOURNAL_ACCESS_WRITE); return status;
if (status < 0) { }
mlog_errno(status);
goto bail;
}
BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); {
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
return 0;
}
i = le16_to_cpu(el->l_next_free_rec) - 1; static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
{
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
return ocfs2_journal_dirty_data(handle, bh);
}
mlog(0, "extent block %llu, before: record %d: " static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
"(%u, %u, %llu), next = %u\n", struct page **pages, int numpages,
(unsigned long long)le64_to_cpu(eb->h_blkno), i, u64 phys, handle_t *handle)
le32_to_cpu(el->l_recs[i].e_cpos), {
le32_to_cpu(el->l_recs[i].e_clusters), int i, ret, partial = 0;
(unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno), void *kaddr;
le16_to_cpu(el->l_next_free_rec)); struct page *page;
unsigned int from, to = PAGE_CACHE_SIZE;
struct super_block *sb = inode->i_sb;
BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
if (numpages == 0)
next_eb = le64_to_cpu(el->l_recs[i].e_blkno); goto out;
/* bottom-most block requires us to delete data.*/
if (!el->l_tree_depth) from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+ ocfs2_clusters_to_blocks(osb->sb, /*
le32_to_cpu(el->l_recs[i].e_clusters)); * Since 'from' has been capped to a value below page
if (!el->l_recs[i].e_clusters) { * size, this calculation won't be able to overflow
el->l_recs[i].e_cpos = 0; * 'to'
el->l_recs[i].e_blkno = 0; */
BUG_ON(!el->l_next_free_rec); to = ocfs2_align_bytes_to_clusters(sb, from);
le16_add_cpu(&el->l_next_free_rec, -1);
/*
* The truncate tail in this case should never contain
* more than one page at maximum. The loop below also
* assumes this.
*/
BUG_ON(numpages != 1);
} }
mlog(0, "extent block %llu, after: record %d: "
"(%u, %u, %llu), next = %u\n",
(unsigned long long)le64_to_cpu(eb->h_blkno), i,
le32_to_cpu(el->l_recs[i].e_cpos),
le32_to_cpu(el->l_recs[i].e_clusters),
(unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
le16_to_cpu(el->l_next_free_rec));
status = ocfs2_journal_dirty(handle, eb_bh); for(i = 0; i < numpages; i++) {
if (status < 0) { page = pages[i];
mlog_errno(status);
goto bail; BUG_ON(from > PAGE_CACHE_SIZE);
BUG_ON(to > PAGE_CACHE_SIZE);
ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
if (ret)
mlog_errno(ret);
kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr + from, 0, to - from);
kunmap_atomic(kaddr, KM_USER0);
/*
* Need to set the buffers we zero'd into uptodate
* here if they aren't - ocfs2_map_page_blocks()
* might've skipped some
*/
if (ocfs2_should_order_data(inode)) {
ret = walk_page_buffers(handle,
page_buffers(page),
from, to, &partial,
ocfs2_ordered_zero_func);
if (ret < 0)
mlog_errno(ret);
} else {
ret = walk_page_buffers(handle, page_buffers(page),
from, to, &partial,
ocfs2_writeback_zero_func);
if (ret < 0)
mlog_errno(ret);
} }
if (!el->l_next_free_rec) { if (!partial)
mlog(0, "deleting this extent block.\n"); SetPageUptodate(page);
ocfs2_remove_from_cache(inode, eb_bh); flush_dcache_page(page);
BUG_ON(el->l_recs[0].e_clusters);
BUG_ON(el->l_recs[0].e_cpos);
BUG_ON(el->l_recs[0].e_blkno);
if (eb->h_suballoc_slot == 0) {
/* /*
* This code only understands how to * Every page after the 1st one should be completely zero'd.
* lock the suballocator in slot 0,
* which is fine because allocation is
* only ever done out of that
* suballocator too. A future version
* might change that however, so avoid
* a free if we don't know how to
* handle it. This way an fs incompat
* bit will not be necessary.
*/ */
status = ocfs2_free_extent_block(handle, from = 0;
tc->tc_ext_alloc_inode,
tc->tc_ext_alloc_bh,
eb);
if (status < 0) {
mlog_errno(status);
goto bail;
} }
out:
if (pages) {
for (i = 0; i < numpages; i++) {
page = pages[i];
unlock_page(page);
mark_page_accessed(page);
page_cache_release(page);
} }
} }
brelse(eb_bh); }
eb_bh = NULL;
depth--; static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
int *num, u64 *phys)
{
int i, numpages = 0, ret = 0;
unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
unsigned int ext_flags;
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
unsigned long index;
u64 next_cluster_bytes;
BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
/* Cluster boundary, so we don't need to grab any pages. */
if ((isize & (csize - 1)) == 0)
goto out;
ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
phys, NULL, &ext_flags);
if (ret) {
mlog_errno(ret);
goto out;
} }
BUG_ON(!delete_blk); /* Tail is a hole. */
status = ocfs2_truncate_log_append(osb, handle, delete_blk, if (*phys == 0)
clusters_to_del); goto out;
if (status < 0) {
mlog_errno(status); /* Tail is marked as unwritten, we can count on write to zero
goto bail; * in that case. */
if (ext_flags & OCFS2_EXT_UNWRITTEN)
goto out;
next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
index = isize >> PAGE_CACHE_SHIFT;
do {
pages[numpages] = grab_cache_page(mapping, index);
if (!pages[numpages]) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
} }
status = 0;
bail: numpages++;
if (!status) index++;
ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
else
ocfs2_extent_map_drop(inode, 0); out:
mlog_exit(status); if (ret != 0) {
return status; if (pages) {
for (i = 0; i < numpages; i++) {
if (pages[i]) {
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
}
}
numpages = 0;
}
*num = numpages;
return ret;
}
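/*
 * Illustrative, standalone sketch (not part of this patch): it computes the
 * page-index range ocfs2_grab_eof_pages() walks, i.e. every page between
 * i_size and the end of the cluster containing it.  The shift values are
 * hypothetical example parameters, not tied to any ocfs2 configuration.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned int page_shift = 12;	/* 4K pages */
	const unsigned int cluster_shift = 16;	/* 64K clusters */
	uint64_t isize = 70000;			/* example i_size in bytes */

	/* Round i_size up to the next cluster boundary. */
	uint64_t next_cluster = (isize + (1ULL << cluster_shift) - 1)
				>> cluster_shift << cluster_shift;
	uint64_t first = isize >> page_shift;
	uint64_t end = next_cluster >> page_shift;

	printf("zero pages with index %llu .. %llu\n",
	       (unsigned long long)first, (unsigned long long)(end - 1));
	return 0;
}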
/*
* Zero the area past i_size but still within an allocated
* cluster. This avoids exposing nonzero data on subsequent file
* extends.
*
* We need to call this before i_size is updated on the inode because
* otherwise block_write_full_page() will skip writeout of pages past
* i_size. The new_i_size parameter is passed for this reason.
*/
int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
u64 new_i_size)
{
int ret, numpages;
loff_t endbyte;
struct page **pages = NULL;
u64 phys;
/*
* File systems which don't support sparse files zero on every
* extend.
*/
if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
return 0;
pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
sizeof(struct page *), GFP_NOFS);
if (pages == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
if (ret) {
mlog_errno(ret);
goto out;
}
if (numpages == 0)
goto out;
ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
handle);
/*
* Initiate writeout of the pages we zero'd here. We don't
* wait on them - the truncate_inode_pages() call later will
* do that for us.
*/
endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
endbyte - 1, SYNC_FILE_RANGE_WRITE);
if (ret)
mlog_errno(ret);
out:
if (pages)
kfree(pages);
return ret;
}
/* /*
...@@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct ocfs2_truncate_context *tc) struct ocfs2_truncate_context *tc)
{ {
int status, i, credits, tl_sem = 0; int status, i, credits, tl_sem = 0;
u32 clusters_to_del, target_i_clusters; u32 clusters_to_del, new_highest_cpos, range;
u64 last_eb = 0;
struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el; struct ocfs2_extent_list *el;
struct buffer_head *last_eb_bh;
handle_t *handle = NULL; handle_t *handle = NULL;
struct inode *tl_inode = osb->osb_tl_inode; struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_path *path = NULL;
mlog_entry_void(); mlog_entry_void();
down_write(&OCFS2_I(inode)->ip_alloc_sem); down_write(&OCFS2_I(inode)->ip_alloc_sem);
target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
i_size_read(inode)); i_size_read(inode));
last_eb_bh = tc->tc_last_eb_bh; path = ocfs2_new_inode_path(fe_bh);
tc->tc_last_eb_bh = NULL; if (!path) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
fe = (struct ocfs2_dinode *) fe_bh->b_data; ocfs2_extent_map_trunc(inode, new_highest_cpos);
if (fe->id2.i_list.l_tree_depth) {
eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
el = &eb->h_list;
} else
el = &fe->id2.i_list;
last_eb = le64_to_cpu(fe->i_last_eb_blk);
start: start:
mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " /*
"last_eb = %llu, fe->i_last_eb_blk = %llu, " * Check that we still have allocation to delete.
"fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", */
le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, if (OCFS2_I(inode)->ip_clusters == 0) {
(unsigned long long)le64_to_cpu(fe->i_last_eb_blk), status = 0;
le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); goto bail;
if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
mlog(0, "last_eb changed!\n");
BUG_ON(!fe->id2.i_list.l_tree_depth);
last_eb = le64_to_cpu(fe->i_last_eb_blk);
/* i_last_eb_blk may have changed, read it if
* necessary. We don't have to worry about the
* truncate to zero case here (where there becomes no
* last_eb) because we never loop back after our work
* is done. */
if (last_eb_bh) {
brelse(last_eb_bh);
last_eb_bh = NULL;
} }
status = ocfs2_read_block(osb, last_eb, /*
&last_eb_bh, OCFS2_BH_CACHED, * Truncate always works against the rightmost tree branch.
inode); */
if (status < 0) { status = ocfs2_find_path(inode, path, UINT_MAX);
if (status) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
} }
eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
status = -EIO;
/*
* By now, el will point to the extent list on the bottom most
* portion of this tree. Only the tail record is considered in
* each pass.
*
* We handle the following cases, in order:
* - empty extent: delete the remaining branch
* - remove the entire record
* - remove a partial record
* - no record needs to be removed (truncate has completed)
*/
el = path_leaf_el(path);
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(inode->i_sb,
"Inode %llu has empty extent block at %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)path_leaf_bh(path)->b_blocknr);
status = -EROFS;
goto bail; goto bail;
} }
el = &(eb->h_list);
}
/* by now, el will point to the extent list on the bottom most
* portion of this tree. */
i = le16_to_cpu(el->l_next_free_rec) - 1; i = le16_to_cpu(el->l_next_free_rec) - 1;
if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) range = le32_to_cpu(el->l_recs[i].e_cpos) +
clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); ocfs2_rec_clusters(el, &el->l_recs[i]);
else if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + clusters_to_del = 0;
} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
} else if (range > new_highest_cpos) {
clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
le32_to_cpu(el->l_recs[i].e_cpos)) - le32_to_cpu(el->l_recs[i].e_cpos)) -
target_i_clusters; new_highest_cpos;
} else {
status = 0;
goto bail;
}
mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); BUG_ON(clusters_to_del == 0);
mutex_lock(&tl_inode->i_mutex); mutex_lock(&tl_inode->i_mutex);
tl_sem = 1; tl_sem = 1;
...@@ -1861,7 +3722,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
} }
credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
fe, el); (struct ocfs2_dinode *)fe_bh->b_data,
el);
handle = ocfs2_start_trans(osb, credits); handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) { if (IS_ERR(handle)) {
status = PTR_ERR(handle); status = PTR_ERR(handle);
...@@ -1870,13 +3732,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
goto bail; goto bail;
} }
inode->i_ctime = inode->i_mtime = CURRENT_TIME; status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); tc, path);
if (status < 0)
mlog_errno(status);
status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
last_eb_bh, handle, tc);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
...@@ -1888,9 +3745,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
ocfs2_commit_trans(osb, handle); ocfs2_commit_trans(osb, handle);
handle = NULL; handle = NULL;
BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); ocfs2_reinit_path(path, 1);
if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
/*
* The check above will catch the case where we've truncated
* away all allocation.
*/
goto start; goto start;
bail: bail:
up_write(&OCFS2_I(inode)->ip_alloc_sem); up_write(&OCFS2_I(inode)->ip_alloc_sem);
...@@ -1902,8 +3764,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
if (handle) if (handle)
ocfs2_commit_trans(osb, handle); ocfs2_commit_trans(osb, handle);
if (last_eb_bh) ocfs2_free_path(path);
brelse(last_eb_bh);
/* This will drop the ext_alloc cluster lock for us */ /* This will drop the ext_alloc cluster lock for us */
ocfs2_free_truncate_context(tc); ocfs2_free_truncate_context(tc);
...@@ -1912,7 +3773,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
return status; return status;
} }
/* /*
* Expects the inode to already be locked. This will figure out which * Expects the inode to already be locked. This will figure out which
* inodes need to be locked and will put them on the returned truncate * inodes need to be locked and will put them on the returned truncate
...@@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct buffer_head *fe_bh, struct buffer_head *fe_bh,
struct ocfs2_truncate_context **tc) struct ocfs2_truncate_context **tc)
{ {
int status, metadata_delete; int status, metadata_delete, i;
unsigned int new_i_clusters; unsigned int new_i_clusters;
struct ocfs2_dinode *fe; struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb; struct ocfs2_extent_block *eb;
...@@ -1944,21 +3804,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
"%llu\n", fe->i_clusters, new_i_clusters, "%llu\n", fe->i_clusters, new_i_clusters,
(unsigned long long)fe->i_size); (unsigned long long)fe->i_size);
if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
"%u and size %llu whereas struct inode has "
"cluster count %u and size %llu which caused an "
"invalid truncate to %u clusters.",
(unsigned long long)le64_to_cpu(fe->i_blkno),
le32_to_cpu(fe->i_clusters),
(unsigned long long)le64_to_cpu(fe->i_size),
OCFS2_I(inode)->ip_clusters, i_size_read(inode),
new_i_clusters);
mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
status = -EIO;
goto bail;
}
*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
if (!(*tc)) { if (!(*tc)) {
status = -ENOMEM; status = -ENOMEM;
...@@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
goto bail; goto bail;
} }
el = &(eb->h_list); el = &(eb->h_list);
if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
i = 0;
if (ocfs2_is_empty_extent(&el->l_recs[0]))
i = 1;
/*
* XXX: Should we check that next_free_rec contains
* the extent?
*/
if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
metadata_delete = 1; metadata_delete = 1;
} }
......
...@@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
handle_t *handle, handle_t *handle,
struct inode *inode, struct inode *inode,
struct buffer_head *fe_bh, struct buffer_head *fe_bh,
u64 blkno, u32 cpos,
u64 start_blk,
u32 new_clusters, u32 new_clusters,
struct ocfs2_alloc_context *meta_ac); struct ocfs2_alloc_context *meta_ac);
int ocfs2_num_free_extents(struct ocfs2_super *osb, int ocfs2_num_free_extents(struct ocfs2_super *osb,
...@@ -70,6 +71,8 @@ struct ocfs2_truncate_context {
struct buffer_head *tc_last_eb_bh; struct buffer_head *tc_last_eb_bh;
}; };
int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
u64 new_i_size);
int ocfs2_prepare_truncate(struct ocfs2_super *osb, int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct inode *inode, struct inode *inode,
struct buffer_head *fe_bh, struct buffer_head *fe_bh,
...@@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct buffer_head *fe_bh, struct buffer_head *fe_bh,
struct ocfs2_truncate_context *tc); struct ocfs2_truncate_context *tc);
int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
u32 cpos, struct buffer_head **leaf_bh);
/*
* Helper function to look at the # of clusters in an extent record.
*/
static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
struct ocfs2_extent_rec *rec)
{
/*
* Cluster count in extent records is slightly different
* between interior nodes and leaf nodes. This is to support
* unwritten extents which need a flags field in leaf node
* records, thus shrinking the available space for a clusters
* field.
*/
if (el->l_tree_depth)
return le32_to_cpu(rec->e_int_clusters);
else
return le16_to_cpu(rec->e_leaf_clusters);
}
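/*
 * Illustrative, standalone sketch (not part of this patch): it mirrors the
 * helper above with plain C types to show why the two record flavours are
 * read differently.  The toy_* layout is a hypothetical simplification of
 * the real on-disk extent record.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_extent_rec {
	uint32_t cpos;
	union {
		uint32_t int_clusters;		/* interior nodes: 32-bit count */
		struct {
			uint16_t leaf_clusters;	/* leaf nodes: 16-bit count... */
			uint8_t flags;		/* ...plus room for flags such */
			uint8_t reserved;	/* as "unwritten" */
		} leaf;
	} u;
	uint64_t blkno;
};

static unsigned int toy_rec_clusters(int tree_depth,
				     const struct toy_extent_rec *rec)
{
	return tree_depth ? rec->u.int_clusters : rec->u.leaf.leaf_clusters;
}

int main(void)
{
	struct toy_extent_rec leaf = { .cpos = 0, .blkno = 1024 };

	leaf.u.leaf.leaf_clusters = 8;
	printf("leaf record spans %u clusters\n", toy_rec_clusters(0, &leaf));
	return 0;
}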
#endif /* OCFS2_ALLOC_H */
...@@ -24,6 +24,8 @@
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <asm/byteorder.h> #include <asm/byteorder.h>
#include <linux/swap.h>
#include <linux/pipe_fs_i.h>
#define MLOG_MASK_PREFIX ML_FILE_IO #define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h> #include <cluster/masklog.h>
...@@ -37,6 +39,7 @@
#include "file.h" #include "file.h"
#include "inode.h" #include "inode.h"
#include "journal.h" #include "journal.h"
#include "suballoc.h"
#include "super.h" #include "super.h"
#include "symlink.h" #include "symlink.h"
...@@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create) struct buffer_head *bh_result, int create)
{ {
int err = 0; int err = 0;
unsigned int ext_flags;
u64 p_blkno, past_eof; u64 p_blkno, past_eof;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
(unsigned long long)iblock, bh_result, create); (unsigned long long)iblock, bh_result, create);
...@@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
goto bail; goto bail;
} }
/* this can happen if another node truncs after our extend! */ err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
spin_lock(&OCFS2_I(inode)->ip_lock); &ext_flags);
if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
OCFS2_I(inode)->ip_clusters))
err = -EIO;
spin_unlock(&OCFS2_I(inode)->ip_lock);
if (err)
goto bail;
err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
NULL);
if (err) { if (err) {
mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
"%llu, NULL)\n", err, inode, (unsigned long long)iblock, "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
...@@ -167,14 +163,30 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
goto bail; goto bail;
} }
/*
* ocfs2 never allocates in this function - the only time we
* need to use BH_New is when we're extending i_size on a file
* system which doesn't support holes, in which case BH_New
* allows block_prepare_write() to zero.
*/
mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
"ino %lu, iblock %llu\n", inode->i_ino,
(unsigned long long)iblock);
/* Treat the unwritten extent as a hole for zeroing purposes. */
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
map_bh(bh_result, inode->i_sb, p_blkno); map_bh(bh_result, inode->i_sb, p_blkno);
if (bh_result->b_blocknr == 0) { if (!ocfs2_sparse_alloc(osb)) {
if (p_blkno == 0) {
err = -EIO; err = -EIO;
mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n", mlog(ML_ERROR,
"iblock = %llu p_blkno = %llu blkno=(%llu)\n",
(unsigned long long)iblock, (unsigned long long)iblock,
(unsigned long long)p_blkno, (unsigned long long)p_blkno,
(unsigned long long)OCFS2_I(inode)->ip_blkno); (unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
dump_stack();
} }
past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
...@@ -183,6 +195,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
if (create && (iblock >= past_eof)) if (create && (iblock >= past_eof))
set_buffer_new(bh_result); set_buffer_new(bh_result);
}
bail: bail:
if (err < 0) if (err < 0)
...@@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
return ret; return ret;
} }
/*
 * This is called from ocfs2_write_zero_page() which has handled its
* own cluster locking and has ensured allocation exists for those
* blocks to be written.
*/
int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
unsigned from, unsigned to) unsigned from, unsigned to)
{ {
...@@ -292,38 +308,11 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
return ret; return ret;
} }
/*
* ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
* from loopback. It must be able to perform its own locking around
* ocfs2_get_block().
*/
static int ocfs2_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
int ret;
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
if (ret != 0) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_prepare_write_nolock(inode, page, from, to);
ocfs2_meta_unlock(inode, 0);
out:
mlog_exit(ret);
return ret;
}
/* Taken from ext3. We don't necessarily need the full blown /* Taken from ext3. We don't necessarily need the full blown
* functionality yet, but IMHO it's better to cut and paste the whole * functionality yet, but IMHO it's better to cut and paste the whole
* thing so we can avoid introducing our own bugs (and easily pick up * thing so we can avoid introducing our own bugs (and easily pick up
* their fixes when they happen) --Mark */ * their fixes when they happen) --Mark */
static int walk_page_buffers( handle_t *handle, int walk_page_buffers( handle_t *handle,
struct buffer_head *head, struct buffer_head *head,
unsigned from, unsigned from,
unsigned to, unsigned to,
...@@ -388,95 +377,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
return handle; return handle;
} }
static int ocfs2_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
int ret;
struct buffer_head *di_bh = NULL;
struct inode *inode = page->mapping->host;
handle_t *handle = NULL;
struct ocfs2_dinode *di;
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
* us to continue here without rechecking the I/O against
* changed inode values.
*
* 1) We're currently holding the inode alloc lock, so no
* nodes can change it underneath us.
*
* 2) We've had to take the metadata lock at least once
* already to check for extending writes, suid removal, etc.
* The meta data update code then ensures that we don't get a
* stale inode allocation image (i_size, i_clusters, etc).
*/
ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
if (ret != 0) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_data_lock_with_page(inode, 1, page);
if (ret != 0) {
mlog_errno(ret);
goto out_unlock_meta;
}
handle = ocfs2_start_walk_page_trans(inode, page, from, to);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out_unlock_data;
}
/* Mark our buffer early. We'd rather catch this error up here
* as opposed to after a successful commit_write which would
* require us to set back inode->i_size. */
ret = ocfs2_journal_access(handle, inode, di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
/* might update i_size */
ret = generic_commit_write(file, page, from, to);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
di = (struct ocfs2_dinode *)di_bh->b_data;
/* ocfs2_mark_inode_dirty() is too heavy to use here. */
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
di->i_size = cpu_to_le64((u64)i_size_read(inode));
ret = ocfs2_journal_dirty(handle, di_bh);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
out_commit:
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out_unlock_data:
ocfs2_data_unlock(inode, 1);
out_unlock_meta:
ocfs2_meta_unlock(inode, 1);
out:
if (di_bh)
brelse(di_bh);
mlog_exit(ret);
return ret;
}
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{ {
sector_t status; sector_t status;
...@@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
down_read(&OCFS2_I(inode)->ip_alloc_sem); down_read(&OCFS2_I(inode)->ip_alloc_sem);
} }
err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
NULL);
if (!INODE_JOURNAL(inode)) { if (!INODE_JOURNAL(inode)) {
up_read(&OCFS2_I(inode)->ip_alloc_sem); up_read(&OCFS2_I(inode)->ip_alloc_sem);
...@@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create) struct buffer_head *bh_result, int create)
{ {
int ret; int ret;
u64 p_blkno, inode_blocks; u64 p_blkno, inode_blocks, contig_blocks;
int contig_blocks; unsigned int ext_flags;
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
...@@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
* nicely aligned and of the right size, so there's no need * nicely aligned and of the right size, so there's no need
* for us to check any of that. */ * for us to check any of that. */
spin_lock(&OCFS2_I(inode)->ip_lock); inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
OCFS2_I(inode)->ip_clusters);
/*
* For a read which begins past the end of file, we return a hole.
*/
if (!create && (iblock >= inode_blocks)) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = 0;
goto bail;
}
/* /*
* Any write past EOF is not allowed because we'd be extending. * Any write past EOF is not allowed because we'd be extending.
*/ */
if (create && (iblock + max_blocks) > inode_blocks) { if (create && (iblock + max_blocks) > inode_blocks) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = -EIO; ret = -EIO;
goto bail; goto bail;
} }
spin_unlock(&OCFS2_I(inode)->ip_lock);
/* This figures out the size of the next contiguous block, and /* This figures out the size of the next contiguous block, and
* our logical offset */ * our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks); &contig_blocks, &ext_flags);
if (ret) { if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock); (unsigned long long)iblock);
...@@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
goto bail; goto bail;
} }
if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
ocfs2_error(inode->i_sb,
"Inode %llu has a hole at block %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)iblock);
ret = -EROFS;
goto bail;
}
/*
* get_more_blocks() expects us to describe a hole by clearing
* the mapped bit on bh_result().
*
* Consider an unwritten extent as a hole.
*/
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
map_bh(bh_result, inode->i_sb, p_blkno); map_bh(bh_result, inode->i_sb, p_blkno);
else {
/*
* ocfs2_prepare_inode_for_write() should have caught
* the case where we'd be filling a hole and triggered
* a buffered write instead.
*/
if (create) {
ret = -EIO;
mlog_errno(ret);
goto bail;
}
clear_buffer_mapped(bh_result);
}
/* make sure we don't map more than max_blocks blocks here as /* make sure we don't map more than max_blocks blocks here as
that's all the kernel will handle at this point. */ that's all the kernel will handle at this point. */
...@@ -606,12 +522,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
void *private) void *private)
{ {
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
int level;
/* this io's submitter should not have unlocked this before we could */ /* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
ocfs2_iocb_clear_rw_locked(iocb); ocfs2_iocb_clear_rw_locked(iocb);
level = ocfs2_iocb_rw_locked_level(iocb);
if (!level)
up_read(&inode->i_alloc_sem); up_read(&inode->i_alloc_sem);
ocfs2_rw_unlock(inode, 0); ocfs2_rw_unlock(inode, level);
} }
/* /*
...@@ -647,16 +568,19 @@ static ssize_t ocfs2_direct_IO(int rw, ...@@ -647,16 +568,19 @@ static ssize_t ocfs2_direct_IO(int rw,
mlog_entry_void(); mlog_entry_void();
if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
/* /*
* We get PR data locks even for O_DIRECT. This allows * We get PR data locks even for O_DIRECT. This
* concurrent O_DIRECT I/O but doesn't let O_DIRECT with * allows concurrent O_DIRECT I/O but doesn't let
* extending and buffered zeroing writes race. If they did * O_DIRECT with extending and buffered zeroing writes
* race then the buffered zeroing could be written back after * race. If they did race then the buffered zeroing
* the O_DIRECT I/O. It's one thing to tell people not to mix * could be written back after the O_DIRECT I/O. It's
* buffered and O_DIRECT writes, but expecting them to * one thing to tell people not to mix buffered and
* understand that file extension is also an implicit buffered * O_DIRECT writes, but expecting them to understand
* write is too much. By getting the PR we force writeback of * that file extension is also an implicit buffered
* the buffered zeroing before proceeding. * write is too much. By getting the PR we force
* writeback of the buffered zeroing before
* proceeding.
*/ */
ret = ocfs2_data_lock(inode, 0); ret = ocfs2_data_lock(inode, 0);
if (ret < 0) { if (ret < 0) {
...@@ -664,6 +588,7 @@ static ssize_t ocfs2_direct_IO(int rw,
goto out; goto out;
} }
ocfs2_data_unlock(inode, 0); ocfs2_data_unlock(inode, 0);
}
ret = blockdev_direct_IO_no_locking(rw, iocb, inode, ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
inode->i_sb->s_bdev, iov, offset, inode->i_sb->s_bdev, iov, offset,
...@@ -675,11 +600,715 @@ static ssize_t ocfs2_direct_IO(int rw,
return ret; return ret;
} }
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
u32 cpos,
unsigned int *start,
unsigned int *end)
{
unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
unsigned int cpp;
cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
cluster_start = cpos % cpp;
cluster_start = cluster_start << osb->s_clustersize_bits;
cluster_end = cluster_start + osb->s_clustersize;
}
BUG_ON(cluster_start > PAGE_SIZE);
BUG_ON(cluster_end > PAGE_SIZE);
if (start)
*start = cluster_start;
if (end)
*end = cluster_end;
}
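/*
 * Illustrative, standalone sketch (not part of this patch): it reproduces
 * the arithmetic above for a hypothetical 64K-page / 4K-cluster
 * configuration, showing where cluster 'cpos' starts and ends inside its
 * page.  All values are example assumptions.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 16;	/* 64K pages */
	const unsigned int cluster_bits = 12;	/* 4K clusters */
	const unsigned int clustersize = 1U << cluster_bits;
	unsigned int cpos = 21;			/* example cluster offset */

	unsigned int cpp = 1U << (page_shift - cluster_bits); /* clusters per page */
	unsigned int cluster_start = (cpos % cpp) << cluster_bits;
	unsigned int cluster_end = cluster_start + clustersize;

	/* cpos 21 with 16 clusters per page lands in slot 5 of its page. */
	printf("cluster %u occupies bytes [%u, %u) of its page\n",
	       cpos, cluster_start, cluster_end);
	return 0;
}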
/*
* 'from' and 'to' are the region in the page to avoid zeroing.
*
* If pagesize > clustersize, this function will avoid zeroing outside
* of the cluster boundary.
*
* from == to == 0 is code for "zero the entire cluster region"
*/
static void ocfs2_clear_page_regions(struct page *page,
struct ocfs2_super *osb, u32 cpos,
unsigned from, unsigned to)
{
void *kaddr;
unsigned int cluster_start, cluster_end;
ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
kaddr = kmap_atomic(page, KM_USER0);
if (from || to) {
if (from > cluster_start)
memset(kaddr + cluster_start, 0, from - cluster_start);
if (to < cluster_end)
memset(kaddr + to, 0, cluster_end - to);
} else {
memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
}
kunmap_atomic(kaddr, KM_USER0);
}
/*
 * Some of this is taken from block_prepare_write(). We already have our
* mapping by now though, and the entire write will be allocating or
* it won't, so not much need to use BH_New.
*
* This will also skip zeroing, which is handled externally.
*/
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
struct inode *inode, unsigned int from,
unsigned int to, int new)
{
int ret = 0;
struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
unsigned int block_end, block_start;
unsigned int bsize = 1 << inode->i_blkbits;
if (!page_has_buffers(page))
create_empty_buffers(page, bsize, 0);
head = page_buffers(page);
for (bh = head, block_start = 0; bh != head || !block_start;
bh = bh->b_this_page, block_start += bsize) {
block_end = block_start + bsize;
/*
* Ignore blocks outside of our i/o range -
* they may belong to unallocated clusters.
*/
if (block_start >= to || block_end <= from) {
if (PageUptodate(page))
set_buffer_uptodate(bh);
continue;
}
/*
* For an allocating write with cluster size >= page
* size, we always write the entire page.
*/
if (buffer_new(bh))
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
map_bh(bh, inode->i_sb, *p_blkno);
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
(block_start < from || block_end > to)) {
ll_rw_block(READ, 1, &bh);
*wait_bh++=bh;
}
*p_blkno = *p_blkno + 1;
}
/*
* If we issued read requests - let them complete.
*/
while(wait_bh > wait) {
wait_on_buffer(*--wait_bh);
if (!buffer_uptodate(*wait_bh))
ret = -EIO;
}
if (ret == 0 || !new)
return ret;
/*
* If we get -EIO above, zero out any newly allocated blocks
* to avoid exposing stale data.
*/
bh = head;
block_start = 0;
do {
void *kaddr;
block_end = block_start + bsize;
if (block_end <= from)
goto next_bh;
if (block_start >= to)
break;
kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr+block_start, 0, bh->b_size);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
next_bh:
block_start = block_end;
bh = bh->b_this_page;
} while (bh != head);
return ret;
}
/*
* This will copy user data from the buffer page in the splice
* context.
*
* For now, we ignore SPLICE_F_MOVE as that would require some extra
* communication out all the way to ocfs2_write().
*/
int ocfs2_map_and_write_splice_data(struct inode *inode,
struct ocfs2_write_ctxt *wc, u64 *p_blkno,
unsigned int *ret_from, unsigned int *ret_to)
{
int ret;
unsigned int to, from, cluster_start, cluster_end;
char *src, *dst;
struct ocfs2_splice_write_priv *sp = wc->w_private;
struct pipe_buffer *buf = sp->s_buf;
unsigned long bytes, src_from;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
&cluster_end);
from = sp->s_offset;
src_from = sp->s_buf_offset;
bytes = wc->w_count;
if (wc->w_large_pages) {
/*
* For cluster size < page size, we have to
* calculate pos within the cluster and obey
* the rightmost boundary.
*/
bytes = min(bytes, (unsigned long)(osb->s_clustersize
- (wc->w_pos & (osb->s_clustersize - 1))));
}
to = from + bytes;
if (wc->w_this_page_new)
ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
cluster_start, cluster_end, 1);
else
ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
from, to, 0);
if (ret) {
mlog_errno(ret);
goto out;
}
BUG_ON(from > PAGE_CACHE_SIZE);
BUG_ON(to > PAGE_CACHE_SIZE);
BUG_ON(from > osb->s_clustersize);
BUG_ON(to > osb->s_clustersize);
src = buf->ops->map(sp->s_pipe, buf, 1);
dst = kmap_atomic(wc->w_this_page, KM_USER1);
memcpy(dst + from, src + src_from, bytes);
kunmap_atomic(wc->w_this_page, KM_USER1);
buf->ops->unmap(sp->s_pipe, buf, src);
wc->w_finished_copy = 1;
*ret_from = from;
*ret_to = to;
out:
return bytes ? (unsigned int)bytes : ret;
}
/*
* This will copy user data from the iovec in the buffered write
* context.
*/
int ocfs2_map_and_write_user_data(struct inode *inode,
struct ocfs2_write_ctxt *wc, u64 *p_blkno,
unsigned int *ret_from, unsigned int *ret_to)
{
int ret;
unsigned int to, from, cluster_start, cluster_end;
unsigned long bytes, src_from;
char *dst;
struct ocfs2_buffered_write_priv *bp = wc->w_private;
const struct iovec *cur_iov = bp->b_cur_iov;
char __user *buf;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
&cluster_end);
buf = cur_iov->iov_base + bp->b_cur_off;
src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
/*
* This is a lot of comparisons, but it reads quite
* easily, which is important here.
*/
/* Stay within the src page */
bytes = PAGE_SIZE - src_from;
/* Stay within the vector */
bytes = min(bytes,
(unsigned long)(cur_iov->iov_len - bp->b_cur_off));
/* Stay within count */
bytes = min(bytes, (unsigned long)wc->w_count);
/*
* For clustersize > page size, just stay within
* target page, otherwise we have to calculate pos
* within the cluster and obey the rightmost
* boundary.
*/
if (wc->w_large_pages) {
/*
* For cluster size < page size, we have to
* calculate pos within the cluster and obey
* the rightmost boundary.
*/
bytes = min(bytes, (unsigned long)(osb->s_clustersize
- (wc->w_pos & (osb->s_clustersize - 1))));
} else {
/*
* cluster size > page size is the most common
* case - we just stay within the target page
* boundary.
*/
bytes = min(bytes, PAGE_CACHE_SIZE - from);
}
to = from + bytes;
if (wc->w_this_page_new)
ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
cluster_start, cluster_end, 1);
else
ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
from, to, 0);
if (ret) {
mlog_errno(ret);
goto out;
}
BUG_ON(from > PAGE_CACHE_SIZE);
BUG_ON(to > PAGE_CACHE_SIZE);
BUG_ON(from > osb->s_clustersize);
BUG_ON(to > osb->s_clustersize);
dst = kmap(wc->w_this_page);
memcpy(dst + from, bp->b_src_buf + src_from, bytes);
kunmap(wc->w_this_page);
/*
* XXX: This is slow, but simple. The caller of
* ocfs2_buffered_write_cluster() is responsible for
* passing through the iovecs, so it's difficult to
* predict what our next step is in here after our
* initial write. A future version should be pushing
* that iovec manipulation further down.
*
* By setting this, we indicate that a copy from user
* data was done, and subsequent calls for this
* cluster will skip copying more data.
*/
wc->w_finished_copy = 1;
*ret_from = from;
*ret_to = to;
out:
return bytes ? (unsigned int)bytes : ret;
}
/*
* Map, fill and write a page to disk.
*
* The work of copying data is done via callback. Newly allocated
* pages which don't take user data will be zero'd (set 'new' to
* indicate an allocating write)
*
* Returns a negative error code or the number of bytes copied into
* the page.
*/
int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
u64 *p_blkno, struct page *page,
struct ocfs2_write_ctxt *wc, int new)
{
int ret, copied = 0;
unsigned int from = 0, to = 0;
unsigned int cluster_start, cluster_end;
unsigned int zero_from = 0, zero_to = 0;
ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
&cluster_start, &cluster_end);
if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
&& !wc->w_finished_copy) {
wc->w_this_page = page;
wc->w_this_page_new = new;
ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
copied = ret;
zero_from = from;
zero_to = to;
if (new) {
from = cluster_start;
to = cluster_end;
}
} else {
/*
* If we haven't allocated the new page yet, we
* shouldn't be writing it out without copying user
* data. This is likely a math error from the caller.
*/
BUG_ON(!new);
from = cluster_start;
to = cluster_end;
ret = ocfs2_map_page_blocks(page, p_blkno, inode,
cluster_start, cluster_end, 1);
if (ret) {
mlog_errno(ret);
goto out;
}
}
/*
* Parts of newly allocated pages need to be zero'd.
*
* Above, we have also rewritten 'to' and 'from' - as far as
* the rest of the function is concerned, the entire cluster
* range inside of a page needs to be written.
*
* We can skip this if the page is up to date - it's already
* been zero'd from being read in as a hole.
*/
if (new && !PageUptodate(page))
ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
wc->w_cpos, zero_from, zero_to);
flush_dcache_page(page);
if (ocfs2_should_order_data(inode)) {
ret = walk_page_buffers(handle,
page_buffers(page),
from, to, NULL,
ocfs2_journal_dirty_data);
if (ret < 0)
mlog_errno(ret);
}
/*
* We don't use generic_commit_write() because we need to
* handle our own i_size update.
*/
ret = block_commit_write(page, from, to);
if (ret)
mlog_errno(ret);
out:
return copied ? copied : ret;
}
/*
* Do the actual write of some data into an inode. Optionally allocate
* in order to fulfill the write.
*
* cpos is the logical cluster offset within the file to write at
*
 * 'phys' is the physical mapping of that offset. A 'phys' value of
* zero indicates that allocation is required. In this case, data_ac
* and meta_ac should be valid (meta_ac can be null if metadata
* allocation isn't required).
*/
static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
struct buffer_head *di_bh,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_write_ctxt *wc)
{
int ret, i, numpages = 1, new;
unsigned int copied = 0;
u32 tmp_pos;
u64 v_blkno, p_blkno;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
unsigned long index, start;
struct page **cpages;
new = phys == 0 ? 1 : 0;
/*
* Figure out how many pages we'll be manipulating here. For
 * a non-allocating write, we just change the one
 * page. Otherwise, we'll need a whole cluster's worth.
*/
if (new)
numpages = ocfs2_pages_per_cluster(inode->i_sb);
cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
if (!cpages) {
ret = -ENOMEM;
mlog_errno(ret);
return ret;
}
/*
* Fill our page array first. That way we've grabbed enough so
* that we can zero and flush if we error after adding the
* extent.
*/
if (new) {
start = ocfs2_align_clusters_to_page_index(inode->i_sb,
wc->w_cpos);
v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
} else {
start = wc->w_pos >> PAGE_CACHE_SHIFT;
v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
}
for(i = 0; i < numpages; i++) {
index = start + i;
cpages[i] = grab_cache_page(mapping, index);
if (!cpages[i]) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
}
if (new) {
/*
* This is safe to call with the page locks - it won't take
* any additional semaphores or cluster locks.
*/
tmp_pos = wc->w_cpos;
ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
&tmp_pos, 1, di_bh, handle,
data_ac, meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
* internal tree allocation code should know how to increase
* transaction credits itself.
*
* If need be, we could handle -EAGAIN for a
* RESTART_TRANS here.
*/
mlog_bug_on_msg(ret == -EAGAIN,
"Inode %llu: EAGAIN return during allocation.\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
}
ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
NULL);
if (ret < 0) {
/*
* XXX: Should we go readonly here?
*/
mlog_errno(ret);
goto out;
}
BUG_ON(p_blkno == 0);
for(i = 0; i < numpages; i++) {
ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
wc, new);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
copied += ret;
}
out:
for(i = 0; i < numpages; i++) {
unlock_page(cpages[i]);
mark_page_accessed(cpages[i]);
page_cache_release(cpages[i]);
}
kfree(cpages);
return copied ? copied : ret;
}
static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
struct ocfs2_super *osb, loff_t pos,
size_t count, ocfs2_page_writer *cb,
void *cb_priv)
{
wc->w_count = count;
wc->w_pos = pos;
wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
wc->w_finished_copy = 0;
if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
wc->w_large_pages = 1;
else
wc->w_large_pages = 0;
wc->w_write_data_page = cb;
wc->w_private = cb_priv;
}
/*
* Write a cluster to an inode. The cluster may not be allocated yet,
* in which case it will be. This only exists for buffered writes -
* O_DIRECT takes a more "traditional" path through the kernel.
*
* The caller is responsible for incrementing pos, written counts, etc
*
* For file systems that don't support sparse files, pre-allocation
* and page zeroing up until cpos should be done prior to this
* function call.
*
* Callers should be holding i_sem, and the rw cluster lock.
*
* Returns the number of user bytes written, or less than zero for
* error.
*/
ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
size_t count, ocfs2_page_writer *actor,
void *priv)
{
int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
ssize_t written = 0;
u32 phys;
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle;
struct ocfs2_write_ctxt wc;
ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
ret = ocfs2_meta_lock(inode, &di_bh, 1);
if (ret) {
mlog_errno(ret);
goto out;
}
di = (struct ocfs2_dinode *)di_bh->b_data;
/*
* Take alloc sem here to prevent concurrent lookups. That way
* the mapping, zeroing and tree manipulation within
* ocfs2_write() will be safe against ->readpage(). This
* should also serve to lock out allocation from a shared
* writeable region.
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
if (ret) {
mlog_errno(ret);
goto out_meta;
}
/* phys == 0 means that allocation is required. */
if (phys == 0) {
ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
if (ret) {
mlog_errno(ret);
goto out_meta;
}
credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
}
ret = ocfs2_data_lock(inode, 1);
if (ret) {
mlog_errno(ret);
goto out_meta;
}
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out_data;
}
written = ocfs2_write(file, phys, handle, di_bh, data_ac,
meta_ac, &wc);
if (written < 0) {
ret = written;
mlog_errno(ret);
goto out_commit;
}
ret = ocfs2_journal_access(handle, inode, di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
pos += written;
if (pos > inode->i_size) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
ret = ocfs2_journal_dirty(handle, di_bh);
if (ret)
mlog_errno(ret);
out_commit:
ocfs2_commit_trans(osb, handle);
out_data:
ocfs2_data_unlock(inode, 1);
out_meta:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_meta_unlock(inode, 1);
out:
brelse(di_bh);
if (data_ac)
ocfs2_free_alloc_context(data_ac);
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
return written ? written : ret;
}
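Since the comment above leaves position and count bookkeeping to the caller, a minimal sketch of such a caller loop may help. The helper name is hypothetical; this is not the actual file.c code from this series:

/*
 * Hypothetical caller loop, not the actual ocfs2_file_aio_write() path:
 * the actor/priv pair would be ocfs2_map_and_write_user_data plus a
 * struct ocfs2_buffered_write_priv describing the source iovec. A real
 * caller must also advance the iovec bookkeeping carried in priv.
 */
static ssize_t example_buffered_write_loop(struct file *file, loff_t pos,
                                           size_t count,
                                           ocfs2_page_writer *actor,
                                           void *priv)
{
        ssize_t copied, total = 0;

        while (count) {
                copied = ocfs2_buffered_write_cluster(file, pos, count,
                                                      actor, priv);
                if (copied <= 0)
                        return total ? total : copied;

                /* The caller owns position and remaining-count updates. */
                pos += copied;
                count -= copied;
                total += copied;
        }

        return total;
}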
const struct address_space_operations ocfs2_aops = { const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage, .readpage = ocfs2_readpage,
.writepage = ocfs2_writepage, .writepage = ocfs2_writepage,
.prepare_write = ocfs2_prepare_write,
.commit_write = ocfs2_commit_write,
.bmap = ocfs2_bmap, .bmap = ocfs2_bmap,
.sync_page = block_sync_page, .sync_page = block_sync_page,
.direct_IO = ocfs2_direct_IO, .direct_IO = ocfs2_direct_IO,
......
...@@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, ...@@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
unsigned from, unsigned from,
unsigned to); unsigned to);
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
struct inode *inode, unsigned int from,
unsigned int to, int new);
int walk_page_buffers( handle_t *handle,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
int (*fn)( handle_t *handle,
struct buffer_head *bh));
struct ocfs2_write_ctxt;
typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
u64 *, unsigned int *, unsigned int *);
ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
size_t count, ocfs2_page_writer *actor,
void *priv);
struct ocfs2_write_ctxt {
size_t w_count;
loff_t w_pos;
u32 w_cpos;
unsigned int w_finished_copy;
/* This is true if page_size > cluster_size */
unsigned int w_large_pages;
/* Filler callback and private data */
ocfs2_page_writer *w_write_data_page;
void *w_private;
/* Only valid for the filler callback */
struct page *w_this_page;
unsigned int w_this_page_new;
};
struct ocfs2_buffered_write_priv {
char *b_src_buf;
const struct iovec *b_cur_iov; /* Current iovec */
size_t b_cur_off; /* Offset in the
* current iovec */
};
int ocfs2_map_and_write_user_data(struct inode *inode,
struct ocfs2_write_ctxt *wc,
u64 *p_blkno,
unsigned int *ret_from,
unsigned int *ret_to);
struct ocfs2_splice_write_priv {
struct splice_desc *s_sd;
struct pipe_buffer *s_buf;
struct pipe_inode_info *s_pipe;
/* Neither offset value is ever larger than one page */
unsigned int s_offset;
unsigned int s_buf_offset;
};
int ocfs2_map_and_write_splice_data(struct inode *inode,
struct ocfs2_write_ctxt *wc,
u64 *p_blkno,
unsigned int *ret_from,
unsigned int *ret_to);
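A hedged sketch of how the buffered-write private data could be paired with its actor; the wrapper is hypothetical. A real caller also has to set b_src_buf to the source data, since the actor copies from it; the splice path would be analogous via struct ocfs2_splice_write_priv and ocfs2_map_and_write_splice_data.

/*
 * Hypothetical wrapper, not part of the patch: pair the buffered-write
 * private data with its actor before handing both to
 * ocfs2_buffered_write_cluster().
 */
static ssize_t example_write_one_cluster(struct file *file, loff_t pos,
                                         size_t count,
                                         const struct iovec *iov)
{
        struct ocfs2_buffered_write_priv bp = {
                .b_cur_iov = iov,       /* current source iovec */
                .b_cur_off = 0,         /* offset within that iovec */
        };

        return ocfs2_buffered_write_cluster(file, pos, count,
                                            ocfs2_map_and_write_user_data,
                                            &bp);
}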
/* all ocfs2_dio_end_io()'s fault */ /* all ocfs2_dio_end_io()'s fault */
#define ocfs2_iocb_is_rw_locked(iocb) \ #define ocfs2_iocb_is_rw_locked(iocb) \
test_bit(0, (unsigned long *)&iocb->private) test_bit(0, (unsigned long *)&iocb->private)
#define ocfs2_iocb_set_rw_locked(iocb) \ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
set_bit(0, (unsigned long *)&iocb->private) {
set_bit(0, (unsigned long *)&iocb->private);
if (level)
set_bit(1, (unsigned long *)&iocb->private);
else
clear_bit(1, (unsigned long *)&iocb->private);
}
#define ocfs2_iocb_clear_rw_locked(iocb) \ #define ocfs2_iocb_clear_rw_locked(iocb) \
clear_bit(0, (unsigned long *)&iocb->private) clear_bit(0, (unsigned long *)&iocb->private)
#define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(1, (unsigned long *)&iocb->private)
#endif /* OCFS2_FILE_H */ #endif /* OCFS2_FILE_H */
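A sketch (not part of the patch) of how the two bits in iocb->private are intended to cooperate around a direct I/O: bit 0 records that the rw cluster lock is held, bit 1 remembers the level so the completion side can drop the matching one.

/*
 * Sketch only: the submission side takes the rw lock and stashes the
 * level in the iocb; the completion side, mirroring ocfs2_dio_end_io(),
 * reads it back. The real paths also manage i_alloc_sem around the
 * PR-mode case, as seen in ocfs2_dio_end_io().
 */
static int example_dio_rw_locking(struct kiocb *iocb, struct inode *inode,
                                  int level)
{
        int ret = ocfs2_rw_lock(inode, level);  /* level: 1 == write/EX */
        if (ret < 0)
                return ret;

        ocfs2_iocb_set_rw_locked(iocb, level);  /* remember it in the iocb */

        /* ... blockdev_direct_IO_no_locking() would run here ... */

        /* Completion side: */
        ocfs2_iocb_clear_rw_locked(iocb);       /* clears bit 0 only */
        level = ocfs2_iocb_rw_locked_level(iocb);
        ocfs2_rw_unlock(inode, level);

        return 0;
}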
...@@ -46,6 +46,7 @@ ...@@ -46,6 +46,7 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/reboot.h>
#include "heartbeat.h" #include "heartbeat.h"
#include "nodemanager.h" #include "nodemanager.h"
...@@ -72,7 +73,9 @@ static void o2quo_fence_self(void) ...@@ -72,7 +73,9 @@ static void o2quo_fence_self(void)
/* panic spins with interrupts enabled. with preempt /* panic spins with interrupts enabled. with preempt
* threads can still schedule, etc, etc */ * threads can still schedule, etc, etc */
o2hb_stop_all_regions(); o2hb_stop_all_regions();
panic("ocfs2 is very sorry to be fencing this system by panicing\n");
printk("ocfs2 is very sorry to be fencing this system by restarting\n");
emergency_restart();
} }
/* Indicate that a timeout occurred on a heartbeat region write. The /* Indicate that a timeout occurred on a heartbeat region write. The
......
...@@ -38,6 +38,9 @@ ...@@ -38,6 +38,9 @@
* locking semantics of the file system using the protocol. It should * locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't. * be somewhere else, I'm sure, but right now it isn't.
* *
* New in version 8:
* - Replace delete inode votes with a cluster lock
*
* New in version 7: * New in version 7:
* - DLM join domain includes the live nodemap * - DLM join domain includes the live nodemap
* *
...@@ -57,7 +60,7 @@ ...@@ -57,7 +60,7 @@
* - full 64 bit i_size in the metadata lock lvbs * - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down * - introduction of "rw" lock and pushing meta/data locking down
*/ */
#define O2NET_PROTOCOL_VERSION 7ULL #define O2NET_PROTOCOL_VERSION 8ULL
struct o2net_handshake { struct o2net_handshake {
__be64 protocol_version; __be64 protocol_version;
__be64 connector_id; __be64 connector_id;
......
...@@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb, ...@@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb,
{ {
int status; int status;
int extend; int extend;
u64 p_blkno; u64 p_blkno, v_blkno;
spin_lock(&OCFS2_I(dir)->ip_lock); spin_lock(&OCFS2_I(dir)->ip_lock);
extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
spin_unlock(&OCFS2_I(dir)->ip_lock); spin_unlock(&OCFS2_I(dir)->ip_lock);
if (extend) { if (extend) {
status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, u32 offset = OCFS2_I(dir)->ip_clusters;
parent_fe_bh, handle,
status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
1, parent_fe_bh, handle,
data_ac, meta_ac, NULL); data_ac, meta_ac, NULL);
BUG_ON(status == -EAGAIN); BUG_ON(status == -EAGAIN);
if (status < 0) { if (status < 0) {
...@@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb, ...@@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
} }
} }
status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
(sb->s_blocksize_bits - 9)), status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
1, &p_blkno, NULL);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
...@@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, ...@@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
dir_i_size += dir->i_sb->s_blocksize; dir_i_size += dir->i_sb->s_blocksize;
i_size_write(dir, dir_i_size); i_size_write(dir, dir_i_size);
dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); dir->i_blocks = ocfs2_inode_sector_count(dir);
status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
......
...@@ -430,11 +430,10 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) ...@@ -430,11 +430,10 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
dlm_lockres_put(res); dlm_lockres_put(res);
cond_resched_lock(&dlm->spinlock);
if (dropped) if (dropped)
goto redo_bucket; goto redo_bucket;
} }
cond_resched_lock(&dlm->spinlock);
num += n; num += n;
mlog(0, "%s: touched %d lockreses in bucket %d " mlog(0, "%s: touched %d lockreses in bucket %d "
"(tot=%d)\n", dlm->name, n, i, num); "(tot=%d)\n", dlm->name, n, i, num);
...@@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) ...@@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
{ {
int status = 0, tmpstat, node; int status = 0, tmpstat, node;
struct domain_join_ctxt *ctxt; struct domain_join_ctxt *ctxt;
enum dlm_query_join_response response; enum dlm_query_join_response response = JOIN_DISALLOW;
mlog_entry("%p", dlm); mlog_entry("%p", dlm);
......
...@@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) ...@@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
} }
} while (status != 0); } while (status != 0);
spin_lock(&dlm_reco_state_lock);
switch (ndata->state) { switch (ndata->state) {
case DLM_RECO_NODE_DATA_INIT: case DLM_RECO_NODE_DATA_INIT:
case DLM_RECO_NODE_DATA_FINALIZE_SENT: case DLM_RECO_NODE_DATA_FINALIZE_SENT:
...@@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) ...@@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
ndata->node_num, dead_node); ndata->node_num, dead_node);
break; break;
} }
spin_unlock(&dlm_reco_state_lock);
} }
mlog(0, "done requesting all lock info\n"); mlog(0, "done requesting all lock info\n");
......
...@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { ...@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.flags = 0, .flags = 0,
}; };
static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
.get_osb = ocfs2_get_inode_osb,
.flags = 0,
};
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{ {
return lockres->l_type == OCFS2_LOCK_TYPE_META || return lockres->l_type == OCFS2_LOCK_TYPE_META ||
lockres->l_type == OCFS2_LOCK_TYPE_DATA || lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
lockres->l_type == OCFS2_LOCK_TYPE_RW; lockres->l_type == OCFS2_LOCK_TYPE_RW ||
lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
} }
static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
...@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, ...@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
case OCFS2_LOCK_TYPE_DATA: case OCFS2_LOCK_TYPE_DATA:
ops = &ocfs2_inode_data_lops; ops = &ocfs2_inode_data_lops;
break; break;
case OCFS2_LOCK_TYPE_OPEN:
ops = &ocfs2_inode_open_lops;
break;
default: default:
mlog_bug_on_msg(1, "type: %d\n", type); mlog_bug_on_msg(1, "type: %d\n", type);
ops = NULL; /* thanks, gcc */ ops = NULL; /* thanks, gcc */
...@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode) ...@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
goto bail; goto bail;
} }
ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
if (ret) {
mlog_errno(ret);
goto bail;
}
bail: bail:
mlog_exit(ret); mlog_exit(ret);
return ret; return ret;
...@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write) ...@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
mlog_exit_void(); mlog_exit_void();
} }
/*
 * ocfs2_open_lock always gets a PR mode lock.
*/
int ocfs2_open_lock(struct inode *inode)
{
int status = 0;
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
BUG_ON(!inode);
mlog_entry_void();
mlog(0, "inode %llu take PRMODE open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
if (ocfs2_mount_local(osb))
goto out;
lockres = &OCFS2_I(inode)->ip_open_lockres;
status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
LKM_PRMODE, 0, 0);
if (status < 0)
mlog_errno(status);
out:
mlog_exit(status);
return status;
}
int ocfs2_try_open_lock(struct inode *inode, int write)
{
int status = 0, level;
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
BUG_ON(!inode);
mlog_entry_void();
mlog(0, "inode %llu try to take %s open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
if (ocfs2_mount_local(osb))
goto out;
lockres = &OCFS2_I(inode)->ip_open_lockres;
level = write ? LKM_EXMODE : LKM_PRMODE;
/*
 * The file system may already be holding a PRMODE/EXMODE open lock.
* Since we pass LKM_NOQUEUE, the request won't block waiting on
* other nodes and the -EAGAIN will indicate to the caller that
* this inode is still in use.
*/
status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
level, LKM_NOQUEUE, 0);
out:
mlog_exit(status);
return status;
}
/*
 * ocfs2_open_unlock unlocks PR and EX mode open locks.
*/
void ocfs2_open_unlock(struct inode *inode)
{
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry_void();
mlog(0, "inode %llu drop open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
if (ocfs2_mount_local(osb))
goto out;
if(lockres->l_ro_holders)
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
LKM_PRMODE);
if(lockres->l_ex_holders)
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
LKM_EXMODE);
out:
mlog_exit_void();
}
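Protocol version 8 replaces delete-inode votes with this open lock. A hedged sketch of how a deletion path could use the trylock to detect remote users; the helper name is hypothetical:

/*
 * Hypothetical helper, not the actual deletion code: an EX trylock on
 * the open lock only succeeds when no other node still holds this inode
 * open; -EAGAIN therefore means "still in use, don't wipe it yet".
 */
static int example_inode_is_unused(struct inode *inode)
{
        int status = ocfs2_try_open_lock(inode, 1);     /* 1 == EX */

        if (status == -EAGAIN)
                return 0;       /* another node still has the inode open */
        if (status < 0)
                return status;  /* genuine error */

        return 1;               /* safe to reclaim the inode */
}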
int ocfs2_data_lock_full(struct inode *inode, int ocfs2_data_lock_full(struct inode *inode,
int write, int write,
int arg_flags) int arg_flags)
...@@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) ...@@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
inode->i_blocks = 0; inode->i_blocks = 0;
else else
inode->i_blocks = inode->i_blocks = ocfs2_inode_sector_count(inode);
ocfs2_align_bytes_to_sectors(i_size_read(inode));
inode->i_uid = be32_to_cpu(lvb->lvb_iuid); inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
inode->i_gid = be32_to_cpu(lvb->lvb_igid); inode->i_gid = be32_to_cpu(lvb->lvb_igid);
...@@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode, ...@@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode,
{ {
int status = 0; int status = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = NULL; struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
struct ocfs2_dinode *fe; struct ocfs2_dinode *fe;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry_void(); mlog_entry_void();
if (ocfs2_mount_local(osb))
goto bail;
spin_lock(&oi->ip_lock); spin_lock(&oi->ip_lock);
if (oi->ip_flags & OCFS2_INODE_DELETED) { if (oi->ip_flags & OCFS2_INODE_DELETED) {
mlog(0, "Orphaned inode %llu was deleted while we " mlog(0, "Orphaned inode %llu was deleted while we "
...@@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode, ...@@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode,
} }
spin_unlock(&oi->ip_lock); spin_unlock(&oi->ip_lock);
if (!ocfs2_mount_local(osb)) {
lockres = &oi->ip_meta_lockres;
if (!ocfs2_should_refresh_lock_res(lockres)) if (!ocfs2_should_refresh_lock_res(lockres))
goto bail; goto bail;
}
/* This will discard any caching information we might have had /* This will discard any caching information we might have had
* for the inode metadata. */ * for the inode metadata. */
ocfs2_metadata_cache_purge(inode); ocfs2_metadata_cache_purge(inode);
/* will do nothing for inode types that don't use the extent
* map (directories, bitmap files, etc) */
ocfs2_extent_map_trunc(inode, 0); ocfs2_extent_map_trunc(inode, 0);
if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
mlog(0, "Trusting LVB on inode %llu\n", mlog(0, "Trusting LVB on inode %llu\n",
(unsigned long long)oi->ip_blkno); (unsigned long long)oi->ip_blkno);
ocfs2_refresh_inode_from_lvb(inode); ocfs2_refresh_inode_from_lvb(inode);
...@@ -1558,7 +1662,6 @@ static int ocfs2_meta_lock_update(struct inode *inode, ...@@ -1558,7 +1662,6 @@ static int ocfs2_meta_lock_update(struct inode *inode,
status = 0; status = 0;
bail_refresh: bail_refresh:
if (lockres)
ocfs2_complete_lock_res_refresh(lockres, status); ocfs2_complete_lock_res_refresh(lockres, status);
bail: bail:
mlog_exit(status); mlog_exit(status);
...@@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode, ...@@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
wait_event(osb->recovery_event, wait_event(osb->recovery_event,
ocfs2_node_map_is_empty(osb, &osb->recovery_map)); ocfs2_node_map_is_empty(osb, &osb->recovery_map));
acquired = 0;
lockres = &OCFS2_I(inode)->ip_meta_lockres; lockres = &OCFS2_I(inode)->ip_meta_lockres;
level = ex ? LKM_EXMODE : LKM_PRMODE; level = ex ? LKM_EXMODE : LKM_PRMODE;
dlm_flags = 0; dlm_flags = 0;
...@@ -2458,12 +2560,19 @@ int ocfs2_drop_inode_locks(struct inode *inode) ...@@ -2458,12 +2560,19 @@ int ocfs2_drop_inode_locks(struct inode *inode)
* ocfs2_clear_inode has done it for us. */ * ocfs2_clear_inode has done it for us. */
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_data_lockres); &OCFS2_I(inode)->ip_open_lockres);
if (err < 0) if (err < 0)
mlog_errno(err); mlog_errno(err);
status = err; status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_data_lockres);
if (err < 0)
mlog_errno(err);
if (err < 0 && !status)
status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_meta_lockres); &OCFS2_I(inode)->ip_meta_lockres);
if (err < 0) if (err < 0)
......
...@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode, ...@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
int write); int write);
int ocfs2_rw_lock(struct inode *inode, int write); int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write); void ocfs2_rw_unlock(struct inode *inode, int write);
int ocfs2_open_lock(struct inode *inode);
int ocfs2_try_open_lock(struct inode *inode, int write);
void ocfs2_open_unlock(struct inode *inode);
int ocfs2_meta_lock_atime(struct inode *inode, int ocfs2_meta_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt, struct vfsmount *vfsmnt,
int *level); int *level);
......
...@@ -3,8 +3,7 @@ ...@@ -3,8 +3,7 @@
* *
* extent_map.c * extent_map.c
* *
* In-memory extent map for OCFS2. Man, this code was prettier in * Block/Cluster mapping functions
* the library.
* *
* Copyright (C) 2004 Oracle. All rights reserved. * Copyright (C) 2004 Oracle. All rights reserved.
* *
...@@ -26,1016 +25,528 @@ ...@@ -26,1016 +25,528 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#define MLOG_MASK_PREFIX ML_EXTENT_MAP #define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h> #include <cluster/masklog.h>
#include "ocfs2.h" #include "ocfs2.h"
#include "alloc.h"
#include "extent_map.h" #include "extent_map.h"
#include "inode.h" #include "inode.h"
#include "super.h" #include "super.h"
#include "buffer_head_io.h" #include "buffer_head_io.h"
/*
* SUCK SUCK SUCK
* Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
*/
struct ocfs2_extent_map_entry {
struct rb_node e_node;
int e_tree_depth;
struct ocfs2_extent_rec e_rec;
};
struct ocfs2_em_insert_context {
int need_left;
int need_right;
struct ocfs2_extent_map_entry *new_ent;
struct ocfs2_extent_map_entry *old_ent;
struct ocfs2_extent_map_entry *left_ent;
struct ocfs2_extent_map_entry *right_ent;
};
static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
u32 cpos, u32 clusters,
struct rb_node ***ret_p,
struct rb_node **ret_parent);
static int ocfs2_extent_map_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth);
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
struct ocfs2_extent_map_entry *ent);
static int ocfs2_extent_map_find_leaf(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_list *el);
static int ocfs2_extent_map_lookup_read(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_map_entry **ret_ent);
static int ocfs2_extent_map_try_insert(struct inode *inode,
struct ocfs2_extent_rec *rec,
int tree_depth,
struct ocfs2_em_insert_context *ctxt);
/* returns 1 only if the rec contains all the given clusters -- that is that
* rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
* clusters) is >= the argument's endpoint */
static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
u32 cpos, u32 clusters)
{
if (le32_to_cpu(rec->e_cpos) > cpos)
return 0;
if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters))
return 0;
return 1;
}
/* /*
* Find an entry in the tree that intersects the region passed in. * The extent caching implementation is intentionally trivial.
* Note that this will find straddled intervals, it is up to the
* callers to enforce any boundary conditions.
*
* Callers must hold ip_lock. This lookup is not guaranteed to return
* a tree_depth 0 match, and as such can race inserts if the lock
* were not held.
* *
* The rb_node garbage lets insertion share the search. Trivial * We only cache a small number of extents stored directly on the
* callers pass NULL. * inode, so linear order operations are acceptable. If we ever want
* to increase the size of the extent map, then these algorithms must
* get smarter.
*/ */
static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, void ocfs2_extent_map_init(struct inode *inode)
u32 cpos, u32 clusters,
struct rb_node ***ret_p,
struct rb_node **ret_parent)
{ {
struct rb_node **p = &em->em_extents.rb_node; struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct rb_node *parent = NULL;
struct ocfs2_extent_map_entry *ent = NULL;
while (*p)
{
parent = *p;
ent = rb_entry(parent, struct ocfs2_extent_map_entry,
e_node);
if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
p = &(*p)->rb_left;
ent = NULL;
} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
le32_to_cpu(ent->e_rec.e_clusters))) {
p = &(*p)->rb_right;
ent = NULL;
} else
break;
}
if (ret_p != NULL) oi->ip_extent_map.em_num_items = 0;
*ret_p = p; INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
if (ret_parent != NULL)
*ret_parent = parent;
return ent;
} }
/* static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
* Find the leaf containing the interval we want. While we're on our unsigned int cpos,
* way down the tree, fill in every record we see at any depth, because struct ocfs2_extent_map_item **ret_emi)
* we might want it later.
*
* Note that this code is run without ip_lock. That's because it
* sleeps while reading. If someone is also filling the extent list at
* the same time we are, we might have to restart.
*/
static int ocfs2_extent_map_find_leaf(struct inode *inode,
u32 cpos, u32 clusters,
struct ocfs2_extent_list *el)
{ {
int i, ret; unsigned int range;
struct buffer_head *eb_bh = NULL; struct ocfs2_extent_map_item *emi;
u64 blkno;
u32 rec_end;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_rec *rec;
/*
* The bh data containing the el cannot change here, because
* we hold alloc_sem. So we can do this without other
* locks.
*/
while (el->l_tree_depth)
{
blkno = 0;
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i];
rec_end = (le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters));
ret = -EBADR;
if (rec_end > OCFS2_I(inode)->ip_clusters) {
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
i,
(unsigned long long)le64_to_cpu(rec->e_blkno),
(unsigned long long)OCFS2_I(inode)->ip_blkno,
OCFS2_I(inode)->ip_clusters);
goto out_free;
}
if (rec_end <= cpos) {
ret = ocfs2_extent_map_insert(inode, rec,
le16_to_cpu(el->l_tree_depth));
if (ret && (ret != -EEXIST)) {
mlog_errno(ret);
goto out_free;
}
continue;
}
if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
ret = ocfs2_extent_map_insert(inode, rec,
le16_to_cpu(el->l_tree_depth));
if (ret && (ret != -EEXIST)) {
mlog_errno(ret);
goto out_free;
}
continue;
}
/*
* We've found a record that matches our
* interval. We don't insert it because we're
* about to traverse it.
*/
/* Check to see if we're stradling */
ret = -ESRCH;
if (!ocfs2_extent_rec_contains_clusters(rec,
cpos,
clusters)) {
mlog_errno(ret);
goto out_free;
}
/* *ret_emi = NULL;
* If we've already found a record, the el has
* two records covering the same interval.
* EEEK!
*/
ret = -EBADR;
if (blkno) {
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
cpos, clusters,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)blkno, i,
(unsigned long long)le64_to_cpu(rec->e_blkno));
goto out_free;
}
blkno = le64_to_cpu(rec->e_blkno); list_for_each_entry(emi, &em->em_list, ei_list) {
} range = emi->ei_cpos + emi->ei_clusters;
/* if (cpos >= emi->ei_cpos && cpos < range) {
* We don't support holes, and we're still up list_move(&emi->ei_list, &em->em_list);
* in the branches, so we'd better have found someone
*/
ret = -EBADR;
if (!blkno) {
ocfs2_error(inode->i_sb,
"No record found for (cpos = %u, clusters = %u) on inode %llu\n",
cpos, clusters,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog_errno(ret);
goto out_free;
}
if (eb_bh) { *ret_emi = emi;
brelse(eb_bh); break;
eb_bh = NULL;
}
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
blkno, &eb_bh, OCFS2_BH_CACHED,
inode);
if (ret) {
mlog_errno(ret);
goto out_free;
}
eb = (struct ocfs2_extent_block *)eb_bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
ret = -EIO;
goto out_free;
} }
el = &eb->h_list;
} }
}
BUG_ON(el->l_tree_depth); static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
unsigned int *phys, unsigned int *len,
unsigned int *flags)
{
unsigned int coff;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_extent_map_item *emi;
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { spin_lock(&oi->ip_lock);
rec = &el->l_recs[i];
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
OCFS2_I(inode)->ip_clusters) { if (emi) {
ret = -EBADR; coff = cpos - emi->ei_cpos;
mlog_errno(ret); *phys = emi->ei_phys + coff;
ocfs2_error(inode->i_sb, if (len)
"Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", *len = emi->ei_clusters - coff;
i, if (flags)
(unsigned long long)le64_to_cpu(rec->e_blkno), *flags = emi->ei_flags;
(unsigned long long)OCFS2_I(inode)->ip_blkno,
OCFS2_I(inode)->ip_clusters);
return ret;
} }
ret = ocfs2_extent_map_insert(inode, rec, spin_unlock(&oi->ip_lock);
le16_to_cpu(el->l_tree_depth));
if (ret && (ret != -EEXIST)) {
mlog_errno(ret);
goto out_free;
}
}
ret = 0; if (emi == NULL)
return -ENOENT;
out_free:
if (eb_bh)
brelse(eb_bh);
return ret; return 0;
} }
/* /*
* This lookup actually will read from disk. It has one invariant: * Forget about all clusters equal to or greater than cpos.
* It will never re-traverse blocks. This means that all inserts should
* be new regions or more granular regions (both allowed by insert).
*/ */
static int ocfs2_extent_map_lookup_read(struct inode *inode, void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
u32 cpos,
u32 clusters,
struct ocfs2_extent_map_entry **ret_ent)
{ {
int ret; struct list_head *p, *n;
u64 blkno; struct ocfs2_extent_map_item *emi;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_extent_map_entry *ent; struct ocfs2_extent_map *em = &oi->ip_extent_map;
struct buffer_head *bh = NULL; LIST_HEAD(tmp_list);
struct ocfs2_extent_block *eb; unsigned int range;
struct ocfs2_dinode *di;
struct ocfs2_extent_list *el; spin_lock(&oi->ip_lock);
list_for_each_safe(p, n, &em->em_list) {
spin_lock(&OCFS2_I(inode)->ip_lock); emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
if (ent) { if (emi->ei_cpos >= cpos) {
if (!ent->e_tree_depth) { /* Full truncate of this record. */
spin_unlock(&OCFS2_I(inode)->ip_lock); list_move(&emi->ei_list, &tmp_list);
*ret_ent = ent; BUG_ON(em->em_num_items == 0);
return 0; em->em_num_items--;
continue;
} }
blkno = le64_to_cpu(ent->e_rec.e_blkno);
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, range = emi->ei_cpos + emi->ei_clusters;
OCFS2_BH_CACHED, inode); if (range > cpos) {
if (ret) { /* Partial truncate */
mlog_errno(ret); emi->ei_clusters = cpos - emi->ei_cpos;
if (bh)
brelse(bh);
return ret;
} }
eb = (struct ocfs2_extent_block *)bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
brelse(bh);
return -EIO;
} }
el = &eb->h_list; spin_unlock(&oi->ip_lock);
} else {
spin_unlock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), list_for_each_safe(p, n, &tmp_list) {
OCFS2_I(inode)->ip_blkno, &bh, emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
OCFS2_BH_CACHED, inode); list_del(&emi->ei_list);
if (ret) { kfree(emi);
mlog_errno(ret);
if (bh)
brelse(bh);
return ret;
}
di = (struct ocfs2_dinode *)bh->b_data;
if (!OCFS2_IS_VALID_DINODE(di)) {
brelse(bh);
OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
return -EIO;
}
el = &di->id2.i_list;
} }
ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
brelse(bh);
if (ret) {
mlog_errno(ret);
return ret;
}
ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
if (!ent) {
ret = -ESRCH;
mlog_errno(ret);
return ret;
}
/* FIXME: Make sure this isn't a corruption */
BUG_ON(ent->e_tree_depth);
*ret_ent = ent;
return 0;
} }
/* /*
 * Callers must hold ip_lock. This can insert pieces of the tree, * Is any part of emi2 contained within emi1?
* thus racing lookup if the lock weren't held.
*/ */
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
struct ocfs2_extent_map_entry *ent) struct ocfs2_extent_map_item *emi2)
{ {
struct rb_node **p, *parent; unsigned int range1, range2;
struct ocfs2_extent_map_entry *old_ent;
old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), /*
le32_to_cpu(ent->e_rec.e_clusters), * Check if logical start of emi2 is inside emi1
&p, &parent); */
if (old_ent) range1 = emi1->ei_cpos + emi1->ei_clusters;
return -EEXIST; if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
return 1;
rb_link_node(&ent->e_node, parent, p); /*
rb_insert_color(&ent->e_node, &em->em_extents); * Check if logical end of emi2 is inside emi1
*/
range2 = emi2->ei_cpos + emi2->ei_clusters;
if (range2 > emi1->ei_cpos && range2 <= range1)
return 1;
return 0; return 0;
} }
static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
struct ocfs2_extent_map_item *src)
{
dest->ei_cpos = src->ei_cpos;
dest->ei_phys = src->ei_phys;
dest->ei_clusters = src->ei_clusters;
dest->ei_flags = src->ei_flags;
}
/* /*
* Simple rule: on any return code other than -EAGAIN, anything left * Try to merge emi with ins. Returns 1 if merge succeeds, zero
* in the insert_context will be freed. * otherwise.
*
* Simple rule #2: A return code of -EEXIST from this function or
* its calls to ocfs2_extent_map_insert_entry() signifies that another
* thread beat us to the insert. It is not an actual error, but it
* tells the caller we have no more work to do.
*/ */
static int ocfs2_extent_map_try_insert(struct inode *inode, static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
struct ocfs2_extent_rec *rec, struct ocfs2_extent_map_item *ins)
int tree_depth,
struct ocfs2_em_insert_context *ctxt)
{ {
int ret;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *old_ent;
ctxt->need_left = 0;
ctxt->need_right = 0;
ctxt->old_ent = NULL;
spin_lock(&OCFS2_I(inode)->ip_lock);
ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
if (!ret) {
ctxt->new_ent = NULL;
goto out_unlock;
}
/* Since insert_entry failed, the map MUST have old_ent */
old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
le32_to_cpu(rec->e_clusters),
NULL, NULL);
BUG_ON(!old_ent);
if (old_ent->e_tree_depth < tree_depth) {
/* Another thread beat us to the lower tree_depth */
ret = -EEXIST;
goto out_unlock;
}
if (old_ent->e_tree_depth == tree_depth) {
/* /*
* Another thread beat us to this tree_depth. * Handle contiguousness
* Let's make sure we agree with that thread (the
* extent_rec should be identical).
*/ */
if (!memcmp(rec, &old_ent->e_rec, if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
sizeof(struct ocfs2_extent_rec))) ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
ret = 0; ins->ei_flags == emi->ei_flags) {
else emi->ei_clusters += ins->ei_clusters;
/* FIXME: Should this be ESRCH/EBADR??? */ return 1;
ret = -EEXIST; } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
(ins->ei_cpos + ins->ei_clusters) == emi->ei_phys &&
goto out_unlock; ins->ei_flags == emi->ei_flags) {
emi->ei_phys = ins->ei_phys;
emi->ei_cpos = ins->ei_cpos;
emi->ei_clusters += ins->ei_clusters;
return 1;
} }
/* /*
* We do it in this order specifically so that no actual tree * Overlapping extents - this shouldn't happen unless we've
* changes occur until we have all the pieces we need. We * split an extent to change it's flags. That is exceedingly
* don't want malloc failures to leave an inconsistent tree. * rare, so there's no sense in trying to optimize it yet.
* Whenever we drop the lock, another process could be
* inserting. Also note that, if another process just beat us
* to an insert, we might not need the same pieces we needed
* the first go round. In the end, the pieces we need will
* be used, and the pieces we don't will be freed.
*/ */
ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > if (ocfs2_ei_is_contained(emi, ins) ||
le32_to_cpu(old_ent->e_rec.e_cpos)); ocfs2_ei_is_contained(ins, emi)) {
ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + ocfs2_copy_emi_fields(emi, ins);
le32_to_cpu(old_ent->e_rec.e_clusters)) > return 1;
(le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
ret = -EAGAIN;
if (ctxt->need_left) {
if (!ctxt->left_ent)
goto out_unlock;
*(ctxt->left_ent) = *old_ent;
ctxt->left_ent->e_rec.e_clusters =
cpu_to_le32(le32_to_cpu(rec->e_cpos) -
le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
}
if (ctxt->need_right) {
if (!ctxt->right_ent)
goto out_unlock;
*(ctxt->right_ent) = *old_ent;
ctxt->right_ent->e_rec.e_cpos =
cpu_to_le32(le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters));
ctxt->right_ent->e_rec.e_clusters =
cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
le32_to_cpu(old_ent->e_rec.e_clusters)) -
le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
}
rb_erase(&old_ent->e_node, &em->em_extents);
/* Now that he's erased, set him up for deletion */
ctxt->old_ent = old_ent;
if (ctxt->need_left) {
ret = ocfs2_extent_map_insert_entry(em,
ctxt->left_ent);
if (ret)
goto out_unlock;
ctxt->left_ent = NULL;
}
if (ctxt->need_right) {
ret = ocfs2_extent_map_insert_entry(em,
ctxt->right_ent);
if (ret)
goto out_unlock;
ctxt->right_ent = NULL;
} }
ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); /* No merge was possible. */
return 0;
if (!ret)
ctxt->new_ent = NULL;
out_unlock:
spin_unlock(&OCFS2_I(inode)->ip_lock);
return ret;
} }
/*
static int ocfs2_extent_map_insert(struct inode *inode, * In order to reduce complexity on the caller, this insert function
struct ocfs2_extent_rec *rec, * is intentionally liberal in what it will accept.
int tree_depth) *
* The only rule is that the truncate call *must* be used whenever
* records have been deleted. This avoids inserting overlapping
* records with different physical mappings.
*/
void ocfs2_extent_map_insert_rec(struct inode *inode,
struct ocfs2_extent_rec *rec)
{ {
int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_em_insert_context ctxt = {0, }; struct ocfs2_extent_map *em = &oi->ip_extent_map;
struct ocfs2_extent_map_item *emi, *new_emi = NULL;
struct ocfs2_extent_map_item ins;
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > ins.ei_cpos = le32_to_cpu(rec->e_cpos);
OCFS2_I(inode)->ip_map.em_clusters) { ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
ret = -EBADR; le64_to_cpu(rec->e_blkno));
mlog_errno(ret); ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
return ret; ins.ei_flags = rec->e_flags;
}
/* Zero e_clusters means a truncated tail record. It better be EOF */ search:
if (!rec->e_clusters) { spin_lock(&oi->ip_lock);
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
OCFS2_I(inode)->ip_map.em_clusters) {
ret = -EBADR;
mlog_errno(ret);
ocfs2_error(inode->i_sb,
"Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
(unsigned long long)le64_to_cpu(rec->e_blkno),
(unsigned long long)OCFS2_I(inode)->ip_blkno);
return ret;
}
/* Ignore the truncated tail */ list_for_each_entry(emi, &em->em_list, ei_list) {
return 0; if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
list_move(&emi->ei_list, &em->em_list);
spin_unlock(&oi->ip_lock);
goto out;
} }
ret = -ENOMEM;
ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_NOFS);
if (!ctxt.new_ent) {
mlog_errno(ret);
return ret;
} }
ctxt.new_ent->e_rec = *rec; /*
ctxt.new_ent->e_tree_depth = tree_depth; * No item could be merged.
*
 * Either allocate and add a new item, or overwrite the most recently
* inserted.
*/
do { if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
ret = -ENOMEM; if (new_emi == NULL) {
if (ctxt.need_left && !ctxt.left_ent) { spin_unlock(&oi->ip_lock);
ctxt.left_ent =
kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_NOFS);
if (!ctxt.left_ent)
break;
}
if (ctxt.need_right && !ctxt.right_ent) {
ctxt.right_ent =
kmem_cache_alloc(ocfs2_em_ent_cachep,
GFP_NOFS);
if (!ctxt.right_ent)
break;
}
ret = ocfs2_extent_map_try_insert(inode, rec, new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
tree_depth, &ctxt); if (new_emi == NULL)
} while (ret == -EAGAIN); goto out;
if ((ret < 0) && (ret != -EEXIST)) goto search;
mlog_errno(ret); }
ocfs2_copy_emi_fields(new_emi, &ins);
list_add(&new_emi->ei_list, &em->em_list);
em->em_num_items++;
new_emi = NULL;
} else {
BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
emi = list_entry(em->em_list.prev,
struct ocfs2_extent_map_item, ei_list);
list_move(&emi->ei_list, &em->em_list);
ocfs2_copy_emi_fields(emi, &ins);
}
if (ctxt.left_ent) spin_unlock(&oi->ip_lock);
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
if (ctxt.right_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
if (ctxt.old_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
if (ctxt.new_ent)
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
return ret; out:
if (new_emi)
kfree(new_emi);
} }
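A worked illustration of the policy implemented above, using a hypothetical three-item limit rather than the real OCFS2_MAX_EXTENT_MAP_ITEMS: inserting non-mergeable extents A, B and C leaves em_list ordered {C, B, A}; a lookup that hits A moves it to the front, giving {A, C, B}; inserting a fourth non-mergeable extent D then overwrites the tail entry B (the least recently used item), giving {D, A, C}.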
/* /*
* Append this record to the tail of the extent map. It must be * Return the 1st index within el which contains an extent start
* tree_depth 0. The record might be an extension of an existing * larger than v_cluster.
* record, and as such that needs to be handled. eg:
*
* Existing record in the extent map:
*
* cpos = 10, len = 10
* |---------|
*
* New Record:
*
* cpos = 10, len = 20
* |------------------|
*
* The passed record is the new on-disk record. The new_clusters value
* is how many clusters were added to the file. If the append is a
* contiguous append, the new_clusters has been added to
* rec->e_clusters. If the append is an entirely new extent, then
* rec->e_clusters is == new_clusters.
*/ */
int ocfs2_extent_map_append(struct inode *inode, static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
struct ocfs2_extent_rec *rec, u32 v_cluster)
u32 new_clusters)
{ {
int ret; int i;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; struct ocfs2_extent_rec *rec;
struct ocfs2_extent_map_entry *ent;
struct ocfs2_extent_rec *old;
BUG_ON(!new_clusters);
BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
/* rec = &el->l_recs[i];
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters;
}
mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + if (v_cluster < le32_to_cpu(rec->e_cpos))
le32_to_cpu(rec->e_clusters)) != break;
(em->em_clusters + new_clusters),
"Inode %llu:\n"
"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
"em->em_clusters = %u + new_clusters = %u = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
em->em_clusters, new_clusters,
em->em_clusters + new_clusters);
em->em_clusters += new_clusters;
ret = -ENOENT;
if (le32_to_cpu(rec->e_clusters) > new_clusters) {
/* This is a contiguous append */
ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
NULL, NULL);
if (ent) {
old = &ent->e_rec;
BUG_ON((le32_to_cpu(rec->e_cpos) +
le32_to_cpu(rec->e_clusters)) !=
(le32_to_cpu(old->e_cpos) +
le32_to_cpu(old->e_clusters) +
new_clusters));
if (ent->e_tree_depth == 0) {
BUG_ON(le32_to_cpu(old->e_cpos) !=
le32_to_cpu(rec->e_cpos));
BUG_ON(le64_to_cpu(old->e_blkno) !=
le64_to_cpu(rec->e_blkno));
ret = 0;
}
/*
* Let non-leafs fall through as -ENOENT to
* force insertion of the new leaf.
*/
le32_add_cpu(&old->e_clusters, new_clusters);
}
} }
if (ret == -ENOENT) return i;
ret = ocfs2_extent_map_insert(inode, rec, 0);
if (ret < 0)
mlog_errno(ret);
return ret;
} }
#if 0
/* Code here is included but defined out as it completes the extent
* map api and may be used in the future. */
/* /*
* Look up the record containing this cluster offset. This record is * Figure out the size of a hole which starts at v_cluster within the given
* part of the extent map. Do not free it. Any changes you make to * extent list.
* it will reflect in the extent map. So, if your last extent
* is (cpos = 10, clusters = 10) and you truncate the file by 5
* clusters, you can do:
* *
* ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); * If there is no more allocation past v_cluster, we return the maximum
* rec->e_clusters -= 5; * cluster size minus v_cluster.
* *
* The lookup does not read from disk. If the map isn't filled in for * If we have in-inode extents, then el points to the dinode list and
* an entry, you won't find it. * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
* * containing el.
* Also note that the returned record is valid until alloc_sem is
* dropped. After that, truncate and extend can happen. Caveat Emptor.
*/ */
int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, static int ocfs2_figure_hole_clusters(struct inode *inode,
struct ocfs2_extent_rec **rec, struct ocfs2_extent_list *el,
int *tree_depth) struct buffer_head *eb_bh,
u32 v_cluster,
u32 *num_clusters)
{ {
int ret = -ENOENT; int ret, i;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; struct buffer_head *next_eb_bh = NULL;
struct ocfs2_extent_map_entry *ent; struct ocfs2_extent_block *eb, *next_eb;
*rec = NULL; i = ocfs2_search_for_hole_index(el, v_cluster);
if (cpos >= OCFS2_I(inode)->ip_clusters) if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
return -EINVAL; eb = (struct ocfs2_extent_block *)eb_bh->b_data;
if (cpos >= em->em_clusters) {
/* /*
* Size changed underneath us on disk. Drop any * Check the next leaf for any extents.
* straddling records and update our idea of
* i_clusters
*/ */
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters ;
}
ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
NULL, NULL); goto no_more_extents;
if (ent) { ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
*rec = &ent->e_rec; le64_to_cpu(eb->h_next_leaf_blk),
if (tree_depth) &next_eb_bh, OCFS2_BH_CACHED, inode);
*tree_depth = ent->e_tree_depth; if (ret) {
ret = 0; mlog_errno(ret);
goto out;
} }
next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
return ret; if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
} ret = -EROFS;
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
int ocfs2_extent_map_get_clusters(struct inode *inode, goto out;
u32 v_cpos, int count, }
u32 *p_cpos, int *ret_count)
{
int ret;
u32 coff, ccount;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent = NULL;
*p_cpos = ccount = 0; el = &next_eb->h_list;
if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) i = ocfs2_search_for_hole_index(el, v_cluster);
return -EINVAL; }
if ((v_cpos + count) > em->em_clusters) { no_more_extents:
if (i == le16_to_cpu(el->l_next_free_rec)) {
/* /*
* Size changed underneath us on disk. Drop any * We're at the end of our existing allocation. Just
* straddling records and update our idea of * return the maximum number of clusters we could
* i_clusters * possibly allocate.
*/ */
ocfs2_extent_map_drop(inode, em->em_clusters - 1); *num_clusters = UINT_MAX - v_cluster;
em->em_clusters = OCFS2_I(inode)->ip_clusters; } else {
*num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
} }
ret = 0;
ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); out:
if (ret) brelse(next_eb_bh);
return ret; return ret;
}
if (ent) { /*
/* We should never find ourselves straddling an interval */ * Return the index of the extent record which contains cluster #v_cluster.
if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, * -1 is returned if it was not found.
v_cpos, *
count)) * Should work fine on interior and exterior nodes.
return -ESRCH; */
static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
u32 v_cluster)
{
int ret = -1;
int i;
struct ocfs2_extent_rec *rec;
u32 rec_end, rec_start, clusters;
coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, rec = &el->l_recs[i];
le64_to_cpu(ent->e_rec.e_blkno)) +
coff;
if (ret_count) rec_start = le32_to_cpu(rec->e_cpos);
*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; clusters = ocfs2_rec_clusters(el, rec);
return 0; rec_end = rec_start + clusters;
}
if (v_cluster >= rec_start && v_cluster < rec_end) {
ret = i;
break;
}
}
return -ENOENT; return ret;
} }
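An illustrative sketch of the intended calling pattern for ocfs2_search_extent_list() (not part of this commit; handle_hole() and use_physical_cluster() are hypothetical placeholders for whatever the caller does with the result):

	int idx = ocfs2_search_extent_list(el, v_cluster);

	if (idx == -1) {
		/* No record covers v_cluster -- it falls in a hole. */
		handle_hole(v_cluster);
	} else {
		struct ocfs2_extent_rec *rec = &el->l_recs[idx];
		u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);

		/* Physical cluster backing v_cluster, computed the same
		 * way ocfs2_get_clusters() does below. */
		use_physical_cluster(ocfs2_blocks_to_clusters(inode->i_sb,
				le64_to_cpu(rec->e_blkno)) + coff);
	}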
#endif /* 0 */ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
int ocfs2_extent_map_get_blocks(struct inode *inode, unsigned int *extent_flags)
u64 v_blkno, int count,
u64 *p_blkno, int *ret_count)
{ {
int ret; int ret, i;
u64 boff; unsigned int flags = 0;
u32 cpos, clusters; struct buffer_head *di_bh = NULL;
int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); struct buffer_head *eb_bh = NULL;
struct ocfs2_extent_map_entry *ent = NULL; struct ocfs2_dinode *di;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec; struct ocfs2_extent_rec *rec;
u32 coff;
*p_blkno = 0; ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
num_clusters, extent_flags);
if (ret == 0)
goto out;
cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
clusters = ocfs2_blocks_to_clusters(inode->i_sb, &di_bh, OCFS2_BH_CACHED, inode);
(u64)count + bpc - 1); if (ret) {
if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
ret = -EINVAL;
mlog_errno(ret); mlog_errno(ret);
return ret; goto out;
} }
if ((cpos + clusters) > em->em_clusters) { di = (struct ocfs2_dinode *) di_bh->b_data;
/* el = &di->id2.i_list;
* Size changed underneath us on disk. Drop any
* straddling records and update our idea of
* i_clusters
*/
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
em->em_clusters = OCFS2_I(inode)->ip_clusters;
}
ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); if (el->l_tree_depth) {
ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
if (ret) { if (ret) {
mlog_errno(ret); mlog_errno(ret);
return ret; goto out;
} }
if (ent) eb = (struct ocfs2_extent_block *) eb_bh->b_data;
{ el = &eb->h_list;
rec = &ent->e_rec;
/* We should never find ourselves straddling an interval */ if (el->l_tree_depth) {
if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { ocfs2_error(inode->i_sb,
ret = -ESRCH; "Inode %lu has non zero tree depth in "
mlog_errno(ret); "leaf block %llu\n", inode->i_ino,
return ret; (unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
} }
boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
le32_to_cpu(rec->e_cpos));
boff += (v_blkno & (u64)(bpc - 1));
*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
if (ret_count) {
*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(rec->e_clusters)) - boff;
} }
return 0; i = ocfs2_search_extent_list(el, v_cluster);
if (i == -1) {
/*
* A hole was found. Return some canned values that
* callers can key on. If asked for, num_clusters will
* be populated with the size of the hole.
*/
*p_cluster = 0;
if (num_clusters) {
ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
v_cluster,
num_clusters);
if (ret) {
mlog_errno(ret);
goto out;
} }
}
} else {
rec = &el->l_recs[i];
return -ENOENT; BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
}
int ocfs2_extent_map_init(struct inode *inode)
{
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
em->em_extents = RB_ROOT;
em->em_clusters = 0;
return 0;
}
/* Needs the lock */
static void __ocfs2_extent_map_drop(struct inode *inode,
u32 new_clusters,
struct rb_node **free_head,
struct ocfs2_extent_map_entry **tail_ent)
{
struct rb_node *node, *next;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
struct ocfs2_extent_map_entry *ent;
*free_head = NULL; if (!rec->e_blkno) {
ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
"record (%u, %u, 0)", inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
goto out;
}
ent = NULL; coff = v_cluster - le32_to_cpu(rec->e_cpos);
node = rb_last(&em->em_extents);
while (node)
{
next = rb_prev(node);
ent = rb_entry(node, struct ocfs2_extent_map_entry, *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
e_node); le64_to_cpu(rec->e_blkno));
if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) *p_cluster = *p_cluster + coff;
break;
rb_erase(&ent->e_node, &em->em_extents); if (num_clusters)
*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
node->rb_right = *free_head; flags = rec->e_flags;
*free_head = node;
ent = NULL; ocfs2_extent_map_insert_rec(inode, rec);
node = next;
} }
/* Do we have an entry straddling new_clusters? */ if (extent_flags)
if (tail_ent) { *extent_flags = flags;
if (ent &&
((le32_to_cpu(ent->e_rec.e_cpos) +
le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
*tail_ent = ent;
else
*tail_ent = NULL;
}
}
static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) out:
{ brelse(di_bh);
struct rb_node *node; brelse(eb_bh);
struct ocfs2_extent_map_entry *ent; return ret;
while (free_head) {
node = free_head;
free_head = node->rb_right;
ent = rb_entry(node, struct ocfs2_extent_map_entry,
e_node);
kmem_cache_free(ocfs2_em_ent_cachep, ent);
}
} }
/* /*
* Remove all entries past new_clusters, inclusive of an entry that * This expects alloc_sem to be held. The allocation cannot change at
* contains new_clusters. This is effectively a cache forget. * all while the map is in the process of being updated.
*
* If you want to also clip the last extent by some number of clusters,
* you need to call ocfs2_extent_map_trunc().
* This code does not check or modify ip_clusters.
*/ */
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
u64 *ret_count, unsigned int *extent_flags)
{ {
struct rb_node *free_head = NULL; int ret;
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
struct ocfs2_extent_map_entry *ent; u32 cpos, num_clusters, p_cluster;
u64 boff = 0;
spin_lock(&OCFS2_I(inode)->ip_lock);
__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
if (ent) { ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
rb_erase(&ent->e_node, &em->em_extents); extent_flags);
ent->e_node.rb_right = free_head; if (ret) {
free_head = &ent->e_node; mlog_errno(ret);
goto out;
} }
spin_unlock(&OCFS2_I(inode)->ip_lock); /*
* p_cluster == 0 indicates a hole.
if (free_head)
__ocfs2_extent_map_drop_cleanup(free_head);
return 0;
}
/*
* Remove all entries past new_clusters and also clip any extent
* straddling new_clusters, if there is one. This does not check
* or modify ip_clusters
*/ */
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) if (p_cluster) {
{ boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
struct rb_node *free_head = NULL; boff += (v_blkno & (u64)(bpc - 1));
struct ocfs2_extent_map_entry *ent = NULL; }
spin_lock(&OCFS2_I(inode)->ip_lock);
__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
if (ent)
ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
le32_to_cpu(ent->e_rec.e_cpos));
OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
spin_unlock(&OCFS2_I(inode)->ip_lock);
if (free_head)
__ocfs2_extent_map_drop_cleanup(free_head);
return 0;
}
int __init init_ocfs2_extent_maps(void) *p_blkno = boff;
{
ocfs2_em_ent_cachep =
kmem_cache_create("ocfs2_em_ent",
sizeof(struct ocfs2_extent_map_entry),
0, SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!ocfs2_em_ent_cachep)
return -ENOMEM;
return 0; if (ret_count) {
} *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
*ret_count -= v_blkno & (u64)(bpc - 1);
}
void exit_ocfs2_extent_maps(void) out:
{ return ret;
kmem_cache_destroy(ocfs2_em_ent_cachep);
} }
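A worked example of the block/cluster arithmetic in the new ocfs2_extent_map_get_blocks() above; the geometry is assumed for illustration only (4KB clusters, 512-byte blocks, so bpc = 8):

/*
 *   v_blkno = 21
 *   cpos    = 21 / 8 = 2		(logical cluster holding the block)
 *   offset  = 21 & (8 - 1) = 5		(block offset inside that cluster)
 *
 * If ocfs2_get_clusters() maps logical cluster 2 to physical cluster
 * 100, then p_blkno = 100 * 8 + 5 = 805.  If it reports a hole
 * (p_cluster == 0), boff stays 0 and so does *p_blkno.
 */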
...@@ -25,22 +25,29 @@

#ifndef _EXTENT_MAP_H
#define _EXTENT_MAP_H

int init_ocfs2_extent_maps(void);
void exit_ocfs2_extent_maps(void);

/*
 * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
 * to be held.  The allocation cannot change at all while the map is
 * in the process of being updated.
 */
int ocfs2_extent_map_init(struct inode *inode);
int ocfs2_extent_map_append(struct inode *inode,
			    struct ocfs2_extent_rec *rec,
			    u32 new_clusters);
int ocfs2_extent_map_get_blocks(struct inode *inode,
				u64 v_blkno, int count,
				u64 *p_blkno, int *ret_count);
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);

struct ocfs2_extent_map_item {
	unsigned int			ei_cpos;
	unsigned int			ei_phys;
	unsigned int			ei_clusters;
	unsigned int			ei_flags;

	struct list_head		ei_list;
};

#define OCFS2_MAX_EXTENT_MAP_ITEMS	3
struct ocfs2_extent_map {
	unsigned int			em_num_items;
	struct list_head		em_list;
};

void ocfs2_extent_map_init(struct inode *inode);
void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
void ocfs2_extent_map_insert_rec(struct inode *inode,
				 struct ocfs2_extent_rec *rec);

int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
		       u32 *num_clusters, unsigned int *extent_flags);
int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
				u64 *ret_count, unsigned int *extent_flags);

#endif /* _EXTENT_MAP_H */
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/pipe_fs_i.h> #include <linux/pipe_fs_i.h>
#include <linux/mount.h> #include <linux/mount.h>
#include <linux/writeback.h>
#define MLOG_MASK_PREFIX ML_INODE #define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h> #include <cluster/masklog.h>
...@@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle, ...@@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle,
mlog_entry_void(); mlog_entry_void();
i_size_write(inode, new_i_size); i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_ctime = inode->i_mtime = CURRENT_TIME; inode->i_ctime = inode->i_mtime = CURRENT_TIME;
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
...@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, ...@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
{ {
int status; int status;
handle_t *handle; handle_t *handle;
struct ocfs2_dinode *di;
mlog_entry_void(); mlog_entry_void();
...@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, ...@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
goto out; goto out;
} }
status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); status = ocfs2_journal_access(handle, inode, fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_commit;
}
/*
* Do this before setting i_size.
*/
status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
if (status) {
mlog_errno(status);
goto out_commit;
}
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
di = (struct ocfs2_dinode *) fe_bh->b_data;
di->i_size = cpu_to_le64(new_i_size);
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
status = ocfs2_journal_dirty(handle, fe_bh);
if (status < 0) if (status < 0)
mlog_errno(status); mlog_errno(status);
out_commit:
ocfs2_commit_trans(osb, handle); ocfs2_commit_trans(osb, handle);
out: out:
mlog_exit(status); mlog_exit(status);
return status; return status;
} }
...@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode, ...@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
} }
ocfs2_data_unlock(inode, 1);
if (le32_to_cpu(fe->i_clusters) ==
ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
fe->i_clusters);
/* No allocation change is required, so lets fast path
* this truncate. */
status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
if (status < 0)
mlog_errno(status);
goto bail;
}
/* alright, we're going to need to do a full blown alloc size /* alright, we're going to need to do a full blown alloc size
* change. Orphan the inode so that recovery can complete the * change. Orphan the inode so that recovery can complete the
...@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode, ...@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail_unlock_data;
} }
status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail_unlock_data;
} }
status = ocfs2_commit_truncate(osb, inode, di_bh, tc); status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail_unlock_data;
} }
/* TODO: orphan dir cleanup here. */ /* TODO: orphan dir cleanup here. */
bail_unlock_data:
ocfs2_data_unlock(inode, 1);
bail: bail:
mlog_exit(status); mlog_exit(status);
...@@ -397,6 +416,7 @@ static int ocfs2_truncate_file(struct inode *inode, ...@@ -397,6 +416,7 @@ static int ocfs2_truncate_file(struct inode *inode,
*/ */
int ocfs2_do_extend_allocation(struct ocfs2_super *osb, int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode, struct inode *inode,
u32 *logical_offset,
u32 clusters_to_add, u32 clusters_to_add,
struct buffer_head *fe_bh, struct buffer_head *fe_bh,
handle_t *handle, handle_t *handle,
...@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, ...@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
block = ocfs2_clusters_to_blocks(osb->sb, bit_off); block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
mlog(0, "Allocating %u clusters at block %u for inode %llu\n", mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
num_bits, meta_ac); *logical_offset, block, num_bits,
meta_ac);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto leave; goto leave;
} }
le32_add_cpu(&fe->i_clusters, num_bits);
spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
spin_unlock(&OCFS2_I(inode)->ip_lock);
status = ocfs2_journal_dirty(handle, fe_bh); status = ocfs2_journal_dirty(handle, fe_bh);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
...@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, ...@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
} }
clusters_to_add -= num_bits; clusters_to_add -= num_bits;
*logical_offset += num_bits;
if (clusters_to_add) { if (clusters_to_add) {
mlog(0, "need to alloc once more, clusters = %u, wanted = " mlog(0, "need to alloc once more, clusters = %u, wanted = "
...@@ -494,14 +511,87 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, ...@@ -494,14 +511,87 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
return status; return status;
} }
/*
* For a given allocation, determine which allocators will need to be
* accessed, and lock them, reserving the appropriate number of bits.
*
* Called from ocfs2_extend_allocation() for file systems which don't
* support holes, and from ocfs2_write() for file systems which
* understand sparse inodes.
*/
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
u32 clusters_to_add,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac)
{
int ret, num_free_extents;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
*meta_ac = NULL;
*data_ac = NULL;
mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
"clusters_to_add = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
le32_to_cpu(di->i_clusters), clusters_to_add);
num_free_extents = ocfs2_num_free_extents(osb, inode, di);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
goto out;
}
/*
* Sparse allocation file systems need to be more conservative
* with reserving room for expansion - the actual allocation
* happens while we've got a journal handle open so re-taking
* a cluster lock (because we ran out of room for another
* extent) will violate ordering rules.
*
* Most of the time we'll only be seeing this 1 cluster at a time
* anyway.
*/
if (!num_free_extents ||
(ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
}
ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
out:
if (ret) {
if (*meta_ac) {
ocfs2_free_alloc_context(*meta_ac);
*meta_ac = NULL;
}
/*
* We cannot have an error and a non null *data_ac.
*/
}
return ret;
}
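For readers unfamiliar with the allocator API, a minimal, hypothetical sketch of the reserve/extend sequence ocfs2_lock_allocators() is meant to sit in (error handling abbreviated; di, di_bh, clusters_to_add, logical_start and why stand in for the caller's state, and the i_mutex/ip_alloc_sem rules described in ocfs2_extend_allocation() below still apply):

	struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
	enum ocfs2_alloc_restarted why;
	handle_t *handle;
	int ret;

	/* Reserve data clusters and, if the extent list is full,
	 * metadata blocks for new tree nodes. */
	ret = ocfs2_lock_allocators(inode, di, clusters_to_add,
				    &data_ac, &meta_ac);
	if (ret)
		goto out;

	handle = ocfs2_start_trans(osb,
			ocfs2_calc_extend_credits(osb->sb, di,
						  clusters_to_add));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	/* The allocation itself happens under the journal handle. */
	ret = ocfs2_do_extend_allocation(osb, inode, &logical_start,
					 clusters_to_add, di_bh, handle,
					 data_ac, meta_ac, &why);

	ocfs2_commit_trans(osb, handle);
out:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);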
static int ocfs2_extend_allocation(struct inode *inode, static int ocfs2_extend_allocation(struct inode *inode,
u32 clusters_to_add) u32 clusters_to_add)
{ {
int status = 0; int status = 0;
int restart_func = 0; int restart_func = 0;
int drop_alloc_sem = 0; int drop_alloc_sem = 0;
int credits, num_free_extents; int credits;
u32 prev_clusters; u32 prev_clusters, logical_start;
struct buffer_head *bh = NULL; struct buffer_head *bh = NULL;
struct ocfs2_dinode *fe = NULL; struct ocfs2_dinode *fe = NULL;
handle_t *handle = NULL; handle_t *handle = NULL;
...@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode, ...@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,
mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
/*
* This function only exists for file systems which don't
* support holes.
*/
BUG_ON(ocfs2_sparse_alloc(osb));
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
OCFS2_BH_CACHED, inode); OCFS2_BH_CACHED, inode);
if (status < 0) { if (status < 0) {
...@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode, ...@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
goto leave; goto leave;
} }
logical_start = OCFS2_I(inode)->ip_clusters;
restart_all: restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
"clusters_to_add = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
fe->i_clusters, clusters_to_add);
num_free_extents = ocfs2_num_free_extents(osb,
inode,
fe);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
goto leave;
}
if (!num_free_extents) {
status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto leave;
}
}
status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto leave;
}
	/* blocks people in read/write from reading our allocation
	 * until we're done changing it. We depend on i_mutex to block
	 * other extend/truncate calls while we're here. Ordering wrt
...@@ -566,6 +634,13 @@ static int ocfs2_extend_allocation(struct inode *inode, ...@@ -566,6 +634,13 @@ static int ocfs2_extend_allocation(struct inode *inode,
down_write(&OCFS2_I(inode)->ip_alloc_sem); down_write(&OCFS2_I(inode)->ip_alloc_sem);
drop_alloc_sem = 1; drop_alloc_sem = 1;
status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
&meta_ac);
if (status) {
mlog_errno(status);
goto leave;
}
credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
handle = ocfs2_start_trans(osb, credits); handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) { if (IS_ERR(handle)) {
...@@ -590,6 +665,7 @@ static int ocfs2_extend_allocation(struct inode *inode, ...@@ -590,6 +665,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
status = ocfs2_do_extend_allocation(osb, status = ocfs2_do_extend_allocation(osb,
inode, inode,
&logical_start,
clusters_to_add, clusters_to_add,
bh, bh,
handle, handle,
...@@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode, ...@@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode,
size_t tail_to_skip) size_t tail_to_skip)
{ {
int ret = 0; int ret = 0;
u32 clusters_to_add; u32 clusters_to_add = 0;
BUG_ON(!tail_to_skip && !di_bh); BUG_ON(!tail_to_skip && !di_bh);
...@@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode, ...@@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode,
goto out; goto out;
BUG_ON(new_i_size < i_size_read(inode)); BUG_ON(new_i_size < i_size_read(inode));
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
BUG_ON(tail_to_skip != 0);
goto out_update_size;
}
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
OCFS2_I(inode)->ip_clusters; OCFS2_I(inode)->ip_clusters;
...@@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode, ...@@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode,
goto out_unlock; goto out_unlock;
} }
out_update_size:
if (!tail_to_skip) { if (!tail_to_skip) {
/* We're being called from ocfs2_setattr() which wants /* We're being called from ocfs2_setattr() which wants
* us to update i_size */ * us to update i_size */
...@@ -834,6 +916,7 @@ static int ocfs2_extend_file(struct inode *inode, ...@@ -834,6 +916,7 @@ static int ocfs2_extend_file(struct inode *inode,
} }
out_unlock: out_unlock:
if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
ocfs2_data_unlock(inode, 1); ocfs2_data_unlock(inode, 1);
out: out:
...@@ -972,6 +1055,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) ...@@ -972,6 +1055,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
ret = ocfs2_meta_lock(inode, NULL, 0); ret = ocfs2_meta_lock(inode, NULL, 0);
if (ret) { if (ret) {
if (ret != -ENOENT)
mlog_errno(ret); mlog_errno(ret);
goto out; goto out;
} }
...@@ -1035,10 +1119,49 @@ static int ocfs2_write_remove_suid(struct inode *inode) ...@@ -1035,10 +1119,49 @@ static int ocfs2_write_remove_suid(struct inode *inode)
return ret; return ret;
} }
/*
* Will look for holes and unwritten extents in the range starting at
* pos for count bytes (inclusive).
*/
static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
size_t count)
{
int ret = 0;
unsigned int extent_flags;
u32 cpos, clusters, extent_len, phys_cpos;
struct super_block *sb = inode->i_sb;
cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
while (clusters) {
ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
&extent_flags);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
ret = 1;
break;
}
if (extent_len > clusters)
extent_len = clusters;
clusters -= extent_len;
cpos += extent_len;
}
out:
return ret;
}
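A short worked example of the range-to-cluster conversion above, assuming a 4KB cluster size (the numbers are only illustrative):

/*
 *   pos = 6000, count = 3000, clustersize = 4096
 *
 *   cpos     = 6000 >> 12 = 1
 *   clusters = ocfs2_clusters_for_bytes(sb, 9000) - 1 = 3 - 1 = 2
 *
 * so clusters 1 and 2 are checked; the function returns 1 as soon as
 * it sees a hole (phys_cpos == 0) or an unwritten extent, and 0 when
 * the whole range is allocated and written.
 */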
static int ocfs2_prepare_inode_for_write(struct dentry *dentry, static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
loff_t *ppos, loff_t *ppos,
size_t count, size_t count,
int appending) int appending,
int *direct_io)
{ {
int ret = 0, meta_level = appending; int ret = 0, meta_level = appending;
struct inode *inode = dentry->d_inode; struct inode *inode = dentry->d_inode;
...@@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, ...@@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
} else { } else {
saved_pos = *ppos; saved_pos = *ppos;
} }
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
loff_t end = saved_pos + count;
/*
* Skip the O_DIRECT checks if we don't need
* them.
*/
if (!direct_io || !(*direct_io))
break;
/*
* Allowing concurrent direct writes means
* i_size changes wouldn't be synchronized, so
* one node could wind up truncating another
			 * node's writes.
*/
if (end > i_size_read(inode)) {
*direct_io = 0;
break;
}
/*
* We don't fill holes during direct io, so
* check for them here. If any are found, the
* caller will have to retake some cluster
* locks and initiate the io as buffered.
*/
ret = ocfs2_check_range_for_holes(inode, saved_pos,
count);
if (ret == 1) {
*direct_io = 0;
ret = 0;
} else if (ret < 0)
mlog_errno(ret);
break;
}
/*
* The rest of this loop is concerned with legacy file
* systems which don't support sparse files.
*/
newsize = count + saved_pos; newsize = count + saved_pos;
mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
...@@ -1141,55 +1307,264 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, ...@@ -1141,55 +1307,264 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
return ret; return ret;
} }
static inline void
ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
const struct iovec *iov = *iovp;
size_t base = *basep;
do {
int copy = min(bytes, iov->iov_len - base);
bytes -= copy;
base += copy;
if (iov->iov_len == base) {
iov++;
base = 0;
}
} while (bytes);
*iovp = iov;
*basep = base;
}
static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
const struct iovec *cur_iov,
size_t iov_offset)
{
int ret;
char *buf;
struct page *src_page = NULL;
buf = cur_iov->iov_base + iov_offset;
if (!segment_eq(get_fs(), KERNEL_DS)) {
/*
* Pull in the user page. We want to do this outside
* of the meta data locks in order to preserve locking
* order in case of page fault.
*/
ret = get_user_pages(current, current->mm,
(unsigned long)buf & PAGE_CACHE_MASK, 1,
0, 0, &src_page, NULL);
if (ret == 1)
bp->b_src_buf = kmap(src_page);
else
src_page = ERR_PTR(-EFAULT);
} else {
bp->b_src_buf = buf;
}
return src_page;
}
static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
struct page *page)
{
if (page) {
kunmap(page);
page_cache_release(page);
}
}
static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
const struct iovec *iov,
unsigned long nr_segs,
size_t count,
ssize_t o_direct_written)
{
int ret = 0;
ssize_t copied, total = 0;
size_t iov_offset = 0;
const struct iovec *cur_iov = iov;
struct ocfs2_buffered_write_priv bp;
struct page *page;
/*
* handle partial DIO write. Adjust cur_iov if needed.
*/
ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
do {
bp.b_cur_off = iov_offset;
bp.b_cur_iov = cur_iov;
page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
}
copied = ocfs2_buffered_write_cluster(file, *ppos, count,
ocfs2_map_and_write_user_data,
&bp);
ocfs2_put_write_source(&bp, page);
if (copied < 0) {
mlog_errno(copied);
ret = copied;
goto out;
}
total += copied;
*ppos = *ppos + copied;
count -= copied;
ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
} while(count);
out:
return total ? total : ret;
}
static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
unsigned long *nr_segs)
{
size_t ocount; /* original count */
unsigned long seg;
ocount = 0;
for (seg = 0; seg < *nr_segs; seg++) {
const struct iovec *iv = &iov[seg];
/*
* If any segment has a negative length, or the cumulative
* length ever wraps negative then return -EINVAL.
*/
ocount += iv->iov_len;
if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
return -EINVAL;
if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
continue;
if (seg == 0)
return -EFAULT;
*nr_segs = seg;
ocount -= iv->iov_len; /* This segment is no good */
break;
}
*counted = ocount;
return 0;
}
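To make the bookkeeping concrete, a hypothetical example of what ocfs2_check_iovec() reports for a two-segment write:

/*
 *   iov[0] = { .iov_base = buf0, .iov_len = 10 }
 *   iov[1] = { .iov_base = buf1, .iov_len = 20 }
 *
 * With both buffers readable, *counted becomes 30 and *nr_segs stays 2.
 * If access_ok() fails on iov[1], *nr_segs is trimmed to 1 and *counted
 * to 10; a failure on iov[0] returns -EFAULT instead.
 */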
static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
const struct iovec *iov, const struct iovec *iov,
unsigned long nr_segs, unsigned long nr_segs,
loff_t pos) loff_t pos)
{ {
int ret, rw_level, have_alloc_sem = 0; int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
struct file *filp = iocb->ki_filp; int can_do_direct, sync = 0;
struct inode *inode = filp->f_path.dentry->d_inode; ssize_t written = 0;
int appending = filp->f_flags & O_APPEND ? 1 : 0; size_t ocount; /* original count */
size_t count; /* after file limit checks */
mlog_entry("(0x%p, %u, '%.*s')\n", filp, loff_t *ppos = &iocb->ki_pos;
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_path.dentry->d_inode;
mlog_entry("(0x%p, %u, '%.*s')\n", file,
(unsigned int)nr_segs, (unsigned int)nr_segs,
filp->f_path.dentry->d_name.len, file->f_path.dentry->d_name.len,
filp->f_path.dentry->d_name.name); file->f_path.dentry->d_name.name);
/* happy write of zero bytes */
if (iocb->ki_left == 0) if (iocb->ki_left == 0)
return 0; return 0;
ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
if (ret)
return ret;
count = ocount;
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
appending = file->f_flags & O_APPEND ? 1 : 0;
direct_io = file->f_flags & O_DIRECT ? 1 : 0;
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
relock:
/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
if (filp->f_flags & O_DIRECT) { if (direct_io) {
have_alloc_sem = 1;
down_read(&inode->i_alloc_sem); down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
} }
/* concurrent O_DIRECT writes are allowed */ /* concurrent O_DIRECT writes are allowed */
rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; rw_level = !direct_io;
ret = ocfs2_rw_lock(inode, rw_level); ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) { if (ret < 0) {
rw_level = -1;
mlog_errno(ret); mlog_errno(ret);
goto out; goto out_sems;
} }
ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, can_do_direct = direct_io;
iocb->ki_left, appending); ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
iocb->ki_left, appending,
&can_do_direct);
if (ret < 0) { if (ret < 0) {
mlog_errno(ret); mlog_errno(ret);
goto out; goto out;
} }
/*
* We can't complete the direct I/O as requested, fall back to
* buffered I/O.
*/
if (direct_io && !can_do_direct) {
ocfs2_rw_unlock(inode, rw_level);
up_read(&inode->i_alloc_sem);
have_alloc_sem = 0;
rw_level = -1;
direct_io = 0;
sync = 1;
goto relock;
}
if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
sync = 1;
/*
* XXX: Is it ok to execute these checks a second time?
*/
ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
if (ret)
goto out;
/*
* Set pos so that sync_page_range_nolock() below understands
* where to start from. We might've moved it around via the
* calls above. The range we want to actually sync starts from
* *ppos here.
*
*/
pos = *ppos;
/* communicate with ocfs2_dio_end_io */ /* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb); ocfs2_iocb_set_rw_locked(iocb, rw_level);
ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); if (direct_io) {
written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
ppos, count, ocount);
if (written < 0) {
ret = written;
goto out_dio;
}
} else {
written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
count, written);
if (written < 0) {
ret = written;
			if (ret != -EFAULT && ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
}
out_dio:
/* buffered aio wouldn't have proper lock coverage today */ /* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
/* /*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
...@@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, ...@@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
} }
out: out:
if (have_alloc_sem)
up_read(&inode->i_alloc_sem);
if (rw_level != -1) if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level); ocfs2_rw_unlock(inode, rw_level);
out_sems:
if (have_alloc_sem)
up_read(&inode->i_alloc_sem);
if (written > 0 && sync) {
ssize_t err;
err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
if (err < 0)
written = err;
}
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
mlog_exit(ret); mlog_exit(ret);
return written ? written : ret;
}
static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
struct pipe_buffer *buf,
struct splice_desc *sd)
{
int ret, count, total = 0;
ssize_t copied = 0;
struct ocfs2_splice_write_priv sp;
ret = buf->ops->pin(pipe, buf);
if (ret)
goto out;
sp.s_sd = sd;
sp.s_buf = buf;
sp.s_pipe = pipe;
sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
sp.s_buf_offset = buf->offset;
count = sd->len;
if (count + sp.s_offset > PAGE_CACHE_SIZE)
count = PAGE_CACHE_SIZE - sp.s_offset;
do {
/*
* splice wants us to copy up to one page at a
* time. For pagesize > cluster size, this means we
* might enter ocfs2_buffered_write_cluster() more
* than once, so keep track of our progress here.
*/
copied = ocfs2_buffered_write_cluster(sd->file,
(loff_t)sd->pos + total,
count,
ocfs2_map_and_write_splice_data,
&sp);
if (copied < 0) {
mlog_errno(copied);
ret = copied;
goto out;
}
count -= copied;
sp.s_offset += copied;
sp.s_buf_offset += copied;
total += copied;
} while (count);
ret = 0;
out:
return total ? total : ret;
}
static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
struct file *out,
loff_t *ppos,
size_t len,
unsigned int flags)
{
int ret, err;
struct address_space *mapping = out->f_mapping;
struct inode *inode = mapping->host;
ret = __splice_from_pipe(pipe, out, ppos, len, flags,
ocfs2_splice_write_actor);
if (ret > 0) {
*ppos += ret;
if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
err = generic_osync_inode(inode, mapping,
OSYNC_METADATA|OSYNC_DATA);
if (err)
ret = err;
}
}
return ret; return ret;
} }
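A quick illustration of the one-page-at-a-time behaviour the actor's comment describes, under an assumed geometry:

/*
 * With 64KB pages and 4KB clusters, a single sd->len of one page can
 * take up to 16 passes through ocfs2_buffered_write_cluster(); the
 * s_offset, s_buf_offset and total counters track progress across
 * those passes.
 */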
...@@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, ...@@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
goto out; goto out;
} }
ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
NULL);
if (ret < 0) { if (ret < 0) {
mlog_errno(ret); mlog_errno(ret);
goto out_unlock; goto out_unlock;
} }
/* ok, we're done with i_size and alloc work */ /* ok, we're done with i_size and alloc work */
ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
out_unlock: out_unlock:
ocfs2_rw_unlock(inode, 1); ocfs2_rw_unlock(inode, 1);
...@@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, ...@@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
} }
rw_level = 0; rw_level = 0;
/* communicate with ocfs2_dio_end_io */ /* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb); ocfs2_iocb_set_rw_locked(iocb, rw_level);
} }
/* /*
......
...@@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted { ...@@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted {
}; };
int ocfs2_do_extend_allocation(struct ocfs2_super *osb, int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode, struct inode *inode,
u32 *cluster_start,
u32 clusters_to_add, u32 clusters_to_add,
struct buffer_head *fe_bh, struct buffer_head *fe_bh,
handle_t *handle, handle_t *handle,
struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason); enum ocfs2_alloc_restarted *reason);
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
u32 clusters_to_add,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat); struct kstat *stat);
......
...@@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode) ...@@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DIRSYNC; inode->i_flags |= S_DIRSYNC;
} }
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
u64 blkno,
int delete_vote)
{
struct ocfs2_find_inode_args args;
/* ocfs2_ilookup_for_vote should *only* be called from the
* vote thread */
BUG_ON(current != osb->vote_task);
args.fi_blkno = blkno;
args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
if (delete_vote)
args.fi_flags |= OCFS2_FI_FLAG_DELETE;
args.fi_ino = ino_from_blkno(osb->sb, blkno);
return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
}
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
{ {
struct inode *inode = NULL; struct inode *inode = NULL;
...@@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque) ...@@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
if (oi->ip_blkno != args->fi_blkno) if (oi->ip_blkno != args->fi_blkno)
goto bail; goto bail;
/* OCFS2_FI_FLAG_NOWAIT is *only* set from
* ocfs2_ilookup_for_vote which won't create an inode for one
* that isn't found. The vote thread which doesn't want to get
* an inode which is in the process of going away - otherwise
* the call to __wait_on_freeing_inode in find_inode_fast will
* cause it to deadlock on an inode which may be waiting on a
* vote (or lock release) in delete_inode */
if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
(inode->i_state & (I_FREEING|I_CLEAR))) {
/* As stated above, we're not going to return an
* inode. In the case of a delete vote, the voting
* code is going to signal the other node to go
* ahead. Mark that state here, so this freeing inode
* has the state when it gets to delete_inode. */
if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
spin_lock(&oi->ip_lock);
ocfs2_mark_inode_remotely_deleted(inode);
spin_unlock(&oi->ip_lock);
}
goto bail;
}
ret = 1; ret = 1;
bail: bail:
mlog_exit(ret); mlog_exit(ret);
...@@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ...@@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
goto bail; goto bail;
} }
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
inode->i_version = 1; inode->i_version = 1;
inode->i_generation = le32_to_cpu(fe->i_generation); inode->i_generation = le32_to_cpu(fe->i_generation);
inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
...@@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ...@@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
if (S_ISLNK(inode->i_mode) && !fe->i_clusters) if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
inode->i_blocks = 0; inode->i_blocks = 0;
else else
inode->i_blocks = inode->i_blocks = ocfs2_inode_sector_count(inode);
ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
inode->i_mapping->a_ops = &ocfs2_aops; inode->i_mapping->a_ops = &ocfs2_aops;
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
...@@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ...@@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
(unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)fe->i_blkno); (unsigned long long)fe->i_blkno);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
inode->i_nlink = le16_to_cpu(fe->i_links_count); inode->i_nlink = le16_to_cpu(fe->i_links_count);
if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
...@@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ...@@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
OCFS2_LOCK_TYPE_META, 0, inode); OCFS2_LOCK_TYPE_META, 0, inode);
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
OCFS2_LOCK_TYPE_OPEN, 0, inode);
} }
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
...@@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, ...@@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
* cluster lock before trusting anything anyway. * cluster lock before trusting anything anyway.
*/ */
can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
&& !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
&& !ocfs2_mount_local(osb); && !ocfs2_mount_local(osb);
/* /*
...@@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode, ...@@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_LOCK_TYPE_META, OCFS2_LOCK_TYPE_META,
generation, inode); generation, inode);
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
OCFS2_LOCK_TYPE_OPEN,
0, inode);
if (can_lock) { if (can_lock) {
status = ocfs2_open_lock(inode);
if (status) {
make_bad_inode(inode);
mlog_errno(status);
return status;
}
status = ocfs2_meta_lock(inode, NULL, 0); status = ocfs2_meta_lock(inode, NULL, 0);
if (status) { if (status) {
make_bad_inode(inode); make_bad_inode(inode);
...@@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode, ...@@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
} }
} }
if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
status = ocfs2_try_open_lock(inode, 0);
if (status) {
make_bad_inode(inode);
return status;
}
}
status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
can_lock ? inode : NULL); can_lock ? inode : NULL);
if (status < 0) { if (status < 0) {
...@@ -507,30 +486,35 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, ...@@ -507,30 +486,35 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct buffer_head *fe_bh) struct buffer_head *fe_bh)
{ {
int status = 0; int status = 0;
handle_t *handle = NULL;
struct ocfs2_truncate_context *tc = NULL; struct ocfs2_truncate_context *tc = NULL;
struct ocfs2_dinode *fe; struct ocfs2_dinode *fe;
handle_t *handle = NULL;
mlog_entry_void(); mlog_entry_void();
fe = (struct ocfs2_dinode *) fe_bh->b_data; fe = (struct ocfs2_dinode *) fe_bh->b_data;
/* zero allocation, zero truncate :) */ if (fe->i_clusters) {
if (!fe->i_clusters)
goto bail;
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) { if (IS_ERR(handle)) {
status = PTR_ERR(handle); status = PTR_ERR(handle);
handle = NULL;
mlog_errno(status); mlog_errno(status);
goto bail; goto out;
} }
status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); status = ocfs2_journal_access(handle, inode, fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto out;
}
i_size_write(inode, 0);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
mlog_errno(status);
goto out;
} }
ocfs2_commit_trans(osb, handle); ocfs2_commit_trans(osb, handle);
...@@ -539,18 +523,19 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, ...@@ -539,18 +523,19 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto out;
} }
status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto out;
} }
bail: }
out:
if (handle) if (handle)
ocfs2_commit_trans(osb, handle); ocfs2_commit_trans(osb, handle);
mlog_exit(status); mlog_exit(status);
return status; return status;
} }
...@@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode, ...@@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
struct inode *orphan_dir_inode = NULL; struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL; struct buffer_head *orphan_dir_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di;
/* We've already voted on this so it should be readonly - no di = (struct ocfs2_dinode *) di_bh->b_data;
* spinlock needed. */ orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
if (status) if (status)
...@@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode, ...@@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail; goto bail;
} }
	status = ocfs2_request_delete_vote(inode);
	/* -EBUSY means that other nodes are still using the
	 * inode. We're done here though, so avoid doing anything on
	 * disk and let them worry about deleting it. */
	if (status == -EBUSY) {

	/*
	 * This is how ocfs2 determines whether an inode is still live
	 * within the cluster. Every node takes a shared read lock on
	 * the inode open lock in ocfs2_read_locked_inode(). When we
	 * get to ->delete_inode(), each node tries to convert its
	 * lock to an exclusive. Trylocks are serialized by the inode
	 * meta data lock. If the upconvert succeeds, we know the inode
	 * is no longer live and can be deleted.
	 *
	 * Though we call this with the meta data lock held, the
	 * trylock keeps us from ABBA deadlock.
	 */
	status = ocfs2_try_open_lock(inode, 1);
	if (status == -EAGAIN) {
status = 0; status = 0;
mlog(0, "Skipping delete of %llu because it is in use on" mlog(0, "Skipping delete of %llu because it is in use on"
"other nodes\n", (unsigned long long)oi->ip_blkno); "other nodes\n", (unsigned long long)oi->ip_blkno);
...@@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode, ...@@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail; goto bail;
} }
spin_lock(&oi->ip_lock);
if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
/* Nobody knew which slot this inode was orphaned
* into. This may happen during node death and
* recovery knows how to clean it up so we can safely
* ignore this inode for now on. */
mlog(0, "Nobody knew where inode %llu was orphaned!\n",
(unsigned long long)oi->ip_blkno);
} else {
*wipe = 1; *wipe = 1;
mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n", (unsigned long long)oi->ip_blkno,
(unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot); le16_to_cpu(di->i_orphaned_slot));
}
spin_unlock(&oi->ip_lock);
bail: bail:
return status; return status;
...@@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode) ...@@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino); "Inode=%lu\n", inode->i_ino);
/* For remove delete_inode vote, we hold open lock before,
* now it is time to unlock PR and EX open locks. */
ocfs2_open_unlock(inode);
/* Do these before all the other work so that we don't bounce /* Do these before all the other work so that we don't bounce
* the vote thread while waiting to destroy the locks. */ * the vote thread while waiting to destroy the locks. */
ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
/* We very well may get a clear_inode before all an inodes /* We very well may get a clear_inode before all an inodes
* metadata has hit disk. Of course, we can't drop any cluster * metadata has hit disk. Of course, we can't drop any cluster
...@@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode) ...@@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode)
"Clear inode of %llu, inode has io markers\n", "Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno); (unsigned long long)oi->ip_blkno);
ocfs2_extent_map_drop(inode, 0); ocfs2_extent_map_trunc(inode, 0);
ocfs2_extent_map_init(inode);
status = ocfs2_drop_inode_locks(inode); status = ocfs2_drop_inode_locks(inode);
if (status < 0) if (status < 0)
...@@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode) ...@@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode)
ocfs2_lock_res_free(&oi->ip_rw_lockres); ocfs2_lock_res_free(&oi->ip_rw_lockres);
ocfs2_lock_res_free(&oi->ip_meta_lockres); ocfs2_lock_res_free(&oi->ip_meta_lockres);
ocfs2_lock_res_free(&oi->ip_data_lockres); ocfs2_lock_res_free(&oi->ip_data_lockres);
ocfs2_lock_res_free(&oi->ip_open_lockres);
ocfs2_metadata_cache_purge(inode); ocfs2_metadata_cache_purge(inode);
...@@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode) ...@@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode)
mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
(unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
/* Testing ip_orphaned_slot here wouldn't work because we may
* not have gotten a delete_inode vote from any other nodes
* yet. */
if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
generic_delete_inode(inode); generic_delete_inode(inode);
else else
...@@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode, ...@@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
return NULL; return NULL;
} }
tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
&p_blkno, NULL); NULL);
if (tmperr < 0) { if (tmperr < 0) {
mlog_errno(tmperr); mlog_errno(tmperr);
goto fail; goto fail;
...@@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode, ...@@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode,
if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
inode->i_blocks = 0; inode->i_blocks = 0;
else else
inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
......
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
#ifndef OCFS2_INODE_H #ifndef OCFS2_INODE_H
#define OCFS2_INODE_H #define OCFS2_INODE_H
#include "extent_map.h"
/* OCFS2 Inode Private Data */ /* OCFS2 Inode Private Data */
struct ocfs2_inode_info struct ocfs2_inode_info
{ {
...@@ -34,6 +36,7 @@ struct ocfs2_inode_info ...@@ -34,6 +36,7 @@ struct ocfs2_inode_info
struct ocfs2_lock_res ip_rw_lockres; struct ocfs2_lock_res ip_rw_lockres;
struct ocfs2_lock_res ip_meta_lockres; struct ocfs2_lock_res ip_meta_lockres;
struct ocfs2_lock_res ip_data_lockres; struct ocfs2_lock_res ip_data_lockres;
struct ocfs2_lock_res ip_open_lockres;
/* protects allocation changes on this inode. */ /* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem; struct rw_semaphore ip_alloc_sem;
...@@ -42,9 +45,7 @@ struct ocfs2_inode_info ...@@ -42,9 +45,7 @@ struct ocfs2_inode_info
spinlock_t ip_lock; spinlock_t ip_lock;
u32 ip_open_count; u32 ip_open_count;
u32 ip_clusters; u32 ip_clusters;
struct ocfs2_extent_map ip_map;
struct list_head ip_io_markers; struct list_head ip_io_markers;
int ip_orphaned_slot;
struct mutex ip_io_mutex; struct mutex ip_io_mutex;
...@@ -64,6 +65,8 @@ struct ocfs2_inode_info ...@@ -64,6 +65,8 @@ struct ocfs2_inode_info
struct ocfs2_caching_info ip_metadata_cache; struct ocfs2_caching_info ip_metadata_cache;
struct ocfs2_extent_map ip_extent_map;
struct inode vfs_inode; struct inode vfs_inode;
}; };
...@@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode); ...@@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode);
void ocfs2_drop_inode(struct inode *inode); void ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */ /* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_NOWAIT 0x1
#define OCFS2_FI_FLAG_DELETE 0x2
#define OCFS2_FI_FLAG_SYSFILE 0x4 #define OCFS2_FI_FLAG_SYSFILE 0x4
#define OCFS2_FI_FLAG_NOLOCK 0x8 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
u64 blkno,
int delete_vote);
int ocfs2_inode_init_private(struct inode *inode); int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry); int ocfs2_inode_revalidate(struct dentry *dentry);
int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
...@@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); ...@@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
void ocfs2_set_inode_flags(struct inode *inode); void ocfs2_set_inode_flags(struct inode *inode);
static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
{
int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
}
#endif /* OCFS2_INODE_H */ #endif /* OCFS2_INODE_H */
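As an aside, the i_blocks arithmetic performed by ocfs2_inode_sector_count() above can be checked in isolation. The sketch below is a standalone program with hypothetical sizes, not kernel code: with 4KB clusters, s_clustersize_bits is 12, so each cluster contributes 8 of the 512-byte sectors counted in i_blocks.

/* Standalone sketch of the ocfs2_inode_sector_count() arithmetic; the
 * cluster size and cluster count below are hypothetical. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int clustersize_bits = 12;	/* assumed 4KB clusters */
	uint32_t ip_clusters = 10;		/* hypothetical allocation */

	int c_to_s_bits = clustersize_bits - 9;	/* 512-byte sectors */
	uint64_t blocks = (uint64_t)ip_clusters << c_to_s_bits;

	/* Prints: 10 clusters -> 80 sectors */
	printf("%u clusters -> %llu sectors\n", ip_clusters,
	       (unsigned long long)blocks);
	return 0;
}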
...@@ -649,29 +649,20 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) ...@@ -649,29 +649,20 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
static int ocfs2_force_read_journal(struct inode *inode) static int ocfs2_force_read_journal(struct inode *inode)
{ {
int status = 0; int status = 0;
int i, p_blocks; int i;
u64 v_blkno, p_blkno; u64 v_blkno, p_blkno, p_blocks, num_blocks;
#define CONCURRENT_JOURNAL_FILL 32 #define CONCURRENT_JOURNAL_FILL 32ULL
struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
mlog_entry_void(); mlog_entry_void();
BUG_ON(inode->i_blocks !=
ocfs2_align_bytes_to_sectors(i_size_read(inode)));
memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
mlog(0, "Force reading %llu blocks\n", num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
(unsigned long long)(inode->i_blocks >>
(inode->i_sb->s_blocksize_bits - 9)));
v_blkno = 0; v_blkno = 0;
while (v_blkno < while (v_blkno < num_blocks) {
(inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
status = ocfs2_extent_map_get_blocks(inode, v_blkno, status = ocfs2_extent_map_get_blocks(inode, v_blkno,
1, &p_blkno, &p_blkno, &p_blocks, NULL);
&p_blocks);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
...@@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, ...@@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
continue; continue;
iter = ocfs2_iget(osb, le64_to_cpu(de->inode), iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
OCFS2_FI_FLAG_NOLOCK); OCFS2_FI_FLAG_ORPHAN_RECOVERY);
if (IS_ERR(iter)) if (IS_ERR(iter))
continue; continue;
...@@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, ...@@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
/* Set the proper information to get us going into /* Set the proper information to get us going into
* ocfs2_delete_inode. */ * ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
oi->ip_orphaned_slot = slot;
spin_unlock(&oi->ip_lock); spin_unlock(&oi->ip_lock);
iput(inode); iput(inode);
......
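The journal hunks above also show the new ocfs2_extent_map_get_blocks() calling convention: the physical block and the length of the contiguous run come back through pointers, and the trailing argument (assumed here to be an optional extent-flags output) may be NULL. A hedged sketch of a caller walking a file's blocks with it, mirroring the journal pre-read loop; the helper name is hypothetical and error logging is omitted:

/* Sketch only: walk every virtual block of an inode using the
 * five-argument ocfs2_extent_map_get_blocks(). */
static int map_whole_file_sketch(struct inode *inode, u64 num_blocks)
{
	u64 v_blkno = 0, p_blkno, p_blocks;
	int status;

	while (v_blkno < num_blocks) {
		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
						     &p_blkno, &p_blocks,
						     NULL);
		if (status < 0)
			return status;
		/* p_blkno .. p_blkno + p_blocks - 1 back this virtual run. */
		v_blkno += p_blocks;
	}
	return 0;
}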
...@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, ...@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
/* We may be deleting metadata blocks, so metadata alloc dinode + /* We may be deleting metadata blocks, so metadata alloc dinode +
one desc. block for each possible delete. */ one desc. block for each possible delete. */
if (tree_depth && next_free == 1 && if (tree_depth && next_free == 1 &&
le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
credits += 1 + tree_depth; credits += 1 + tree_depth;
/* update to the truncate log. */ /* update to the truncate log. */
......
...@@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) ...@@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
int ret = 0, lock_level = 0; int ret = 0, lock_level = 0;
struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
/* We don't want to support shared writable mappings yet. */ /*
if (!ocfs2_mount_local(osb) && * Only support shared writeable mmap for local mounts which
* don't know about holes.
*/
if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
......
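Spelled out, the mmap gate above refuses a shared writable mapping unless the volume is both locally mounted and incapable of holes. A minimal sketch of the same predicate follows; the helper name is hypothetical and the flag tests mirror ocfs2_mmap() above.

/* Hedged sketch of the shared-writable-mmap gate. */
static int shared_writable_mmap_allowed_sketch(struct ocfs2_super *osb,
					       struct vm_area_struct *vma)
{
	int shared   = vma->vm_flags & (VM_SHARED | VM_MAYSHARE);
	int writable = vma->vm_flags & (VM_WRITE | VM_MAYWRITE);

	if (!(shared && writable))
		return 1;	/* private or read-only mappings are fine */

	/* Only local mounts without sparse allocation may share writably. */
	return ocfs2_mount_local(osb) && !ocfs2_sparse_alloc(osb);
}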
...@@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, ...@@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
if (IS_ERR(inode)) { if (IS_ERR(inode)) {
mlog(ML_ERROR, "Unable to create inode %llu\n",
(unsigned long long)blkno);
ret = ERR_PTR(-EACCES); ret = ERR_PTR(-EACCES);
goto bail_unlock; goto bail_unlock;
} }
...@@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, ...@@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
* unlink. */ * unlink. */
spin_lock(&oi->ip_lock); spin_lock(&oi->ip_lock);
oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
spin_unlock(&oi->ip_lock); spin_unlock(&oi->ip_lock);
bail_add: bail_add:
...@@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb, ...@@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
i_size_write(inode, inode->i_sb->s_blocksize); i_size_write(inode, inode->i_sb->s_blocksize);
inode->i_nlink = 2; inode->i_nlink = 2;
inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); inode->i_blocks = ocfs2_inode_sector_count(inode);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
...@@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, ...@@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
struct buffer_head **bhs = NULL; struct buffer_head **bhs = NULL;
const char *c; const char *c;
struct super_block *sb = osb->sb; struct super_block *sb = osb->sb;
u64 p_blkno; u64 p_blkno, p_blocks;
int p_blocks;
int virtual, blocks, status, i, bytes_left; int virtual, blocks, status, i, bytes_left;
bytes_left = i_size_read(inode) + 1; bytes_left = i_size_read(inode) + 1;
...@@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, ...@@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
goto bail; goto bail;
} }
status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
&p_blocks); NULL);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
...@@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir, ...@@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir,
inode->i_rdev = 0; inode->i_rdev = 0;
newsize = l - 1; newsize = l - 1;
if (l > ocfs2_fast_symlink_chars(sb)) { if (l > ocfs2_fast_symlink_chars(sb)) {
u32 offset = 0;
inode->i_op = &ocfs2_symlink_inode_operations; inode->i_op = &ocfs2_symlink_inode_operations;
status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
new_fe_bh,
handle, data_ac, NULL, handle, data_ac, NULL,
NULL); NULL);
if (status < 0) { if (status < 0) {
...@@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir, ...@@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir,
goto bail; goto bail;
} }
i_size_write(inode, newsize); i_size_write(inode, newsize);
inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); inode->i_blocks = ocfs2_inode_sector_count(inode);
} else { } else {
inode->i_op = &ocfs2_fast_symlink_inode_operations; inode->i_op = &ocfs2_fast_symlink_inode_operations;
memcpy((char *) fe->id2.i_symlink, symname, l); memcpy((char *) fe->id2.i_symlink, symname, l);
...@@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, ...@@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
/* Record which orphan dir our inode now resides /* Record which orphan dir our inode now resides
* in. delete_inode will use this to determine which orphan * in. delete_inode will use this to determine which orphan
* dir to lock. */ * dir to lock. */
spin_lock(&OCFS2_I(inode)->ip_lock); fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
spin_unlock(&OCFS2_I(inode)->ip_lock);
mlog(0, "Inode %llu orphaned in slot %d\n", mlog(0, "Inode %llu orphaned in slot %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
......
...@@ -46,11 +46,6 @@ ...@@ -46,11 +46,6 @@
#include "endian.h" #include "endian.h"
#include "ocfs2_lockid.h" #include "ocfs2_lockid.h"
struct ocfs2_extent_map {
u32 em_clusters;
struct rb_root em_extents;
};
/* Most user visible OCFS2 inodes will have very few pieces of /* Most user visible OCFS2 inodes will have very few pieces of
* metadata, but larger files (including bitmaps, etc) must be taken * metadata, but larger files (including bitmaps, etc) must be taken
* into account when designing an access scheme. We allow a small * into account when designing an access scheme. We allow a small
...@@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode) ...@@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode)
return 1; return 1;
} }
static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
return 1;
return 0;
}
/* set / clear functions because cluster events can make these happen /* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also * in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock * means that any future flags osb_flags must be protected by spinlock
...@@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) ...@@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
return (unsigned long)((bytes + 511) >> 9); return (unsigned long)((bytes + 511) >> 9);
} }
static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
unsigned long pg_index)
{
u32 clusters = pg_index;
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
if (unlikely(PAGE_CACHE_SHIFT > cbits))
clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
else if (PAGE_CACHE_SHIFT < cbits)
clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
return clusters;
}
/*
* Find the 1st page index which covers the given clusters.
*/
static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
u32 clusters)
{
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
unsigned long index = clusters;
if (PAGE_CACHE_SHIFT > cbits) {
index = clusters >> (PAGE_CACHE_SHIFT - cbits);
} else if (PAGE_CACHE_SHIFT < cbits) {
index = clusters << (cbits - PAGE_CACHE_SHIFT);
}
return index;
}
static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
{
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
unsigned int pages_per_cluster = 1;
if (PAGE_CACHE_SHIFT < cbits)
pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
return pages_per_cluster;
}
#define ocfs2_set_bit ext2_set_bit #define ocfs2_set_bit ext2_set_bit
#define ocfs2_clear_bit ext2_clear_bit #define ocfs2_clear_bit ext2_clear_bit
#define ocfs2_test_bit ext2_test_bit #define ocfs2_test_bit ext2_test_bit
......
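The three helpers added to ocfs2.h above convert between page-cache indexes and clusters in either direction, depending on which unit is larger. The standalone sketch below (hypothetical sizes: 4KB pages, 32KB clusters, so the cluster is the larger unit) reproduces the same shifts outside the kernel:

/* Standalone sketch of the page-index <-> cluster conversions; the page
 * and cluster shifts are hypothetical. */
#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;	/* assumed PAGE_CACHE_SHIFT */
	unsigned int cbits = 15;	/* assumed s_clustersize_bits */
	unsigned long pg_index = 20;

	/* page index -> cluster (clusters larger than pages here) */
	unsigned int cluster = pg_index >> (cbits - page_shift);

	/* cluster -> first page index covering it */
	unsigned long first_index = (unsigned long)cluster << (cbits - page_shift);

	/* pages needed to span one cluster */
	unsigned int pages_per_cluster = 1u << (cbits - page_shift);

	/* Prints: page 20 -> cluster 2; cluster 2 starts at page 16; 8 pages/cluster */
	printf("page %lu -> cluster %u; cluster %u starts at page %lu; %u pages/cluster\n",
	       pg_index, cluster, cluster, first_index, pages_per_cluster);
	return 0;
}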
...@@ -86,7 +86,8 @@ ...@@ -86,7 +86,8 @@
OCFS2_SB(sb)->s_feature_incompat &= ~(mask) OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
| OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 #define OCFS2_FEATURE_RO_COMPAT_SUPP 0
/* /*
...@@ -154,6 +155,12 @@ ...@@ -154,6 +155,12 @@
#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ #define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */
#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ #define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
/*
* Extent record flags (e_node.leaf.flags)
*/
#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
* unwritten */
/* /*
* ioctl commands * ioctl commands
*/ */
...@@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { ...@@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
/* /*
* On disk extent record for OCFS2 * On disk extent record for OCFS2
* It describes a range of clusters on disk. * It describes a range of clusters on disk.
*
* Length fields are divided into interior and leaf node versions.
* This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
*/ */
struct ocfs2_extent_rec { struct ocfs2_extent_rec {
/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
__le32 e_clusters; /* Clusters covered by this extent */ union {
__le32 e_int_clusters; /* Clusters covered by all children */
struct {
__le16 e_leaf_clusters; /* Clusters covered by this
extent */
__u8 e_reserved1;
__u8 e_flags; /* Extent flags */
};
};
__le64 e_blkno; /* Physical disk offset, in blocks */ __le64 e_blkno; /* Physical disk offset, in blocks */
/*10*/ /*10*/
}; };
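The union above means an extent record's length must be read differently depending on tree depth: interior records carry a 32-bit count covering all children, while leaf records carry a 16-bit count plus a flags byte. The diff already calls an ocfs2_rec_clusters() helper for this; the sketch below is an illustrative reimplementation of that idea together with a test for the new unwritten flag, not the in-tree code.

/* Illustrative only: pick the correct cluster count for a record based on
 * whether its extent list is a leaf, and test OCFS2_EXT_UNWRITTEN, which
 * is meaningful only in leaves. */
static inline u32 rec_clusters_sketch(struct ocfs2_extent_list *el,
				      struct ocfs2_extent_rec *rec)
{
	if (el->l_tree_depth)
		return le32_to_cpu(rec->e_int_clusters);
	return le16_to_cpu(rec->e_leaf_clusters);
}

static inline int rec_is_unwritten_sketch(struct ocfs2_extent_list *el,
					  struct ocfs2_extent_rec *rec)
{
	return !el->l_tree_depth &&
		(rec->e_flags & OCFS2_EXT_UNWRITTEN);
}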
...@@ -311,7 +329,10 @@ struct ocfs2_extent_list { ...@@ -311,7 +329,10 @@ struct ocfs2_extent_list {
/*00*/ __le16 l_tree_depth; /* Extent tree depth from this /*00*/ __le16 l_tree_depth; /* Extent tree depth from this
point. 0 means data extents point. 0 means data extents
hang directly off this hang directly off this
header (a leaf) */ header (a leaf)
NOTE: The high 8 bits cannot be
used - tree_depth is never that big.
*/
__le16 l_count; /* Number of extent records */ __le16 l_count; /* Number of extent records */
__le16 l_next_free_rec; /* Next unused extent slot */ __le16 l_next_free_rec; /* Next unused extent slot */
__le16 l_reserved1; __le16 l_reserved1;
...@@ -446,7 +467,9 @@ struct ocfs2_dinode { ...@@ -446,7 +467,9 @@ struct ocfs2_dinode {
__le32 i_ctime_nsec; __le32 i_ctime_nsec;
__le32 i_mtime_nsec; __le32 i_mtime_nsec;
__le32 i_attr; __le32 i_attr;
__le32 i_reserved1; __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
was set in i_flags */
__le16 i_reserved1;
/*70*/ __le64 i_reserved2[8]; /*70*/ __le64 i_reserved2[8];
/*B8*/ union { /*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this __le64 i_pad1; /* Generic way to refer to this
......
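With the vote-carried ip_orphaned_slot gone, the slot an inode was orphaned in lives only in the new i_orphaned_slot dinode field (set in ocfs2_orphan_add() above). A hedged sketch of reading it back follows; the helper name is hypothetical, and the value is valid only when the inode really was orphaned.

/* Sketch: recover the orphan slot from the on-disk inode. The field is
 * only meaningful when OCFS2_ORPHANED_FL is set in i_flags. */
static int orphan_slot_from_dinode_sketch(struct ocfs2_dinode *di)
{
	if (!(le32_to_cpu(di->i_flags) & OCFS2_ORPHANED_FL))
		return -1;	/* never orphaned: no valid slot */

	return le16_to_cpu(di->i_orphaned_slot);
}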
...@@ -44,6 +44,7 @@ enum ocfs2_lock_type { ...@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_RENAME, OCFS2_LOCK_TYPE_RENAME,
OCFS2_LOCK_TYPE_RW, OCFS2_LOCK_TYPE_RW,
OCFS2_LOCK_TYPE_DENTRY, OCFS2_LOCK_TYPE_DENTRY,
OCFS2_LOCK_TYPE_OPEN,
OCFS2_NUM_LOCK_TYPES OCFS2_NUM_LOCK_TYPES
}; };
...@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) ...@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_DENTRY: case OCFS2_LOCK_TYPE_DENTRY:
c = 'N'; c = 'N';
break; break;
case OCFS2_LOCK_TYPE_OPEN:
c = 'O';
break;
default: default:
c = '\0'; c = '\0';
} }
...@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = { ...@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
* important job it does, anyway. */ * important job it does, anyway. */
[OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
[OCFS2_LOCK_TYPE_OPEN] = "Open",
}; };
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
......
...@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) ...@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
goto bail; goto bail;
} }
status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
goto bail; goto bail;
......
...@@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, ...@@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
le32_to_cpu(fe->i_clusters))); le32_to_cpu(fe->i_clusters)));
spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
alloc_inode->i_blocks = alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
status = 0; status = 0;
bail: bail:
......
...@@ -806,9 +806,6 @@ static int __init ocfs2_init(void) ...@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
ocfs2_print_version(); ocfs2_print_version();
if (init_ocfs2_extent_maps())
return -ENOMEM;
status = init_ocfs2_uptodate_cache(); status = init_ocfs2_uptodate_cache();
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
...@@ -837,7 +834,6 @@ static int __init ocfs2_init(void) ...@@ -837,7 +834,6 @@ static int __init ocfs2_init(void)
if (status < 0) { if (status < 0) {
ocfs2_free_mem_caches(); ocfs2_free_mem_caches();
exit_ocfs2_uptodate_cache(); exit_ocfs2_uptodate_cache();
exit_ocfs2_extent_maps();
} }
mlog_exit(status); mlog_exit(status);
...@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void) ...@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
unregister_filesystem(&ocfs2_fs_type); unregister_filesystem(&ocfs2_fs_type);
exit_ocfs2_extent_maps();
exit_ocfs2_uptodate_cache(); exit_ocfs2_uptodate_cache();
mlog_exit_void(); mlog_exit_void();
...@@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data, ...@@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data,
ocfs2_lock_res_init_once(&oi->ip_rw_lockres); ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
ocfs2_lock_res_init_once(&oi->ip_meta_lockres); ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
ocfs2_lock_res_init_once(&oi->ip_data_lockres); ocfs2_lock_res_init_once(&oi->ip_data_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
ocfs2_metadata_cache_init(&oi->vfs_inode); ocfs2_metadata_cache_init(&oi->vfs_inode);
......
...@@ -63,17 +63,10 @@ struct ocfs2_msg_hdr ...@@ -63,17 +63,10 @@ struct ocfs2_msg_hdr
__be32 h_node_num; /* node sending this particular message. */ __be32 h_node_num; /* node sending this particular message. */
}; };
/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
* for the network. */
#define OCFS2_VOTE_FILENAME_LEN 256
struct ocfs2_vote_msg struct ocfs2_vote_msg
{ {
struct ocfs2_msg_hdr v_hdr; struct ocfs2_msg_hdr v_hdr;
union { __be32 v_reserved1;
__be32 v_generic1;
__be32 v_orphaned_slot; /* Used during delete votes */
__be32 v_nlink; /* Used during unlink votes */
} md1; /* Message type dependant 1 */
}; };
/* Responses are given these values to maintain backwards /* Responses are given these values to maintain backwards
...@@ -86,7 +79,6 @@ struct ocfs2_response_msg ...@@ -86,7 +79,6 @@ struct ocfs2_response_msg
{ {
struct ocfs2_msg_hdr r_hdr; struct ocfs2_msg_hdr r_hdr;
__be32 r_response; __be32 r_response;
__be32 r_orphaned_slot;
}; };
struct ocfs2_vote_work { struct ocfs2_vote_work {
...@@ -96,7 +88,6 @@ struct ocfs2_vote_work { ...@@ -96,7 +88,6 @@ struct ocfs2_vote_work {
enum ocfs2_vote_request { enum ocfs2_vote_request {
OCFS2_VOTE_REQ_INVALID = 0, OCFS2_VOTE_REQ_INVALID = 0,
OCFS2_VOTE_REQ_DELETE,
OCFS2_VOTE_REQ_MOUNT, OCFS2_VOTE_REQ_MOUNT,
OCFS2_VOTE_REQ_UMOUNT, OCFS2_VOTE_REQ_UMOUNT,
OCFS2_VOTE_REQ_LAST OCFS2_VOTE_REQ_LAST
...@@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb, ...@@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb,
ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
} }
void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
assert_spin_locked(&oi->ip_lock);
/* We set the SKIP_DELETE flag on the inode so we don't try to
* delete it in delete_inode ourselves, thus avoiding
* unecessary lock pinging. If the other node failed to wipe
* the inode as a result of a crash, then recovery will pick
* up the slack. */
oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
}
static int ocfs2_process_delete_request(struct inode *inode,
int *orphaned_slot)
{
int response = OCFS2_RESPONSE_BUSY;
mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
inode->i_ino, inode->i_nlink, *orphaned_slot);
spin_lock(&OCFS2_I(inode)->ip_lock);
/* Whatever our vote response is, we want to make sure that
* the orphaned slot is recorded properly on this node *and*
* on the requesting node. Technically, if the requesting node
* did not know which slot the inode is orphaned in but we
* respond with BUSY he doesn't actually need the orphaned
* slot, but it doesn't hurt to do it here anyway. */
if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
OCFS2_INVALID_SLOT &&
OCFS2_I(inode)->ip_orphaned_slot !=
(*orphaned_slot),
"Inode %llu: This node thinks it's "
"orphaned in slot %d, messaged it's in %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
OCFS2_I(inode)->ip_orphaned_slot,
*orphaned_slot);
mlog(0, "Setting orphaned slot for inode %llu to %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
*orphaned_slot);
OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
} else {
mlog(0, "Sending back orphaned slot %d for inode %llu\n",
OCFS2_I(inode)->ip_orphaned_slot,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
*orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
}
/* vote no if the file is still open. */
if (OCFS2_I(inode)->ip_open_count) {
mlog(0, "open count = %u\n",
OCFS2_I(inode)->ip_open_count);
spin_unlock(&OCFS2_I(inode)->ip_lock);
goto done;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
/* directories are a bit ugly... What if someone is sitting in
* it? We want to make sure the inode is removed completely as
* a result of the iput in process_vote. */
if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
goto done;
}
if (filemap_fdatawrite(inode->i_mapping)) {
mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
goto done;
}
sync_mapping_buffers(inode->i_mapping);
truncate_inode_pages(inode->i_mapping, 0);
ocfs2_extent_map_trunc(inode, 0);
spin_lock(&OCFS2_I(inode)->ip_lock);
/* double check open count - someone might have raced this
* thread into ocfs2_file_open while we were writing out
* data. If we're to allow a wipe of this inode now, we *must*
* hold the spinlock until we've marked it. */
if (OCFS2_I(inode)->ip_open_count) {
mlog(0, "Raced to wipe! open count = %u\n",
OCFS2_I(inode)->ip_open_count);
spin_unlock(&OCFS2_I(inode)->ip_lock);
goto done;
}
/* Mark the inode as being wiped from disk. */
ocfs2_mark_inode_remotely_deleted(inode);
spin_unlock(&OCFS2_I(inode)->ip_lock);
/* Not sure this is necessary anymore. */
d_prune_aliases(inode);
/* If we get here, then we're voting 'yes', so commit the
* delete on our side. */
response = OCFS2_RESPONSE_OK;
done:
return response;
}
static void ocfs2_process_vote(struct ocfs2_super *osb, static void ocfs2_process_vote(struct ocfs2_super *osb,
struct ocfs2_vote_msg *msg) struct ocfs2_vote_msg *msg)
{ {
int net_status, vote_response; int net_status, vote_response;
int orphaned_slot = 0; unsigned int node_num;
unsigned int node_num, generation;
u64 blkno; u64 blkno;
enum ocfs2_vote_request request; enum ocfs2_vote_request request;
struct inode *inode = NULL;
struct ocfs2_msg_hdr *hdr = &msg->v_hdr; struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
struct ocfs2_response_msg response; struct ocfs2_response_msg response;
/* decode the network mumbo jumbo into local variables. */ /* decode the network mumbo jumbo into local variables. */
request = be32_to_cpu(hdr->h_request); request = be32_to_cpu(hdr->h_request);
blkno = be64_to_cpu(hdr->h_blkno); blkno = be64_to_cpu(hdr->h_blkno);
generation = be32_to_cpu(hdr->h_generation);
node_num = be32_to_cpu(hdr->h_node_num); node_num = be32_to_cpu(hdr->h_node_num);
if (request == OCFS2_VOTE_REQ_DELETE)
orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
mlog(0, "processing vote: request = %u, blkno = %llu, " mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
"generation = %u, node_num = %u, priv1 = %u\n", request, request, (unsigned long long)blkno, node_num);
(unsigned long long)blkno, generation, node_num,
be32_to_cpu(msg->md1.v_generic1));
if (!ocfs2_is_valid_vote_request(request)) { if (!ocfs2_is_valid_vote_request(request)) {
mlog(ML_ERROR, "Invalid vote request %d from node %u\n", mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
...@@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb, ...@@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
break; break;
} }
/* We cannot process the remaining message types before we're
* fully mounted. It's perfectly safe however to send a 'yes'
* response as we can't possibly have any of the state they're
* asking us to modify yet. */
if (atomic_read(&osb->vol_state) == VOLUME_INIT)
goto respond;
/* If we get here, then the request is against an inode. */
inode = ocfs2_ilookup_for_vote(osb, blkno,
request == OCFS2_VOTE_REQ_DELETE);
/* Not finding the inode is perfectly valid - it means we're
* not interested in what the other node is about to do to it
* so in those cases we automatically respond with an
* affirmative. Cluster locking ensures that we won't race
* interest in the inode with this vote request. */
if (!inode)
goto respond;
/* Check generation values. It's possible for us to get a
* request against a stale inode. If so then we proceed as if
* we had not found an inode in the first place. */
if (inode->i_generation != generation) {
mlog(0, "generation passed %u != inode generation = %u, "
"ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
"message type = %u\n", generation, inode->i_generation,
OCFS2_I(inode)->ip_flags,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)blkno, atomic_read(&inode->i_count),
request);
iput(inode);
inode = NULL;
goto respond;
}
switch (request) {
case OCFS2_VOTE_REQ_DELETE:
vote_response = ocfs2_process_delete_request(inode,
&orphaned_slot);
break;
default:
mlog(ML_ERROR, "node %u, invalid request: %u\n",
node_num, request);
vote_response = OCFS2_RESPONSE_BAD_MSG;
}
respond: respond:
/* Response structure is small so we just put it on the stack /* Response structure is small so we just put it on the stack
* and stuff it inline. */ * and stuff it inline. */
...@@ -357,7 +190,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb, ...@@ -357,7 +190,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
response.r_hdr.h_generation = hdr->h_generation; response.r_hdr.h_generation = hdr->h_generation;
response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
response.r_response = cpu_to_be32(vote_response); response.r_response = cpu_to_be32(vote_response);
response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
osb->net_key, osb->net_key,
...@@ -373,9 +205,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb, ...@@ -373,9 +205,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
&& net_status != -ENOTCONN) && net_status != -ENOTCONN)
mlog(ML_ERROR, "message to node %u fails with error %d!\n", mlog(ML_ERROR, "message to node %u fails with error %d!\n",
node_num, net_status); node_num, net_status);
if (inode)
iput(inode);
} }
static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
...@@ -634,8 +463,7 @@ static int ocfs2_broadcast_vote(struct ocfs2_super *osb, ...@@ -634,8 +463,7 @@ static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
u64 blkno, u64 blkno,
unsigned int generation, unsigned int generation,
enum ocfs2_vote_request type, enum ocfs2_vote_request type)
u32 priv)
{ {
struct ocfs2_vote_msg *request; struct ocfs2_vote_msg *request;
struct ocfs2_msg_hdr *hdr; struct ocfs2_msg_hdr *hdr;
...@@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, ...@@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
hdr->h_request = cpu_to_be32(type); hdr->h_request = cpu_to_be32(type);
hdr->h_blkno = cpu_to_be64(blkno); hdr->h_blkno = cpu_to_be64(blkno);
hdr->h_generation = cpu_to_be32(generation); hdr->h_generation = cpu_to_be32(generation);
request->md1.v_generic1 = cpu_to_be32(priv);
} }
return request; return request;
...@@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb, ...@@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
struct ocfs2_vote_msg *request, struct ocfs2_vote_msg *request,
struct ocfs2_net_response_cb *callback) struct ocfs2_net_response_cb *callback)
{ {
int status, response; int status, response = -EBUSY;
unsigned int response_id; unsigned int response_id;
struct ocfs2_msg_hdr *hdr; struct ocfs2_msg_hdr *hdr;
...@@ -686,109 +512,12 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb, ...@@ -686,109 +512,12 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
return status; return status;
} }
static int ocfs2_request_vote(struct inode *inode,
struct ocfs2_vote_msg *request,
struct ocfs2_net_response_cb *callback)
{
int status;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (ocfs2_inode_is_new(inode))
return 0;
status = -EAGAIN;
while (status == -EAGAIN) {
if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
signal_pending(current))
return -ERESTARTSYS;
status = ocfs2_super_lock(osb, 0);
if (status < 0) {
mlog_errno(status);
break;
}
status = 0;
if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
osb->node_num))
status = ocfs2_do_request_vote(osb, request, callback);
ocfs2_super_unlock(osb, 0);
}
return status;
}
static void ocfs2_delete_response_cb(void *priv,
struct ocfs2_response_msg *resp)
{
int orphaned_slot, node;
struct inode *inode = priv;
orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
node = be32_to_cpu(resp->r_hdr.h_node_num);
mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
node, (unsigned long long)OCFS2_I(inode)->ip_blkno,
orphaned_slot);
/* The other node may not actually know which slot the inode
* is orphaned in. */
if (orphaned_slot == OCFS2_INVALID_SLOT)
return;
/* Ok, the responding node knows which slot this inode is
* orphaned in. We verify that the information is correct and
* then record this in the inode. ocfs2_delete_inode will use
* this information to determine which lock to take. */
spin_lock(&OCFS2_I(inode)->ip_lock);
mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
OCFS2_I(inode)->ip_orphaned_slot
!= OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
"orphaned in slot %d, we think it's in %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
be32_to_cpu(resp->r_hdr.h_node_num),
orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
spin_unlock(&OCFS2_I(inode)->ip_lock);
}
int ocfs2_request_delete_vote(struct inode *inode)
{
int orphaned_slot, status;
struct ocfs2_net_response_cb delete_cb;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_vote_msg *request;
spin_lock(&OCFS2_I(inode)->ip_lock);
orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
spin_unlock(&OCFS2_I(inode)->ip_lock);
delete_cb.rc_cb = ocfs2_delete_response_cb;
delete_cb.rc_priv = inode;
mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
status = -ENOMEM;
request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
inode->i_generation,
OCFS2_VOTE_REQ_DELETE, orphaned_slot);
if (request) {
status = ocfs2_request_vote(inode, request, &delete_cb);
kfree(request);
}
return status;
}
int ocfs2_request_mount_vote(struct ocfs2_super *osb) int ocfs2_request_mount_vote(struct ocfs2_super *osb)
{ {
int status; int status;
struct ocfs2_vote_msg *request = NULL; struct ocfs2_vote_msg *request = NULL;
request = ocfs2_new_vote_request(osb, 0ULL, 0, request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
OCFS2_VOTE_REQ_MOUNT, 0);
if (!request) { if (!request) {
status = -ENOMEM; status = -ENOMEM;
goto bail; goto bail;
...@@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb) ...@@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
int status; int status;
struct ocfs2_vote_msg *request = NULL; struct ocfs2_vote_msg *request = NULL;
request = ocfs2_new_vote_request(osb, 0ULL, 0, request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
OCFS2_VOTE_REQ_UMOUNT, 0);
if (!request) { if (!request) {
status = -ENOMEM; status = -ENOMEM;
goto bail; goto bail;
...@@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg, ...@@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
be32_to_cpu(work->w_msg.v_hdr.h_generation)); be32_to_cpu(work->w_msg.v_hdr.h_generation));
mlog(0, "h_node_num = %u\n", mlog(0, "h_node_num = %u\n",
be32_to_cpu(work->w_msg.v_hdr.h_node_num)); be32_to_cpu(work->w_msg.v_hdr.h_node_num));
mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
spin_lock(&osb->vote_task_lock); spin_lock(&osb->vote_task_lock);
list_add_tail(&work->w_list, &osb->vote_list); list_add_tail(&work->w_list, &osb->vote_list);
......
...@@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) ...@@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
wake_up(&osb->vote_event); wake_up(&osb->vote_event);
} }
int ocfs2_request_delete_vote(struct inode *inode);
int ocfs2_request_mount_vote(struct ocfs2_super *osb); int ocfs2_request_mount_vote(struct ocfs2_super *osb);
int ocfs2_request_umount_vote(struct ocfs2_super *osb); int ocfs2_request_umount_vote(struct ocfs2_super *osb);
int ocfs2_register_net_handlers(struct ocfs2_super *osb); int ocfs2_register_net_handlers(struct ocfs2_super *osb);
void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
int node_num); int node_num);
#endif #endif
...@@ -239,13 +239,11 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, ...@@ -239,13 +239,11 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
/* /*
* `endbyte' is inclusive * `endbyte' is inclusive
*/ */
int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
unsigned int flags) loff_t endbyte, unsigned int flags)
{ {
int ret; int ret;
struct address_space *mapping;
mapping = file->f_mapping;
if (!mapping) { if (!mapping) {
ret = -EINVAL; ret = -EINVAL;
goto out; goto out;
...@@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, ...@@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
out: out:
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(do_sync_file_range); EXPORT_SYMBOL_GPL(do_sync_mapping_range);
...@@ -843,8 +843,13 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); ...@@ -843,8 +843,13 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
extern int fcntl_getlease(struct file *filp); extern int fcntl_getlease(struct file *filp);
/* fs/sync.c */ /* fs/sync.c */
extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
unsigned int flags); loff_t endbyte, unsigned int flags);
static inline int do_sync_file_range(struct file *file, loff_t offset,
loff_t endbyte, unsigned int flags)
{
return do_sync_mapping_range(file->f_mapping, offset, endbyte, flags);
}
/* fs/locks.c */ /* fs/locks.c */
extern void locks_init_lock(struct file_lock *); extern void locks_init_lock(struct file_lock *);
......
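do_sync_file_range() is now just the inline wrapper shown above; callers that have only an address_space can call do_sync_mapping_range() directly. A hedged sketch of such a call, where the helper name, offsets, and the choice of SYNC_FILE_RANGE_WRITE are illustrative rather than taken from this patch:

/* Sketch only: start writeback for an inclusive byte range of a mapping
 * via the new entry point. */
static int start_range_writeback_sketch(struct address_space *mapping,
					loff_t start, loff_t end_inclusive)
{
	return do_sync_mapping_range(mapping, start, end_inclusive,
				     SYNC_FILE_RANGE_WRITE);
}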