Commit 8067253c authored by Christoph Hellwig's avatar Christoph Hellwig Committed by Trond Myklebust

pnfs/blocklayout: rewrite extent tracking

Currently the block layout driver tracks extents in three separate
data structures:

 - the two list of pnfs_block_extent structures returned by the server
 - the list of sectors that were in invalid state but have been written to
 - a list of pnfs_block_short_extent structures for LAYOUTCOMMIT

All of these share the property that they are not only highly inefficient
data structures, but also that operations on them are even more inefficient
than nessecary.

In addition there are various implementation defects like:

 - using an int to track sectors, causing corruption for large offsets
 - incorrect normalization of page or block granularity ranges
 - insufficient error handling
 - incorrect synchronization as extents can be modified while they are in
   use

This patch replace all three data with a single unified rbtree structure
tracking all extents, as well as their in-memory state, although we still
need to instance for read-only and read-write extent due to the arcane
client side COW feature in the block layouts spec.

To fix the problem of extent possibly being modified while in use we make
sure to return a copy of the extent for use in the write path - the
extent can only be invalidated by a layout recall or return which has
to wait until the I/O operations finished due to refcounts on the layout
segment.

The new extent tree work similar to the schemes used by block based
filesystems like XFS or ext4.
Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarTrond Myklebust <trond.myklebust@primarydata.com>
parent 8c792ea9
......@@ -2,4 +2,5 @@
# Makefile for the pNFS block layout driver kernel module
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \
extent_tree.o
This diff is collapsed.
......@@ -63,82 +63,28 @@ enum exstate4 {
PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
};
#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
struct my_tree {
sector_t mtt_step_size; /* Internal sector alignment */
struct list_head mtt_stub; /* Should be a radix tree */
};
struct pnfs_inval_markings {
spinlock_t im_lock;
struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
sector_t im_block_size; /* Server blocksize in sectors */
struct list_head im_extents; /* Short extents for INVAL->RW conversion */
};
struct pnfs_inval_tracking {
struct list_head it_link;
int it_sector;
int it_tags;
};
/* sector_t fields are all in 512-byte sectors */
struct pnfs_block_extent {
struct kref be_refcnt;
struct list_head be_node; /* link into lseg list */
union {
struct rb_node be_node;
struct list_head be_list;
};
struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */
struct block_device *be_mdev;
sector_t be_f_offset; /* the starting offset in the file */
sector_t be_length; /* the size of the extent */
sector_t be_v_offset; /* the starting offset in the volume */
enum exstate4 be_state; /* the state of this extent */
struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
};
/* Shortened extent used by LAYOUTCOMMIT */
struct pnfs_block_short_extent {
struct list_head bse_node;
struct nfs4_deviceid bse_devid;
struct block_device *bse_mdev;
sector_t bse_f_offset; /* the starting offset in the file */
sector_t bse_length; /* the size of the extent */
#define EXTENT_WRITTEN 1
#define EXTENT_COMMITTING 2
unsigned int be_tag;
};
static inline void
BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
{
spin_lock_init(&marks->im_lock);
INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
INIT_LIST_HEAD(&marks->im_extents);
marks->im_block_size = blocksize;
marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
blocksize);
}
enum extentclass4 {
RW_EXTENT = 0, /* READWRTE and INVAL */
RO_EXTENT = 1, /* READ and NONE */
EXTENT_LISTS = 2,
};
static inline int bl_choose_list(enum exstate4 state)
{
if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
return RO_EXTENT;
else
return RW_EXTENT;
}
struct pnfs_block_layout {
struct pnfs_layout_hdr bl_layout;
struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
struct rb_root bl_ext_rw;
struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
struct list_head bl_commit; /* Needs layout commit */
struct list_head bl_committing; /* Layout committing */
unsigned int bl_count; /* entries in bl_commit */
sector_t bl_blocksize; /* Server blocksize in sectors */
};
#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
......@@ -183,29 +129,17 @@ int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
/* blocklayoutdm.c */
void bl_free_block_dev(struct pnfs_block_dev *bdev);
/* extents.c */
struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
struct pnfs_block_extent **cow_read);
int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
sector_t offset, sector_t length);
void bl_put_extent(struct pnfs_block_extent *be);
struct pnfs_block_extent *bl_alloc_extent(void);
int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *arg);
void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
const struct nfs4_layoutcommit_args *arg,
int status);
int bl_add_merge_extent(struct pnfs_block_layout *bl,
/* extent_tree.c */
int ext_tree_insert(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
int bl_mark_for_commit(struct pnfs_block_extent *be,
sector_t offset, sector_t length,
struct pnfs_block_short_extent *new);
int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
struct pnfs_block_short_extent *
bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
sector_t end);
int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
sector_t len);
bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
struct pnfs_block_extent *ret, bool rw);
int ext_tree_encode_commit(struct pnfs_block_layout *bl,
struct xdr_stream *xdr);
void ext_tree_mark_committed(struct pnfs_block_layout *bl, int status);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
......@@ -309,7 +309,7 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
* recovery easier.
*/
for (i = 0; i < count; i++) {
be = bl_alloc_extent();
be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
if (!be) {
status = -ENOMEM;
goto out_err;
......@@ -330,13 +330,11 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
if (decode_sector_number(&p, &be->be_v_offset) < 0)
goto out_err;
be->be_state = be32_to_cpup(p++);
if (be->be_state == PNFS_BLOCK_INVALID_DATA)
be->be_inval = &bl->bl_inval;
if (verify_extent(be, &lv)) {
dprintk("%s verify failed\n", __func__);
goto out_err;
}
list_add_tail(&be->be_node, &extents);
list_add_tail(&be->be_list, &extents);
}
if (lgr->range.offset + lgr->range.length !=
lv.start << SECTOR_SHIFT) {
......@@ -352,21 +350,13 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
/* Extents decoded properly, now try to merge them in to
* existing layout extents.
*/
spin_lock(&bl->bl_ext_lock);
list_for_each_entry_safe(be, save, &extents, be_node) {
list_del(&be->be_node);
status = bl_add_merge_extent(bl, be);
if (status) {
spin_unlock(&bl->bl_ext_lock);
/* This is a fairly catastrophic error, as the
* entire layout extent lists are now corrupted.
* We should have some way to distinguish this.
*/
be = NULL;
goto out_err;
}
list_for_each_entry_safe(be, save, &extents, be_list) {
list_del(&be->be_list);
status = ext_tree_insert(bl, be);
if (status)
goto out_free_list;
}
spin_unlock(&bl->bl_ext_lock);
status = 0;
out:
__free_page(scratch);
......@@ -374,12 +364,13 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
return status;
out_err:
bl_put_extent(be);
kfree(be);
out_free_list:
while (!list_empty(&extents)) {
be = list_first_entry(&extents, struct pnfs_block_extent,
be_node);
list_del(&be->be_node);
bl_put_extent(be);
be_list);
list_del(&be->be_list);
kfree(be);
}
goto out;
}
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment