Commit cd1acdf1 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'pnfs-submit' of git://git.open-osd.org/linux-open-osd

* 'pnfs-submit' of git://git.open-osd.org/linux-open-osd: (32 commits)
  pnfs-obj: pg_test check for max_io_size
  NFSv4.1: define nfs_generic_pg_test
  NFSv4.1: use pnfs_generic_pg_test directly by layout driver
  NFSv4.1: change pg_test return type to bool
  NFSv4.1: unify pnfs_pageio_init functions
  pnfs-obj: objlayout_encode_layoutcommit implementation
  pnfs: encode_layoutcommit
  pnfs-obj: report errors and .encode_layoutreturn Implementation.
  pnfs: encode_layoutreturn
  pnfs: layoutret_on_setattr
  pnfs: layoutreturn
  pnfs-obj: osd raid engine read/write implementation
  pnfs: support for non-rpc layout drivers
  pnfs-obj: define per-inode private structure
  pnfs: alloc and free layout_hdr layoutdriver methods
  pnfs-obj: objio_osd device information retrieval and caching
  pnfs-obj: decode layout, alloc/free lseg
  pnfs-obj: pnfs_osd XDR client implementation
  pnfs-obj: pnfs_osd XDR definitions
  pnfs-obj: objlayoutdriver module skeleton
  ...
parents fac04863 93420770
...@@ -87,6 +87,16 @@ config NFS_V4_1 ...@@ -87,6 +87,16 @@ config NFS_V4_1
config PNFS_FILE_LAYOUT config PNFS_FILE_LAYOUT
tristate tristate
config PNFS_OBJLAYOUT
tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
help
Say M here if you want your pNFS client to support the Objects Layout Driver.
Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
upper level driver (SCSI_OSD_ULD).
If unsure, say N.
config ROOT_NFS config ROOT_NFS
bool "Root file system on NFS" bool "Root file system on NFS"
depends on NFS_FS=y && IP_PNP depends on NFS_FS=y && IP_PNP
......
...@@ -15,9 +15,11 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ ...@@ -15,9 +15,11 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
delegation.o idmap.o \ delegation.o idmap.o \
callback.o callback_xdr.o callback_proc.o \ callback.o callback_xdr.o callback_proc.o \
nfs4namespace.o nfs4namespace.o
nfs-$(CONFIG_NFS_V4_1) += pnfs.o nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_SYSCTL) += sysctl.o
nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
...@@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall( ...@@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall(
extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
extern void nfs4_cb_take_slot(struct nfs_client *clp); extern void nfs4_cb_take_slot(struct nfs_client *clp);
/*
 * One decoded device-ID notification taken from the argument list of an
 * OP_CB_NOTIFY_DEVICEID callback.
 */
struct cb_devicenotifyitem {
/* NOTIFY_DEVICEID4_CHANGE or NOTIFY_DEVICEID4_DELETE (the decoder rejects anything else) */
uint32_t cbd_notify_type;
/* layout type; compared against a layout driver's ->id to find the owner */
uint32_t cbd_layout_type;
/* the device ID the notification refers to */
struct nfs4_deviceid cbd_dev_id;
/* "immediate" word read from the wire; the decoder stores 0 when it is not read */
uint32_t cbd_immediate;
};
/* Decoded argument list of an OP_CB_NOTIFY_DEVICEID callback. */
struct cb_devicenotifyargs {
/* number of successfully decoded entries in devs[] */
int ndevs;
/* kmalloc()ed by the decoder; kfree()d by nfs4_callback_devicenotify() */
struct cb_devicenotifyitem *devs;
};
/*
 * Handler installed as .process_op for OP_CB_NOTIFY_DEVICEID in
 * callback_ops[].  Consumes (and frees) args->devs.
 */
extern __be32 nfs4_callback_devicenotify(
struct cb_devicenotifyargs *args,
void *dummy, struct cb_process_state *cps);
#endif /* CONFIG_NFS_V4_1 */ #endif /* CONFIG_NFS_V4_1 */
extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
......
...@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, ...@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
spin_lock(&ino->i_lock); spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
mark_matching_lsegs_invalid(lo, &free_me_list, mark_matching_lsegs_invalid(lo, &free_me_list,
args->cbl_range.iomode)) &args->cbl_range))
rv = NFS4ERR_DELAY; rv = NFS4ERR_DELAY;
else else
rv = NFS4ERR_NOMATCHING_LAYOUT; rv = NFS4ERR_NOMATCHING_LAYOUT;
...@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, ...@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
ino = lo->plh_inode; ino = lo->plh_inode;
spin_lock(&ino->i_lock); spin_lock(&ino->i_lock);
set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode)) if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
rv = NFS4ERR_DELAY; rv = NFS4ERR_DELAY;
list_del_init(&lo->plh_bulk_recall); list_del_init(&lo->plh_bulk_recall);
spin_unlock(&ino->i_lock); spin_unlock(&ino->i_lock);
...@@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp) ...@@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
do_callback_layoutrecall(clp, &args); do_callback_layoutrecall(clp, &args);
} }
__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
void *dummy, struct cb_process_state *cps)
{
int i;
__be32 res = 0;
struct nfs_client *clp = cps->clp;
struct nfs_server *server = NULL;
dprintk("%s: -->\n", __func__);
if (!clp) {
res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
goto out;
}
for (i = 0; i < args->ndevs; i++) {
struct cb_devicenotifyitem *dev = &args->devs[i];
if (!server ||
server->pnfs_curr_ld->id != dev->cbd_layout_type) {
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
if (server->pnfs_curr_ld &&
server->pnfs_curr_ld->id == dev->cbd_layout_type) {
rcu_read_unlock();
goto found;
}
rcu_read_unlock();
dprintk("%s: layout type %u not found\n",
__func__, dev->cbd_layout_type);
continue;
}
found:
if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
"deleting instead\n", __func__);
nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
}
out:
kfree(args->devs);
dprintk("%s: exit with status = %u\n",
__func__, be32_to_cpu(res));
return res;
}
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
{ {
if (delegation == NULL) if (delegation == NULL)
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#if defined(CONFIG_NFS_V4_1) #if defined(CONFIG_NFS_V4_1)
#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
4 + 1 + 3) 4 + 1 + 3)
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
...@@ -284,6 +285,93 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, ...@@ -284,6 +285,93 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
return status; return status;
} }
static
__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
struct xdr_stream *xdr,
struct cb_devicenotifyargs *args)
{
__be32 *p;
__be32 status = 0;
u32 tmp;
int n, i;
args->ndevs = 0;
/* Num of device notifications */
p = read_buf(xdr, sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto out;
}
n = ntohl(*p++);
if (n <= 0)
goto out;
args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
if (!args->devs) {
status = htonl(NFS4ERR_DELAY);
goto out;
}
/* Decode each dev notification */
for (i = 0; i < n; i++) {
struct cb_devicenotifyitem *dev = &args->devs[i];
p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto err;
}
tmp = ntohl(*p++); /* bitmap size */
if (tmp != 1) {
status = htonl(NFS4ERR_INVAL);
goto err;
}
dev->cbd_notify_type = ntohl(*p++);
if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
status = htonl(NFS4ERR_INVAL);
goto err;
}
tmp = ntohl(*p++); /* opaque size */
if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
(tmp != NFS4_DEVICEID4_SIZE + 8)) ||
((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
(tmp != NFS4_DEVICEID4_SIZE + 4))) {
status = htonl(NFS4ERR_INVAL);
goto err;
}
dev->cbd_layout_type = ntohl(*p++);
memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
p = read_buf(xdr, sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto err;
}
dev->cbd_immediate = ntohl(*p++);
} else {
dev->cbd_immediate = 0;
}
args->ndevs++;
dprintk("%s: type %d layout 0x%x immediate %d\n",
__func__, dev->cbd_notify_type, dev->cbd_layout_type,
dev->cbd_immediate);
}
out:
dprintk("%s: status %d ndevs %d\n",
__func__, ntohl(status), args->ndevs);
return status;
err:
kfree(args->devs);
goto out;
}
static __be32 decode_sessionid(struct xdr_stream *xdr, static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid) struct nfs4_sessionid *sid)
{ {
...@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) ...@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_RECALL_ANY: case OP_CB_RECALL_ANY:
case OP_CB_RECALL_SLOT: case OP_CB_RECALL_SLOT:
case OP_CB_LAYOUTRECALL: case OP_CB_LAYOUTRECALL:
case OP_CB_NOTIFY_DEVICEID:
*op = &callback_ops[op_nr]; *op = &callback_ops[op_nr];
break; break;
case OP_CB_NOTIFY_DEVICEID:
case OP_CB_NOTIFY: case OP_CB_NOTIFY:
case OP_CB_PUSH_DELEG: case OP_CB_PUSH_DELEG:
case OP_CB_RECALLABLE_OBJ_AVAIL: case OP_CB_RECALLABLE_OBJ_AVAIL:
...@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = { ...@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
(callback_decode_arg_t)decode_layoutrecall_args, (callback_decode_arg_t)decode_layoutrecall_args,
.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
}, },
[OP_CB_NOTIFY_DEVICEID] = {
.process_op = (callback_process_op_t)nfs4_callback_devicenotify,
.decode_args =
(callback_decode_arg_t)decode_devicenotify_args,
.res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
},
[OP_CB_SEQUENCE] = { [OP_CB_SEQUENCE] = {
.process_op = (callback_process_op_t)nfs4_callback_sequence, .process_op = (callback_process_op_t)nfs4_callback_sequence,
.decode_args = (callback_decode_arg_t)decode_cb_sequence_args, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
......
...@@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp) ...@@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp)
if (clp->cl_machine_cred != NULL) if (clp->cl_machine_cred != NULL)
put_rpccred(clp->cl_machine_cred); put_rpccred(clp->cl_machine_cred);
nfs4_deviceid_purge_client(clp);
kfree(clp->cl_hostname); kfree(clp->cl_hostname);
kfree(clp); kfree(clp);
......
...@@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en ...@@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
struct page **xdr_pages, struct page *page, unsigned int buflen) struct page **xdr_pages, struct page *page, unsigned int buflen)
{ {
struct xdr_stream stream; struct xdr_stream stream;
struct xdr_buf buf = { struct xdr_buf buf;
.pages = xdr_pages,
.page_len = buflen,
.buflen = buflen,
.len = buflen,
};
struct page *scratch; struct page *scratch;
struct nfs_cache_array *array; struct nfs_cache_array *array;
unsigned int count = 0; unsigned int count = 0;
...@@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en ...@@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
if (scratch == NULL) if (scratch == NULL)
return -ENOMEM; return -ENOMEM;
xdr_init_decode(&stream, &buf, NULL); xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
do { do {
......
...@@ -1428,9 +1428,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) ...@@ -1428,9 +1428,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/ */
void nfs4_evict_inode(struct inode *inode) void nfs4_evict_inode(struct inode *inode)
{ {
pnfs_destroy_layout(NFS_I(inode));
truncate_inode_pages(&inode->i_data, 0); truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode); end_writeback(inode);
pnfs_return_layout(inode);
pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */ /* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode); nfs_inode_return_delegation_noreclaim(inode);
/* First call standard NFS clear_inode() code */ /* First call standard NFS clear_inode() code */
......
...@@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *, ...@@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *,
#endif #endif
/* nfs4proc.c */ /* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_read_data *);
extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
extern int nfs4_init_client(struct nfs_client *clp, extern int nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms, const struct rpc_timeout *timeparms,
......
...@@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, ...@@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
struct nfs4_deviceid *id, struct nfs4_deviceid *id,
gfp_t gfp_flags) gfp_t gfp_flags)
{ {
struct nfs4_deviceid_node *d;
struct nfs4_file_layout_dsaddr *dsaddr; struct nfs4_file_layout_dsaddr *dsaddr;
int status = -EINVAL; int status = -EINVAL;
struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
...@@ -428,7 +429,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, ...@@ -428,7 +429,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__); dprintk("--> %s\n", __func__);
if (fl->pattern_offset > lgr->range.offset) { if (fl->pattern_offset > lgr->range.offset) {
dprintk("%s pattern_offset %lld to large\n", dprintk("%s pattern_offset %lld too large\n",
__func__, fl->pattern_offset); __func__, fl->pattern_offset);
goto out; goto out;
} }
...@@ -440,12 +441,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, ...@@ -440,12 +441,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
} }
/* find and reference the deviceid */ /* find and reference the deviceid */
dsaddr = nfs4_fl_find_get_deviceid(id); d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
if (dsaddr == NULL) { NFS_SERVER(lo->plh_inode)->nfs_client, id);
if (d == NULL) {
dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
if (dsaddr == NULL) if (dsaddr == NULL)
goto out; goto out;
} } else
dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
fl->dsaddr = dsaddr; fl->dsaddr = dsaddr;
if (fl->first_stripe_index < 0 || if (fl->first_stripe_index < 0 ||
...@@ -507,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, ...@@ -507,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
gfp_t gfp_flags) gfp_t gfp_flags)
{ {
struct xdr_stream stream; struct xdr_stream stream;
struct xdr_buf buf = { struct xdr_buf buf;
.pages = lgr->layoutp->pages,
.page_len = lgr->layoutp->len,
.buflen = lgr->layoutp->len,
.len = lgr->layoutp->len,
};
struct page *scratch; struct page *scratch;
__be32 *p; __be32 *p;
uint32_t nfl_util; uint32_t nfl_util;
...@@ -524,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, ...@@ -524,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
if (!scratch) if (!scratch)
return -ENOMEM; return -ENOMEM;
xdr_init_decode(&stream, &buf, NULL); xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
/* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
...@@ -535,7 +533,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, ...@@ -535,7 +533,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
memcpy(id, p, sizeof(*id)); memcpy(id, p, sizeof(*id));
p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
print_deviceid(id); nfs4_print_deviceid(id);
nfl_util = be32_to_cpup(p++); nfl_util = be32_to_cpup(p++);
if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
...@@ -653,16 +651,19 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, ...@@ -653,16 +651,19 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
/* /*
* filelayout_pg_test(). Called by nfs_can_coalesce_requests() * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
* *
* return 1 : coalesce page * return true : coalesce page
* return 0 : don't coalesce page * return false : don't coalesce page
*/ */
int bool
filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req) struct nfs_page *req)
{ {
u64 p_stripe, r_stripe; u64 p_stripe, r_stripe;
u32 stripe_unit; u32 stripe_unit;
if (!pnfs_generic_pg_test(pgio, prev, req))
return 0;
if (!pgio->pg_lseg) if (!pgio->pg_lseg)
return 1; return 1;
p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
...@@ -860,6 +861,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, ...@@ -860,6 +861,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
return -ENOMEM; return -ENOMEM;
} }
/*
 * .free_deviceid_node hook of the files layout driver: map the generic
 * device-ID node back to the nfs4_file_layout_dsaddr that embeds it and
 * release that structure.
 */
static void
filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
{
	struct nfs4_file_layout_dsaddr *dsaddr;

	dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
	nfs4_fl_free_deviceid(dsaddr);
}
static struct pnfs_layoutdriver_type filelayout_type = { static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES, .id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES", .name = "LAYOUT_NFSV4_1_FILES",
...@@ -872,6 +879,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { ...@@ -872,6 +879,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.commit_pagelist = filelayout_commit_pagelist, .commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist, .read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist, .write_pagelist = filelayout_write_pagelist,
.free_deviceid_node = filelayout_free_deveiceid_node,
}; };
static int __init nfs4filelayout_init(void) static int __init nfs4filelayout_init(void)
......
...@@ -59,9 +59,7 @@ struct nfs4_pnfs_ds { ...@@ -59,9 +59,7 @@ struct nfs4_pnfs_ds {
#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 #define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
struct nfs4_file_layout_dsaddr { struct nfs4_file_layout_dsaddr {
struct hlist_node node; struct nfs4_deviceid_node id_node;
struct nfs4_deviceid deviceid;
atomic_t ref;
unsigned long flags; unsigned long flags;
u32 stripe_count; u32 stripe_count;
u8 *stripe_indices; u8 *stripe_indices;
...@@ -95,14 +93,12 @@ extern struct nfs_fh * ...@@ -95,14 +93,12 @@ extern struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
extern void print_ds(struct nfs4_pnfs_ds *ds); extern void print_ds(struct nfs4_pnfs_ds *ds);
extern void print_deviceid(struct nfs4_deviceid *dev_id);
u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
u32 ds_idx); u32 ds_idx);
extern struct nfs4_file_layout_dsaddr *
nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
struct nfs4_file_layout_dsaddr * struct nfs4_file_layout_dsaddr *
get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
......
...@@ -36,30 +36,6 @@ ...@@ -36,30 +36,6 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD #define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
* Device ID RCU cache. A device ID is unique per client ID and layout type.
*/
#define NFS4_FL_DEVICE_ID_HASH_BITS 5
#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
static inline u32
nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
{
unsigned char *cptr = (unsigned char *)id->data;
unsigned int nbytes = NFS4_DEVICEID4_SIZE;
u32 x = 0;
while (nbytes--) {
x *= 37;
x += *cptr++;
}
return x & NFS4_FL_DEVICE_ID_HASH_MASK;
}
static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
static DEFINE_SPINLOCK(filelayout_deviceid_lock);
/* /*
* Data server cache * Data server cache
* *
...@@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds) ...@@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds)
ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
} }
void
print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
{
int i;
ifdebug(FACILITY) {
printk("%s dsaddr->ds_num %d\n", __func__,
dsaddr->ds_num);
for (i = 0; i < dsaddr->ds_num; i++)
print_ds(dsaddr->ds_list[i]);
}
}
void print_deviceid(struct nfs4_deviceid *id)
{
u32 *p = (u32 *)id;
dprintk("%s: device id= [%x%x%x%x]\n", __func__,
p[0], p[1], p[2], p[3]);
}
/* nfs4_ds_cache_lock is held */ /* nfs4_ds_cache_lock is held */
static struct nfs4_pnfs_ds * static struct nfs4_pnfs_ds *
_data_server_lookup_locked(u32 ip_addr, u32 port) _data_server_lookup_locked(u32 ip_addr, u32 port)
...@@ -201,13 +156,13 @@ destroy_ds(struct nfs4_pnfs_ds *ds) ...@@ -201,13 +156,13 @@ destroy_ds(struct nfs4_pnfs_ds *ds)
kfree(ds); kfree(ds);
} }
static void void
nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{ {
struct nfs4_pnfs_ds *ds; struct nfs4_pnfs_ds *ds;
int i; int i;
print_deviceid(&dsaddr->deviceid); nfs4_print_deviceid(&dsaddr->id_node.deviceid);
for (i = 0; i < dsaddr->ds_num; i++) { for (i = 0; i < dsaddr->ds_num; i++) {
ds = dsaddr->ds_list[i]; ds = dsaddr->ds_list[i];
...@@ -353,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) ...@@ -353,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
u8 max_stripe_index; u8 max_stripe_index;
struct nfs4_file_layout_dsaddr *dsaddr = NULL; struct nfs4_file_layout_dsaddr *dsaddr = NULL;
struct xdr_stream stream; struct xdr_stream stream;
struct xdr_buf buf = { struct xdr_buf buf;
.pages = pdev->pages,
.page_len = pdev->pglen,
.buflen = pdev->pglen,
.len = pdev->pglen,
};
struct page *scratch; struct page *scratch;
/* set up xdr stream */ /* set up xdr stream */
...@@ -366,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) ...@@ -366,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
if (!scratch) if (!scratch)
goto out_err; goto out_err;
xdr_init_decode(&stream, &buf, NULL); xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
/* Get the stripe count (number of stripe index) */ /* Get the stripe count (number of stripe index) */
...@@ -431,8 +381,10 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) ...@@ -431,8 +381,10 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
dsaddr->stripe_indices = stripe_indices; dsaddr->stripe_indices = stripe_indices;
stripe_indices = NULL; stripe_indices = NULL;
dsaddr->ds_num = num; dsaddr->ds_num = num;
nfs4_init_deviceid_node(&dsaddr->id_node,
memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); NFS_SERVER(ino)->pnfs_curr_ld,
NFS_SERVER(ino)->nfs_client,
&pdev->dev_id);
for (i = 0; i < dsaddr->ds_num; i++) { for (i = 0; i < dsaddr->ds_num; i++) {
int j; int j;
...@@ -505,8 +457,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) ...@@ -505,8 +457,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
static struct nfs4_file_layout_dsaddr * static struct nfs4_file_layout_dsaddr *
decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
{ {
struct nfs4_file_layout_dsaddr *d, *new; struct nfs4_deviceid_node *d;
long hash; struct nfs4_file_layout_dsaddr *n, *new;
new = decode_device(inode, dev, gfp_flags); new = decode_device(inode, dev, gfp_flags);
if (!new) { if (!new) {
...@@ -515,20 +467,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl ...@@ -515,20 +467,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
return NULL; return NULL;
} }
spin_lock(&filelayout_deviceid_lock); d = nfs4_insert_deviceid_node(&new->id_node);
d = nfs4_fl_find_get_deviceid(&new->deviceid); n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
if (d) { if (n != new) {
spin_unlock(&filelayout_deviceid_lock);
nfs4_fl_free_deviceid(new); nfs4_fl_free_deviceid(new);
return d; return n;
} }
INIT_HLIST_NODE(&new->node);
atomic_set(&new->ref, 1);
hash = nfs4_fl_deviceid_hash(&new->deviceid);
hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
spin_unlock(&filelayout_deviceid_lock);
return new; return new;
} }
...@@ -600,35 +545,7 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_fla ...@@ -600,35 +545,7 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_fla
void void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{ {
if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) { nfs4_put_deviceid_node(&dsaddr->id_node);
hlist_del_rcu(&dsaddr->node);
spin_unlock(&filelayout_deviceid_lock);
synchronize_rcu();
nfs4_fl_free_deviceid(dsaddr);
}
}
struct nfs4_file_layout_dsaddr *
nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
{
struct nfs4_file_layout_dsaddr *d;
struct hlist_node *n;
long hash = nfs4_fl_deviceid_hash(id);
rcu_read_lock();
hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
if (!memcmp(&d->deviceid, id, sizeof(*id))) {
if (!atomic_inc_not_zero(&d->ref))
goto fail;
rcu_read_unlock();
return d;
}
}
fail:
rcu_read_unlock();
return NULL;
} }
/* /*
...@@ -676,15 +593,15 @@ static void ...@@ -676,15 +593,15 @@ static void
filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
int err, u32 ds_addr) int err, u32 ds_addr)
{ {
u32 *p = (u32 *)&dsaddr->deviceid; u32 *p = (u32 *)&dsaddr->id_node.deviceid;
printk(KERN_ERR "NFS: data server %x connection error %d." printk(KERN_ERR "NFS: data server %x connection error %d."
" Deviceid [%x%x%x%x] marked out of use.\n", " Deviceid [%x%x%x%x] marked out of use.\n",
ds_addr, err, p[0], p[1], p[2], p[3]); ds_addr, err, p[0], p[1], p[2], p[3]);
spin_lock(&filelayout_deviceid_lock); spin_lock(&nfs4_ds_cache_lock);
dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
spin_unlock(&filelayout_deviceid_lock); spin_unlock(&nfs4_ds_cache_lock);
} }
struct nfs4_pnfs_ds * struct nfs4_pnfs_ds *
......
...@@ -2363,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, ...@@ -2363,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct nfs4_state *state = NULL; struct nfs4_state *state = NULL;
int status; int status;
if (pnfs_ld_layoutret_on_setattr(inode))
pnfs_return_layout(inode);
nfs_fattr_init(fattr); nfs_fattr_init(fattr);
/* Search for an existing open(O_WRITE) file */ /* Search for an existing open(O_WRITE) file */
...@@ -3177,6 +3180,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, ...@@ -3177,6 +3180,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return err; return err;
} }
/*
 * Read-completion step exported for use outside this file: invalidate the
 * cached atime of the inode the read was issued against.
 */
void __nfs4_read_done_cb(struct nfs_read_data *data)
{
	struct inode *inode = data->inode;

	nfs_invalidate_atime(inode);
}
static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
{ {
struct nfs_server *server = NFS_SERVER(data->inode); struct nfs_server *server = NFS_SERVER(data->inode);
...@@ -3186,7 +3194,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) ...@@ -3186,7 +3194,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
return -EAGAIN; return -EAGAIN;
} }
nfs_invalidate_atime(data->inode); __nfs4_read_done_cb(data);
if (task->tk_status > 0) if (task->tk_status > 0)
renew_lease(server, data->timestamp); renew_lease(server, data->timestamp);
return 0; return 0;
...@@ -3200,7 +3208,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) ...@@ -3200,7 +3208,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
if (!nfs4_sequence_done(task, &data->res.seq_res)) if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN; return -EAGAIN;
return data->read_done_cb(task, data); return data->read_done_cb ? data->read_done_cb(task, data) :
nfs4_read_done_cb(task, data);
} }
static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
...@@ -3245,7 +3254,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) ...@@ -3245,7 +3254,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
{ {
if (!nfs4_sequence_done(task, &data->res.seq_res)) if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN; return -EAGAIN;
return data->write_done_cb(task, data); return data->write_done_cb ? data->write_done_cb(task, data) :
nfs4_write_done_cb(task, data);
} }
/* Reset the nfs_write_data to send the write to the MDS. */ /* Reset the nfs_write_data to send the write to the MDS. */
...@@ -5671,6 +5681,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) ...@@ -5671,6 +5681,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
return status; return status;
} }
/*
 * rpc_call_prepare handler for LAYOUTRETURN: set up the session sequence
 * for the call and start the task only when nfs41_setup_sequence()
 * returns zero.
 */
static void
nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
{
	struct nfs4_layoutreturn *lrp = calldata;

	dprintk("--> %s\n", __func__);
	if (!nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
				  &lrp->res.seq_res, 0, task))
		rpc_call_start(task);
}
/*
 * rpc_call_done callback for LAYOUTRETURN.
 *
 * Bails out early (without the exit trace) when the sequence is not done
 * or when the error handler asks for a restart. On a successful reply it
 * either records the stateid the server returned (under the inode lock) or,
 * when no stateid is present, asserts that the layout has no segments left.
 */
static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
{
	struct nfs4_layoutreturn *lrp = calldata;

	dprintk("--> %s\n", __func__);

	if (!nfs4_sequence_done(task, &lrp->res.seq_res))
		return;

	if (nfs4_async_handle_error(task, NFS_SERVER(lrp->args.inode),
				    NULL) == -EAGAIN) {
		nfs_restart_rpc(task, lrp->clp);
		return;
	}

	if (task->tk_status == 0) {
		struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;

		if (!lrp->res.lrs_present) {
			/* No stateid came back: nothing may remain in the layout */
			BUG_ON(!list_empty(&lo->plh_segs));
		} else {
			spin_lock(&lo->plh_inode->i_lock);
			pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
			spin_unlock(&lo->plh_inode->i_lock);
		}
	}

	dprintk("<-- %s\n", __func__);
}
/*
 * rpc_release callback for LAYOUTRETURN: drops a reference on the inode's
 * layout header and frees the nfs4_layoutreturn call data.
 */
static void nfs4_layoutreturn_release(void *calldata)
{
	struct nfs4_layoutreturn *lrp = calldata;

	dprintk("--> %s\n", __func__);

	put_layout_hdr(NFS_I(lrp->args.inode)->layout);
	kfree(lrp);

	dprintk("<-- %s\n", __func__);
}
/*
 * RPC callbacks for LAYOUTRETURN. nfs4_proc_layoutreturn() hands this table
 * to rpc_run_task() through rpc_task_setup.callback_ops.
 */
static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
.rpc_call_prepare = nfs4_layoutreturn_prepare,
.rpc_call_done = nfs4_layoutreturn_done,
.rpc_release = nfs4_layoutreturn_release,
};
/*
 * Issue a LAYOUTRETURN for @lrp.
 *
 * @lrp is passed as the callback data of the RPC task and is freed by
 * nfs4_layoutreturn_release().
 *
 * Returns PTR_ERR() of the task when rpc_run_task() fails, otherwise the
 * task's tk_status as sampled right after rpc_run_task() returns.
 */
int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
		.rpc_argp = &lrp->args,
		.rpc_resp = &lrp->res,
	};
	struct rpc_task_setup task_setup_data = {
		.rpc_client = lrp->clp->cl_rpcclient,
		.rpc_message = &msg,
		.callback_ops = &nfs4_layoutreturn_call_ops,
		.callback_data = lrp,
	};
	struct rpc_task *task;
	int rc;

	dprintk("--> %s\n", __func__);

	task = rpc_run_task(&task_setup_data);
	if (IS_ERR(task))
		return PTR_ERR(task);

	rc = task->tk_status;
	dprintk("<-- %s status=%d\n", __func__, rc);
	rpc_put_task(task);

	return rc;
}
static int static int
_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
{ {
......
...@@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int); ...@@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int);
1 /* layoutupdate4 layout type */ + \ 1 /* layoutupdate4 layout type */ + \
1 /* NULL filelayout layoutupdate4 payload */) 1 /* NULL filelayout layoutupdate4 payload */)
#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
/*
 * LAYOUTRETURN argument size budget. The literal 8 appears to cover the
 * fixed fields encode_layoutreturn() emits after the opcode: reclaim, layout
 * type, iomode, return type, and the 64-bit offset and length (assuming the
 * *_maxsz macros count 32-bit XDR words -- TODO confirm).
 */
#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
1 /* FIXME: opaque lrf_body always empty at the moment */)
/*
 * LAYOUTRETURN result size budget: the op header, the lrs_present flag and
 * the stateid that decode_layoutreturn() reads when the flag is set.
 */
#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
1 + decode_stateid_maxsz)
#else /* CONFIG_NFS_V4_1 */ #else /* CONFIG_NFS_V4_1 */
#define encode_sequence_maxsz 0 #define encode_sequence_maxsz 0
#define decode_sequence_maxsz 0 #define decode_sequence_maxsz 0
...@@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int); ...@@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int);
decode_putfh_maxsz + \ decode_putfh_maxsz + \
decode_layoutcommit_maxsz + \ decode_layoutcommit_maxsz + \
decode_getattr_maxsz) decode_getattr_maxsz)
/*
 * Size of the whole LAYOUTRETURN compound request: compound header followed
 * by SEQUENCE, PUTFH and LAYOUTRETURN.
 */
#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_layoutreturn_maxsz)
/*
 * Size of the whole LAYOUTRETURN compound reply, in the same order that
 * nfs4_xdr_dec_layoutreturn() decodes it.
 */
#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_layoutreturn_maxsz)
const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
compound_encode_hdr_maxsz + compound_encode_hdr_maxsz +
...@@ -1864,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr, ...@@ -1864,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr,
static int static int
encode_layoutcommit(struct xdr_stream *xdr, encode_layoutcommit(struct xdr_stream *xdr,
struct inode *inode,
const struct nfs4_layoutcommit_args *args, const struct nfs4_layoutcommit_args *args,
struct compound_hdr *hdr) struct compound_hdr *hdr)
{ {
...@@ -1872,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr, ...@@ -1872,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
NFS_SERVER(args->inode)->pnfs_curr_ld->id); NFS_SERVER(args->inode)->pnfs_curr_ld->id);
p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE); p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
*p++ = cpu_to_be32(OP_LAYOUTCOMMIT); *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
/* Only whole file layouts */ /* Only whole file layouts */
p = xdr_encode_hyper(p, 0); /* offset */ p = xdr_encode_hyper(p, 0); /* offset */
...@@ -1883,12 +1895,49 @@ encode_layoutcommit(struct xdr_stream *xdr, ...@@ -1883,12 +1895,49 @@ encode_layoutcommit(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->lastbytewritten); p = xdr_encode_hyper(p, args->lastbytewritten);
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
*p++ = cpu_to_be32(0); /* no file layout payload */
if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
NFS_I(inode)->layout, xdr, args);
else {
p = reserve_space(xdr, 4);
*p = cpu_to_be32(0); /* no layout-type payload */
}
hdr->nops++; hdr->nops++;
hdr->replen += decode_layoutcommit_maxsz; hdr->replen += decode_layoutcommit_maxsz;
return 0; return 0;
} }
/*
 * Encode the LAYOUTRETURN operation into @xdr.
 *
 * Always encodes a RETURN_FILE return with IOMODE_ANY covering offset 0 and
 * length NFS4_MAX_UINT64, followed by the stateid from @args. The
 * layout-type specific body is produced by the layout driver's
 * encode_layoutreturn hook when one is set; otherwise a single zero word is
 * emitted. Bumps hdr->nops and accounts the expected reply size.
 */
static void
encode_layoutreturn(struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args,
struct compound_hdr *hdr)
{
__be32 *p;
/* Five 32-bit words: opcode, reclaim, layout type, iomode, return type */
p = reserve_space(xdr, 20);
*p++ = cpu_to_be32(OP_LAYOUTRETURN);
*p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
*p++ = cpu_to_be32(args->layout_type);
*p++ = cpu_to_be32(IOMODE_ANY);
*p = cpu_to_be32(RETURN_FILE);
/* Two 64-bit values (offset, length) followed by the stateid */
p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
p = xdr_encode_hyper(p, 0);
p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
/* The stateid is copied while holding the inode's i_lock */
spin_lock(&args->inode->i_lock);
xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
spin_unlock(&args->inode->i_lock);
if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
/* Let the layout driver append its own body */
NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
NFS_I(args->inode)->layout, xdr, args);
} else {
/* No driver hook: emit a single zero word as the body */
p = reserve_space(xdr, 4);
*p = cpu_to_be32(0);
}
hdr->nops++;
hdr->replen += decode_layoutreturn_maxsz;
}
#endif /* CONFIG_NFS_V4_1 */ #endif /* CONFIG_NFS_V4_1 */
/* /*
...@@ -2706,10 +2755,12 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, ...@@ -2706,10 +2755,12 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
/* /*
* Encode LAYOUTCOMMIT request * Encode LAYOUTCOMMIT request
*/ */
static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
struct xdr_stream *xdr, struct xdr_stream *xdr,
struct nfs4_layoutcommit_args *args) struct nfs4_layoutcommit_args *args)
{ {
struct nfs4_layoutcommit_data *data =
container_of(args, struct nfs4_layoutcommit_data, args);
struct compound_hdr hdr = { struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args), .minorversion = nfs4_xdr_minorversion(&args->seq_args),
}; };
...@@ -2717,10 +2768,27 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, ...@@ -2717,10 +2768,27 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr); encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr); encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, NFS_FH(args->inode), &hdr); encode_putfh(xdr, NFS_FH(args->inode), &hdr);
encode_layoutcommit(xdr, args, &hdr); encode_layoutcommit(xdr, data->args.inode, args, &hdr);
encode_getfattr(xdr, args->bitmask, &hdr); encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr); encode_nops(&hdr);
return 0; }
/*
* Encode LAYOUTRETURN request
*/
static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
struct xdr_stream *xdr,
struct nfs4_layoutreturn_args *args)
{
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, NFS_FH(args->inode), &hdr);
encode_layoutreturn(xdr, args, &hdr);
encode_nops(&hdr);
} }
#endif /* CONFIG_NFS_V4_1 */ #endif /* CONFIG_NFS_V4_1 */
...@@ -5203,6 +5271,27 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, ...@@ -5203,6 +5271,27 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
return -EIO; return -EIO;
} }
/*
 * Decode the LAYOUTRETURN operation result.
 *
 * Stores the "stateid present" flag in res->lrs_present and, when it is
 * set, decodes the stateid into res->stateid.
 *
 * Returns 0 on success, the decode_op_hdr()/decode_stateid() status on
 * failure, or -EIO when the stream is too short for the flag word.
 */
static int decode_layoutreturn(struct xdr_stream *xdr,
			       struct nfs4_layoutreturn_res *res)
{
	__be32 *word;
	int status = decode_op_hdr(xdr, OP_LAYOUTRETURN);

	if (status)
		return status;

	word = xdr_inline_decode(xdr, 4);
	if (unlikely(!word)) {
		print_overflow_msg(__func__, xdr);
		return -EIO;
	}

	res->lrs_present = be32_to_cpup(word);
	if (!res->lrs_present)
		return 0;

	return decode_stateid(xdr, &res->stateid);
}
static int decode_layoutcommit(struct xdr_stream *xdr, static int decode_layoutcommit(struct xdr_stream *xdr,
struct rpc_rqst *req, struct rpc_rqst *req,
struct nfs4_layoutcommit_res *res) struct nfs4_layoutcommit_res *res)
...@@ -6319,6 +6408,30 @@ static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, ...@@ -6319,6 +6408,30 @@ static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
return status; return status;
} }
/*
 * Decode LAYOUTRETURN response
 *
 * Walks the compound reply in order (header, SEQUENCE, PUTFH,
 * LAYOUTRETURN) and stops at the first step that reports an error.
 */
static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
				     struct xdr_stream *xdr,
				     struct nfs4_layoutreturn_res *res)
{
	struct compound_hdr hdr;
	int status = decode_compound_hdr(xdr, &hdr);

	if (!status)
		status = decode_sequence(xdr, &res->seq_res, rqstp);
	if (!status)
		status = decode_putfh(xdr);
	if (!status)
		status = decode_layoutreturn(xdr, res);

	return status;
}
/* /*
* Decode LAYOUTCOMMIT response * Decode LAYOUTCOMMIT response
*/ */
...@@ -6547,6 +6660,7 @@ struct rpc_procinfo nfs4_procedures[] = { ...@@ -6547,6 +6660,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
PROC(LAYOUTGET, enc_layoutget, dec_layoutget), PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
#endif /* CONFIG_NFS_V4_1 */ #endif /* CONFIG_NFS_V4_1 */
}; };
......
#
# Makefile for the pNFS Objects Layout Driver kernel module
#
# Object files linked together into objlayoutdriver.o
objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
# Build objlayoutdriver according to the CONFIG_PNFS_OBJLAYOUT setting
obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
/*
* pNFS Objects layout implementation over open-osd initiator library
*
* Copyright (C) 2009 Panasas Inc. [year of first publication]
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
* Boaz Harrosh <bharrosh@panasas.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* See the file COPYING included with this distribution for more details.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the Panasas company nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/module.h>
#include <scsi/osd_initiator.h>
#include "objlayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
 * Cast an expression to unsigned long long (used for "%llx" printing).
 * The argument is parenthesized so the cast applies to the whole
 * expression, not just its first operand.
 */
#define _LLU(x) ((unsigned long long)(x))
/*
 * Number of struct bio_vec entries that fit in one page after a struct bio.
 * Used below as the cap on the vector count requested from bio_kmalloc()
 * and in the cached max_io_size computation.
 */
enum { BIO_MAX_PAGES_KMALLOC =
(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
};
/*
 * One cached OSD device: a deviceid-cache node plus the osd_dev handle it
 * stands for. The handle is released with osduld_put_device() when the node
 * is freed (see objio_free_deviceid_node()).
 */
struct objio_dev_ent {
struct nfs4_deviceid_node id_node; /* embedded node; container_of() recovers the entry */
struct osd_dev *od; /* OSD device handle owned by this entry */
};
/*
 * Free a device entry given its embedded deviceid node: release the OSD
 * device handle, then free the entry itself.
 */
static void
objio_free_deviceid_node(struct nfs4_deviceid_node *d)
{
	struct objio_dev_ent *entry =
		container_of(d, struct objio_dev_ent, id_node);
	struct osd_dev *od = entry->od;

	dprintk("%s: free od=%p\n", __func__, od);
	osduld_put_device(od);
	kfree(entry);
}
/*
 * Look up @d_id in the deviceid cache for this server's layout driver and
 * client. Returns the matching device entry, or NULL when
 * nfs4_find_get_deviceid() finds nothing.
 */
static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
					    const struct nfs4_deviceid *d_id)
{
	struct nfs4_deviceid_node *node;

	node = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client,
				      d_id);
	if (node)
		return container_of(node, struct objio_dev_ent, id_node);

	return NULL;
}
/*
 * Create a device entry for (@d_id, @od) and insert it into the deviceid
 * cache.
 *
 * If the insert hands back a different node (another entry for the same id
 * is already cached), the freshly built entry is freed -- which also puts
 * @od -- and the cached one is used instead. The returned entry has its
 * node reference count incremented.
 *
 * Returns NULL only when the entry cannot be allocated; @od is left
 * untouched in that case.
 */
static struct objio_dev_ent *
_dev_list_add(const struct nfs_server *nfss,
	      const struct nfs4_deviceid *d_id, struct osd_dev *od,
	      gfp_t gfp_flags)
{
	struct nfs4_deviceid_node *inserted;
	struct objio_dev_ent *fresh = kzalloc(sizeof(*fresh), gfp_flags);
	struct objio_dev_ent *winner;

	if (!fresh) {
		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
		return NULL;
	}

	dprintk("%s: Adding od=%p\n", __func__, od);
	nfs4_init_deviceid_node(&fresh->id_node,
				nfss->pnfs_curr_ld,
				nfss->nfs_client,
				d_id);
	fresh->od = od;

	inserted = nfs4_insert_deviceid_node(&fresh->id_node);
	winner = container_of(inserted, struct objio_dev_ent, id_node);
	if (winner != fresh) {
		dprintk("%s: Race with other n->od=%p\n", __func__, winner->od);
		objio_free_deviceid_node(&fresh->id_node);
	}

	atomic_inc(&winner->id_node.ref);
	return winner;
}
/*
 * Per-component backing storage for the capability key and credential that
 * copy_single_comp() copies out of a decoded component.
 */
struct caps_buffers {
u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
u8 creds[OSD_CAP_LEN];
};
/*
 * Driver-private layout segment. objio_alloc_lseg() allocates it in one
 * chunk together with the ods[] array, the comps[] array and one
 * caps_buffers per component.
 */
struct objio_segment {
struct pnfs_layout_segment lseg; /* generic segment; see OBJIO_LSEG() */
struct pnfs_osd_object_cred *comps; /* num_comps entries, stored right after ods[] */
unsigned mirrors_p1; /* odm_mirror_cnt + 1 */
unsigned stripe_unit;
unsigned group_width; /* Data stripe_units without integrity comps */
u64 group_depth; /* set to -1 when the layout has no group_width */
unsigned group_count;
unsigned max_io_size; /* cached in objio_alloc_lseg() */
unsigned comps_index; /* copied from olo_comps_index */
unsigned num_comps; /* copied from olo_num_comps */
/* variable length */
struct objio_dev_ent *ods[];
};
/* Map a generic layout segment back to the objio_segment that embeds it. */
static inline struct objio_segment *
OBJIO_LSEG(struct pnfs_layout_segment *lseg)
{
	struct objio_segment *objio_seg;

	objio_seg = container_of(lseg, struct objio_segment, lseg);
	return objio_seg;
}
struct objio_state;
/* Completion handler invoked once all requests of an I/O have finished. */
typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
/*
 * State of one read or write. Allocated by objio_alloc_io_state() with
 * room for num_comps per_dev[] slots followed by the ioerrs array.
 */
struct objio_state {
/* Generic layer */
struct objlayout_io_state ol_state;
struct objio_segment *layout;
struct kref kref; /* counts in-flight requests; see _io_exec() */
objio_done_fn done; /* called from _last_io() */
void *private; /* holds the completion in sync mode */
unsigned long length; /* bytes prepared so far (_prepare_one_group) */
unsigned numdevs; /* Actually used devs in this IO */
/* A per-device variable array of size numdevs */
struct _objio_per_comp {
struct bio *bio;
struct osd_request *or;
unsigned long length;
u64 offset;
unsigned dev;
} per_dev[];
};
/*
 * Resolve component @comp of @objio_seg to a cached device entry.
 *
 * The deviceid cache is consulted first. On a miss, the device info is
 * fetched with objlayout_get_deviceinfo(), the OSD device is looked up
 * through the osd_initiator library, and the result is added to the cache.
 *
 * Returns the device entry on success or an ERR_PTR() on failure:
 *   -EINVAL  the reported systemid does not fit in osd_dev_info
 *   -ENODEV  neither an osdname nor a systemid was provided
 *   -ENOMEM  the cache entry could not be allocated
 *   or the error from objlayout_get_deviceinfo()/osduld_info_lookup().
 * NULL is never returned.
 */
static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
					    struct objio_segment *objio_seg,
					    unsigned comp,
					    gfp_t gfp_flags)
{
	struct pnfs_osd_deviceaddr *deviceaddr;
	struct nfs4_deviceid *d_id;
	struct objio_dev_ent *ode;
	struct osd_dev *od;
	struct osd_dev_info odi;
	int err;

	d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;

	ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
	if (ode)
		return ode;

	err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
	if (unlikely(err)) {
		dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
			__func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
		return ERR_PTR(err);
	}

	odi.systemid_len = deviceaddr->oda_systemid.len;
	if (odi.systemid_len > sizeof(odi.systemid)) {
		err = -EINVAL;
		goto out;
	} else if (odi.systemid_len)
		memcpy(odi.systemid, deviceaddr->oda_systemid.data,
		       odi.systemid_len);
	odi.osdname_len = deviceaddr->oda_osdname.len;
	odi.osdname = (u8 *)deviceaddr->oda_osdname.data;

	if (!odi.osdname_len && !odi.systemid_len) {
		dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
			__func__);
		err = -ENODEV;
		goto out;
	}

	od = osduld_info_lookup(&odi);
	if (unlikely(IS_ERR(od))) {
		err = PTR_ERR(od);
		dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
		goto out;
	}

	ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
			    gfp_flags);
	if (unlikely(!ode)) {
		/*
		 * _dev_list_add() returns NULL only when its allocation
		 * fails, before it has taken over @od. Drop our device
		 * reference and report the failure instead of returning a
		 * NULL "success" that callers would later dereference.
		 */
		osduld_put_device(od);
		err = -ENOMEM;
	}

out:
	dprintk("%s: return=%d\n", __func__, err);
	objlayout_put_deviceinfo(deviceaddr);
	return err ? ERR_PTR(err) : ode;
}
/*
 * Resolve every component of @objio_seg to a device entry and store it in
 * objio_seg->ods[]. Stops at the first failing lookup; entries stored
 * before the failure are left in place.
 *
 * Returns 0 on success or the error of the failing lookup.
 */
static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
				struct objio_segment *objio_seg,
				gfp_t gfp_flags)
{
	unsigned comp;
	int err = 0;

	/* lookup all devices */
	for (comp = 0; comp < objio_seg->num_comps; comp++) {
		struct objio_dev_ent *ode =
			_device_lookup(pnfslay, objio_seg, comp, gfp_flags);

		if (unlikely(IS_ERR(ode))) {
			err = PTR_ERR(ode);
			break;
		}
		objio_seg->ods[comp] = ode;
	}

	dprintk("%s: return=%d\n", __func__, err);
	return err;
}
/*
 * Validate the data map of a decoded layout before it is used for striping.
 *
 * Besides the feature checks (RAID_0 only, stripe length below 32 bits,
 * stripe unit a multiple of PAGE_SIZE), this rejects maps whose values
 * would later be used as a zero divisor by _calc_stripe_info(): a zero
 * stripe unit, a zero group width, a grouped layout with zero group depth,
 * or a group width larger than the components available per mirror set
 * (which makes group_count zero in objio_alloc_lseg()).
 *
 * Returns 0 when the map is usable, -ENOTSUPP for unsupported layouts and
 * -EINVAL for inconsistent ones.
 */
static int _verify_data_map(struct pnfs_osd_layout *layout)
{
	struct pnfs_osd_data_map *data_map = &layout->olo_map;
	u64 stripe_length;
	u32 group_width;
	u32 comps_per_copy;

/* FIXME: Only raid0 for now. if not go through MDS */
	if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
		printk(KERN_ERR "Only RAID_0 for now\n");
		return -ENOTSUPP;
	}
	if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
		printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
		       data_map->odm_num_comps, data_map->odm_mirror_cnt);
		return -EINVAL;
	}

	comps_per_copy = data_map->odm_num_comps /
			 (data_map->odm_mirror_cnt + 1);

	if (data_map->odm_group_width) {
		group_width = data_map->odm_group_width;
		if (!data_map->odm_group_depth) {
			printk(KERN_ERR "Data Map wrong, group_depth == 0"
			       " with group_width=%u\n", group_width);
			return -EINVAL;
		}
		if (comps_per_copy < group_width) {
			printk(KERN_ERR "Data Map wrong, num_comps=%u"
			       " mirrors=%u < group_width=%u\n",
			       data_map->odm_num_comps,
			       data_map->odm_mirror_cnt, group_width);
			return -EINVAL;
		}
	} else {
		group_width = comps_per_copy;
	}

	if (!group_width) {
		printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u"
		       " leaves no data components\n",
		       data_map->odm_num_comps, data_map->odm_mirror_cnt);
		return -EINVAL;
	}
	if (!data_map->odm_stripe_unit) {
		printk(KERN_ERR "Data Map wrong, Stripe Unit is zero\n");
		return -EINVAL;
	}

	stripe_length = (u64)data_map->odm_stripe_unit * group_width;
	if (stripe_length >= (1ULL << 32)) {
		printk(KERN_ERR "Total Stripe length(0x%llx)"
		       " >= 32bit is not supported\n", _LLU(stripe_length));
		return -ENOTSUPP;
	}

	if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
		printk(KERN_ERR "Stripe Unit(0x%llx)"
		       " must be Multples of PAGE_SIZE(0x%lx)\n",
		       _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
		return -ENOTSUPP;
	}

	return 0;
}
/*
 * Copy one decoded component into the segment's own storage.
 *
 * @cur_comp receives a copy of @src_comp whose capability-key and
 * credential pointers are redirected to the buffers in @caps_p, so the copy
 * no longer references the decode source.
 *
 * Only cred_len bytes are read from each source buffer (capped at the size
 * of the destination); any remainder of the destination is zero-filled.
 * Oversized credentials still trigger the WARN_ON()s and are truncated.
 */
static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
			     struct pnfs_osd_object_cred *src_comp,
			     struct caps_buffers *caps_p)
{
	size_t key_len = src_comp->oc_cap_key.cred_len;
	size_t cap_len = src_comp->oc_cap.cred_len;

	WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
	WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));

	/* Never read past the source's stated length nor write past ours */
	if (key_len > sizeof(caps_p->caps_key))
		key_len = sizeof(caps_p->caps_key);
	if (cap_len > sizeof(caps_p->creds))
		cap_len = sizeof(caps_p->creds);

	*cur_comp = *src_comp;

	memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, key_len);
	memset(caps_p->caps_key + key_len, 0,
	       sizeof(caps_p->caps_key) - key_len);
	cur_comp->oc_cap_key.cred = caps_p->caps_key;

	memcpy(caps_p->creds, src_comp->oc_cap.cred, cap_len);
	memset(caps_p->creds + cap_len, 0, sizeof(caps_p->creds) - cap_len);
	cur_comp->oc_cap.cred = caps_p->creds;
}
/*
 * Decode a layout from @xdr and build the driver-private segment for it.
 *
 * The segment, its ods[] array, the component array and the per-component
 * capability buffers live in a single allocation. After the components are
 * decoded and copied, every component's device is looked up and the
 * striping parameters are cached in the segment.
 *
 * On success *outp points at the embedded generic segment and 0 is
 * returned. On failure a negative errno is returned; failures after the
 * allocation also drop any device references already taken, free the
 * segment and set *outp to NULL.
 */
int objio_alloc_lseg(struct pnfs_layout_segment **outp,
	struct pnfs_layout_hdr *pnfslay,
	struct pnfs_layout_range *range,
	struct xdr_stream *xdr,
	gfp_t gfp_flags)
{
	struct objio_segment *objio_seg;
	struct pnfs_osd_xdr_decode_layout_iter iter;
	struct pnfs_osd_layout layout;
	struct pnfs_osd_object_cred *cur_comp, src_comp;
	struct caps_buffers *caps_p;
	unsigned i;
	int err;

	err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
	if (unlikely(err))
		return err;

	err = _verify_data_map(&layout);
	if (unlikely(err))
		return err;

	objio_seg = kzalloc(sizeof(*objio_seg) +
			sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
			sizeof(*objio_seg->comps) * layout.olo_num_comps +
			sizeof(struct caps_buffers) * layout.olo_num_comps,
			gfp_flags);
	if (!objio_seg)
		return -ENOMEM;

	/* Carve the trailing arrays out of the single allocation */
	objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
	cur_comp = objio_seg->comps;
	caps_p = (void *)(cur_comp + layout.olo_num_comps);
	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
		copy_single_comp(cur_comp++, &src_comp, caps_p++);
	if (unlikely(err))
		goto out_free;

	objio_seg->num_comps = layout.olo_num_comps;
	objio_seg->comps_index = layout.olo_comps_index;
	err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
	if (err)
		goto out_free;

	objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
	objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
	if (layout.olo_map.odm_group_width) {
		objio_seg->group_width = layout.olo_map.odm_group_width;
		objio_seg->group_depth = layout.olo_map.odm_group_depth;
		objio_seg->group_count = layout.olo_map.odm_num_comps /
						objio_seg->mirrors_p1 /
						objio_seg->group_width;
	} else {
		objio_seg->group_width = layout.olo_map.odm_num_comps /
						objio_seg->mirrors_p1;
		objio_seg->group_depth = -1;
		objio_seg->group_count = 1;
	}

	/* Cache this calculation it will hit for every page */
	objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
				  objio_seg->stripe_unit) *
				 objio_seg->group_width;

	*outp = &objio_seg->lseg;
	return 0;

out_free:
	/*
	 * A failed device lookup may have stored (and referenced) some
	 * devices already; release them before freeing the segment. When
	 * the failure happened earlier, num_comps is still zero from
	 * kzalloc() and this loop does nothing.
	 */
	for (i = 0; i < objio_seg->num_comps; i++) {
		if (objio_seg->ods[i])
			nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
	}
	kfree(objio_seg);
	dprintk("%s: Error: return %d\n", __func__, err);
	*outp = NULL;
	return err;
}
/*
 * Release a segment built by objio_alloc_lseg(): put the deviceid node of
 * each device up to the first empty ods[] slot, then free the segment.
 */
void objio_free_lseg(struct pnfs_layout_segment *lseg)
{
	struct objio_segment *seg = OBJIO_LSEG(lseg);
	int idx = 0;

	while (idx < seg->num_comps && seg->ods[idx]) {
		nfs4_put_deviceid_node(&seg->ods[idx]->id_node);
		idx++;
	}

	kfree(seg);
}
/*
 * Allocate the I/O state for one read or write on @lseg.
 *
 * A single zeroed allocation holds the objio_state with one per_dev[] slot
 * per component, followed by the ioerrs array of the generic state.
 *
 * Returns 0 and stores the generic state in *outp, or -ENOMEM.
 */
int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
			 struct objlayout_io_state **outp,
			 gfp_t gfp_flags)
{
	struct objio_segment *seg = OBJIO_LSEG(lseg);
	struct objio_state *ios;
	/* objio_state plus its per-device array */
	const unsigned first_size = sizeof(*ios) +
				seg->num_comps * sizeof(ios->per_dev[0]);
	/* the per-component I/O error array that follows it */
	const unsigned sec_size = seg->num_comps *
				sizeof(ios->ol_state.ioerrs[0]);

	ios = kzalloc(first_size + sec_size, gfp_flags);
	if (unlikely(!ios))
		return -ENOMEM;

	ios->ol_state.num_comps = seg->num_comps;
	ios->ol_state.ioerrs = ((void *)ios) + first_size;
	ios->layout = seg;

	*outp = &ios->ol_state;
	return 0;
}
/* Free an I/O state obtained from objio_alloc_io_state(). */
void objio_free_io_state(struct objlayout_io_state *ol_state)
{
	struct objio_state *state;

	state = container_of(ol_state, struct objio_state, ol_state);
	kfree(state);
}
enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
{
switch (oep) {
case OSD_ERR_PRI_NO_ERROR:
return (enum pnfs_osd_errno)0;
case OSD_ERR_PRI_CLEAR_PAGES:
BUG_ON(1);
return 0;
case OSD_ERR_PRI_RESOURCE:
return PNFS_OSD_ERR_RESOURCE;
case OSD_ERR_PRI_BAD_CRED:
return PNFS_OSD_ERR_BAD_CRED;
case OSD_ERR_PRI_NO_ACCESS:
return PNFS_OSD_ERR_NO_ACCESS;
case OSD_ERR_PRI_UNREACHABLE:
return PNFS_OSD_ERR_UNREACHABLE;
case OSD_ERR_PRI_NOT_FOUND:
return PNFS_OSD_ERR_NOT_FOUND;
case OSD_ERR_PRI_NO_SPACE:
return PNFS_OSD_ERR_NO_SPACE;
default:
WARN_ON(1);
/* fallthrough */
case OSD_ERR_PRI_EIO:
return PNFS_OSD_ERR_EIO;
}
}
/*
 * Zero the data area of every segment of @bio: whole pages are cleared in
 * one go, partial segments are zeroed from their offset for their length.
 */
static void _clear_bio(struct bio *bio)
{
	struct bio_vec *vec;
	unsigned idx;

	__bio_for_each_segment(vec, bio, idx, 0) {
		unsigned seg_len = vec->bv_len;

		if (likely(seg_len == PAGE_SIZE)) {
			clear_highpage(vec->bv_page);
		} else {
			zero_user(vec->bv_page, vec->bv_offset, seg_len);
		}
	}
}
/*
 * Inspect the sense data of every request issued for @ios.
 *
 * Requests whose sense decodes to zero are skipped. A read that reports
 * OSD_ERR_PRI_CLEAR_PAGES is treated as recovered: its bio is zeroed and no
 * error is recorded (the same condition on a write is a BUG). Every other
 * failure is recorded with objlayout_io_set_result().
 *
 * Returns 0 when no error was recorded, otherwise the osd_req_decode_sense()
 * value of the recorded error with the highest osd_err_pri (the later one
 * wins on ties).
 */
static int _io_check(struct objio_state *ios, bool is_write)
{
enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
int lin_ret = 0;
int i;
for (i = 0; i < ios->numdevs; i++) {
struct osd_sense_info osi;
struct osd_request *or = ios->per_dev[i].or;
unsigned dev;
int ret;
/* Slot without a request: nothing was issued for it */
if (!or)
continue;
ret = osd_req_decode_sense(or, &osi);
if (likely(!ret))
continue;
if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
/* start read offset passed endof file */
BUG_ON(is_write);
_clear_bio(ios->per_dev[i].bio);
dprintk("%s: start read offset passed end of file "
"offset=0x%llx, length=0x%lx\n", __func__,
_LLU(ios->per_dev[i].offset),
ios->per_dev[i].length);
continue; /* we recovered */
}
/* Record the failure against the component this slot targeted */
dev = ios->per_dev[i].dev;
objlayout_io_set_result(&ios->ol_state, dev,
&ios->layout->comps[dev].oc_object_id,
osd_pri_2_pnfs_err(osi.osd_err_pri),
ios->per_dev[i].offset,
ios->per_dev[i].length,
is_write);
/* Remember the return code of the highest-priority error seen */
if (osi.osd_err_pri >= oep) {
oep = osi.osd_err_pri;
lin_ret = ret;
}
}
return lin_ret;
}
/*
 * Common IO state helpers.
 */

/*
 * Release the OSD request and the bio of every used per-device slot and
 * reset the pointers, so calling this again on the same state is harmless.
 */
static void _io_free(struct objio_state *ios)
{
	unsigned idx;

	for (idx = 0; idx < ios->numdevs; idx++) {
		struct _objio_per_comp *slot = ios->per_dev + idx;

		if (slot->or) {
			osd_end_request(slot->or);
			slot->or = NULL;
		}

		if (slot->bio) {
			bio_put(slot->bio);
			slot->bio = NULL;
		}
	}
}
/*
 * Return the OSD device for component index @dev of this I/O's layout.
 * @dev must lie in [comps_index, comps_index + num_comps); anything else
 * is a BUG.
 */
struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
{
	unsigned first = ios->layout->comps_index;
	unsigned end = first + ios->layout->num_comps;

	BUG_ON(!(first <= dev && dev < end));

	return ios->layout->ods[dev - first]->od;
}
/* Result of _calc_stripe_info() for one file offset. */
struct _striping_info {
u64 obj_offset; /* offset inside the component object */
u64 group_length; /* T - H as computed in _calc_stripe_info() */
unsigned dev; /* component index, already scaled by mirrors_p1 */
unsigned unit_off; /* file offset modulo stripe_unit */
};
/*
 * Translate @file_offset into striping coordinates, filling @si.
 *
 * With L = file_offset:
 *   U = stripe_unit * group_width
 *   T = U * group_depth
 *   S = T * group_count
 *   M = L / S
 *   G = (L - M * S) / T,  H = (L - M * S) % T
 *   N = H / U
 *
 * NOTE(review): S, T and U are used as divisors, so the layout must never
 * yield zero for any of them -- confirm the data map validation covers it.
 */
static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
struct _striping_info *si)
{
u32 stripe_unit = ios->layout->stripe_unit;
u32 group_width = ios->layout->group_width;
u64 group_depth = ios->layout->group_depth;
u32 U = stripe_unit * group_width;
u64 T = U * group_depth;
u64 S = T * ios->layout->group_count;
u64 M = div64_u64(file_offset, S);
/*
G = (L - (M * S)) / T
H = (L - (M * S)) % T
*/
/* Despite its name this holds L - M * S, i.e. the remainder modulo S */
u64 LmodU = file_offset - M * S;
u32 G = div64_u64(LmodU, T);
u64 H = LmodU - G * T;
u32 N = div_u64(H, U);
/* Byte offset inside the stripe unit */
div_u64_rem(file_offset, stripe_unit, &si->unit_off);
si->obj_offset = si->unit_off + (N * stripe_unit) +
(M * group_depth * stripe_unit);
/* "H - (N * U)" is just "H % U" so it's bound to u32 */
si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
/* Scale by mirrors_p1 to get the component index */
si->dev *= ios->layout->mirrors_p1;
si->group_length = T - H;
}
/*
 * Append @cur_len bytes of the I/O's page array to @per_dev's bio.
 *
 * Pages are taken from ios->ol_state.pages starting at index *cur_pg; the
 * first page is used from offset @pgbase, later pages from offset 0. The
 * bio is allocated on first use.
 *
 * per_dev->length is increased by @cur_len before anything can fail.
 * *cur_pg is advanced only on success.
 *
 * Returns 0, or -ENOMEM when the bio cannot be allocated or a page cannot
 * be added in full.
 */
static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
gfp_t gfp_flags)
{
unsigned pg = *cur_pg;
struct request_queue *q =
osd_request_queue(_io_od(ios, per_dev->dev));
per_dev->length += cur_len;
if (per_dev->bio == NULL) {
/*
 * Estimate how many pages this device will carry for the whole I/O,
 * capped at what a single kmalloc'ed bio can hold.
 */
unsigned stripes = ios->layout->num_comps /
ios->layout->mirrors_p1;
unsigned pages_in_stripe = stripes *
(ios->layout->stripe_unit / PAGE_SIZE);
unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
stripes;
if (BIO_MAX_PAGES_KMALLOC < bio_size)
bio_size = BIO_MAX_PAGES_KMALLOC;
per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
if (unlikely(!per_dev->bio)) {
dprintk("Faild to allocate BIO size=%u\n", bio_size);
return -ENOMEM;
}
}
while (cur_len > 0) {
unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
unsigned added_len;
BUG_ON(ios->ol_state.nr_pages <= pg);
cur_len -= pglen;
added_len = bio_add_pc_page(q, per_dev->bio,
ios->ol_state.pages[pg], pglen, pgbase);
/* A short add means the bio could not take the page */
if (unlikely(pglen != added_len))
return -ENOMEM;
/* Only the first page starts at a non-zero offset */
pgbase = 0;
++pg;
}
BUG_ON(cur_len);
*cur_pg = pg;
return 0;
}
/*
 * Distribute @length bytes, starting at the position described by @si,
 * over the devices of one group.
 *
 * Each iteration hands at most one stripe unit to a device, then moves on
 * by mirrors_p1 and wraps inside the group's device range. The first time
 * a per_dev[] slot is used its component index and starting object offset
 * are recorded; the offset depends on whether the device comes before, at,
 * or after si->dev.
 *
 * ios->numdevs and *last_pg are updated on both the success and the error
 * path. ios->length grows by every chunk that was added successfully.
 *
 * Returns 0 or the error from _add_stripe_unit().
 */
static int _prepare_one_group(struct objio_state *ios, u64 length,
struct _striping_info *si, unsigned *last_pg,
gfp_t gfp_flags)
{
unsigned stripe_unit = ios->layout->stripe_unit;
unsigned mirrors_p1 = ios->layout->mirrors_p1;
unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
unsigned dev = si->dev;
unsigned first_dev = dev - (dev % devs_in_group);
/* Highest slot used so far, carried over from earlier groups */
unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
unsigned cur_pg = *last_pg;
int ret = 0;
while (length) {
struct _objio_per_comp *per_dev = &ios->per_dev[dev];
unsigned cur_len, page_off = 0;
if (!per_dev->length) {
/* First use of this slot: fix its device and start offset */
per_dev->dev = dev;
if (dev < si->dev) {
per_dev->offset = si->obj_offset + stripe_unit -
si->unit_off;
cur_len = stripe_unit;
} else if (dev == si->dev) {
per_dev->offset = si->obj_offset;
cur_len = stripe_unit - si->unit_off;
page_off = si->unit_off & ~PAGE_MASK;
BUG_ON(page_off &&
(page_off != ios->ol_state.pgbase));
} else { /* dev > si->dev */
per_dev->offset = si->obj_offset - si->unit_off;
cur_len = stripe_unit;
}
if (max_comp < dev)
max_comp = dev;
} else {
cur_len = stripe_unit;
}
/* Never add more than what is left of this group's share */
if (cur_len >= length)
cur_len = length;
ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
cur_len, gfp_flags);
if (unlikely(ret))
goto out;
/* Next device of the group, wrapping around at its end */
dev += mirrors_p1;
dev = (dev % devs_in_group) + first_dev;
length -= cur_len;
ios->length += cur_len;
}
out:
ios->numdevs = max_comp + mirrors_p1;
*last_pg = cur_pg;
return ret;
}
/*
 * Map the byte range of @ios (ol_state.offset/count) onto the per-device
 * bios, one group at a time.
 *
 * Preparation stops at the first failing group. The failure is reported
 * only when nothing at all was prepared (ios->length == 0); otherwise 0 is
 * returned and the I/O proceeds with what was prepared.
 */
static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
{
	u64 remaining = ios->ol_state.count;
	u64 pos = ios->ol_state.offset;
	struct _striping_info si;
	unsigned last_pg = 0;
	int ret = 0;

	while (remaining) {
		_calc_stripe_info(ios, pos, &si);

		/* Clamp the group's share to what is left of the request */
		if (remaining < si.group_length)
			si.group_length = remaining;

		ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
		if (unlikely(ret))
			break;

		pos += si.group_length;
		remaining -= si.group_length;
	}

	return ios->length ? 0 : ret;
}
/*
 * Done handler used in sync mode: signals the completion stored in
 * ios->private. Always returns 0.
 */
static ssize_t _sync_done(struct objio_state *ios)
{
	complete(ios->private);

	return 0;
}
/* kref release function: runs the I/O's done handler. */
static void _last_io(struct kref *kref)
{
	struct objio_state *state =
		container_of(kref, struct objio_state, kref);

	(*state->done)(state);
}
/*
 * Per-request completion callback passed to osd_execute_request_async():
 * drops the reference _io_exec() took for this request. @p is the
 * objio_state.
 */
static void _done_io(struct osd_request *or, void *p)
{
	struct objio_state *state = p;

	kref_put(&state->kref, _last_io);
}
/*
 * Submit every prepared OSD request of @ios.
 *
 * The kref starts with one reference held by this function and gains one
 * per submitted request; _done_io() drops a request's reference and the
 * final kref_put() below drops ours, with _last_io() as the release
 * function that calls ios->done.
 *
 * In sync mode ios->done is temporarily replaced by _sync_done(), this
 * function waits on an on-stack completion and then calls the original
 * done handler itself, returning its result. Otherwise 0 is returned.
 */
static ssize_t _io_exec(struct objio_state *ios)
{
DECLARE_COMPLETION_ONSTACK(wait);
ssize_t status = 0; /* sync status */
unsigned i;
objio_done_fn saved_done_fn = ios->done;
bool sync = ios->ol_state.sync;
if (sync) {
ios->done = _sync_done;
ios->private = &wait;
}
kref_init(&ios->kref);
for (i = 0; i < ios->numdevs; i++) {
struct osd_request *or = ios->per_dev[i].or;
if (!or)
continue;
/* One reference per in-flight request, dropped in _done_io() */
kref_get(&ios->kref);
osd_execute_request_async(or, _done_io, ios);
}
/* Drop the initial reference taken by kref_init() */
kref_put(&ios->kref, _last_io);
if (sync) {
wait_for_completion(&wait);
status = saved_done_fn(ios);
}
return status;
}
/*
 * read
 */

/*
 * Done handler for reads: checks the per-device results, releases the
 * requests and bios, and reports the outcome through objlayout_read_done().
 *
 * Returns the number of bytes prepared for the I/O on success, or the
 * error reported by _io_check().
 */
static ssize_t _read_done(struct objio_state *ios)
{
	int err = _io_check(ios, false);
	ssize_t status;

	_io_free(ios);

	status = err;
	if (likely(!err))
		status = ios->length;

	objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
	return status;
}
/*
 * Build the OSD read request for per-device slot @cur_comp.
 *
 * Only the slot itself is read; no further mirror slots are touched. The
 * request is stored in the slot even when finalizing it fails, so it is
 * released later by _io_free().
 *
 * Returns 0, -ENOMEM when no request can be started, or the error from
 * osd_finalize_request().
 */
static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
{
struct osd_request *or = NULL;
struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
unsigned dev = per_dev->dev;
struct pnfs_osd_object_cred *cred =
&ios->layout->comps[dev];
struct osd_obj_id obj = {
.partition = cred->oc_object_id.oid_partition_id,
.id = cred->oc_object_id.oid_object_id,
};
int ret;
or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
if (unlikely(!or)) {
ret = -ENOMEM;
goto err;
}
per_dev->or = or;
osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
if (ret) {
dprintk("%s: Faild to osd_finalize_request() => %d\n",
__func__, ret);
goto err;
}
dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
per_dev->length);
/* Reached on success as well, with ret == 0 */
err:
return ret;
}
/*
 * Build a read request for the first slot of every mirror set that has
 * data, then execute them. If building any request fails, everything
 * prepared so far is released and the error is returned.
 */
static ssize_t _read_exec(struct objio_state *ios)
{
	unsigned comp;

	for (comp = 0; comp < ios->numdevs;
	     comp += ios->layout->mirrors_p1) {
		int ret;

		if (!ios->per_dev[comp].length)
			continue;

		ret = _read_mirrors(ios, comp);
		if (unlikely(ret)) {
			_io_free(ios);
			return ret;
		}
	}

	ios->done = _read_done;
	return _io_exec(ios); /* In sync mode exec returns the io status */
}
/*
 * Read entry point: map the request's pages onto the devices and, unless
 * that failed outright, execute the read.
 */
ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
{
	struct objio_state *ios =
		container_of(ol_state, struct objio_state, ol_state);
	int ret = _io_rw_pagelist(ios, GFP_KERNEL);

	return unlikely(ret) ? ret : _read_exec(ios);
}
/*
 * write
 */

/*
 * Done handler for writes: checks the per-device results, releases the
 * requests and bios, and reports the outcome through
 * objlayout_write_done(). On success the write is marked NFS_FILE_SYNC.
 *
 * Returns the number of bytes prepared for the I/O on success, or the
 * error reported by _io_check().
 */
static ssize_t _write_done(struct objio_state *ios)
{
	int err = _io_check(ios, true);
	ssize_t status;

	_io_free(ios);

	status = err;
	if (likely(!err)) {
		/* FIXME: should be based on the OSD's persistence model
		 * See OSD2r05 Section 4.13 Data persistence model */
		ios->ol_state.committed = NFS_FILE_SYNC;
		status = ios->length;
	}

	objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
	return status;
}
/*
 * Build the OSD write requests for one mirror set.
 *
 * Starting at slot @cur_comp, mirrors_p1 consecutive slots and component
 * indices are handled. The first (master) slot writes its own bio; every
 * further slot gets a clone of the master's bio and copies its length and
 * offset.
 *
 * Requests and bios are stored in their slots as soon as they exist, so
 * _io_free() can release them after a failure.
 *
 * Returns 0, -ENOMEM when a request or bio cannot be allocated, or the
 * error from osd_finalize_request().
 * NOTE(review): ret is assigned only inside the loop; this relies on
 * mirrors_p1 being at least 1.
 */
static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
{
struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
unsigned dev = ios->per_dev[cur_comp].dev;
unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
int ret;
for (; cur_comp < last_comp; ++cur_comp, ++dev) {
struct osd_request *or = NULL;
struct pnfs_osd_object_cred *cred =
&ios->layout->comps[dev];
struct osd_obj_id obj = {
.partition = cred->oc_object_id.oid_partition_id,
.id = cred->oc_object_id.oid_object_id,
};
struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
struct bio *bio;
or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
if (unlikely(!or)) {
ret = -ENOMEM;
goto err;
}
per_dev->or = or;
if (per_dev != master_dev) {
/* Mirror slot: write the same pages through a cloned bio */
bio = bio_kmalloc(GFP_NOFS,
master_dev->bio->bi_max_vecs);
if (unlikely(!bio)) {
dprintk("Faild to allocate BIO size=%u\n",
master_dev->bio->bi_max_vecs);
ret = -ENOMEM;
goto err;
}
__bio_clone(bio, master_dev->bio);
bio->bi_bdev = NULL;
bio->bi_next = NULL;
per_dev->bio = bio;
per_dev->dev = dev;
per_dev->length = master_dev->length;
per_dev->offset = master_dev->offset;
} else {
/* Master slot: mark its own bio as a write */
bio = master_dev->bio;
bio->bi_rw |= REQ_WRITE;
}
osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
if (ret) {
dprintk("%s: Faild to osd_finalize_request() => %d\n",
__func__, ret);
goto err;
}
dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
per_dev->length);
}
/* Reached on success as well, with ret == 0 from the last iteration */
err:
return ret;
}
/*
 * Prepare a write request for every mirror group that carries data in this
 * I/O, then hand the batch to _io_exec().  If preparing any group fails the
 * I/O resources are released through _io_free() and the error is returned.
 */
static ssize_t _write_exec(struct objio_state *ios)
{
	unsigned comp = 0;

	while (comp < ios->numdevs) {
		/* groups with no bytes in this I/O are simply skipped */
		if (ios->per_dev[comp].length) {
			int err = _write_mirrors(ios, comp);

			if (unlikely(err)) {
				_io_free(ios);
				return err;
			}
		}
		comp += ios->layout->mirrors_p1;
	}

	ios->done = _write_done;
	/* In sync mode exec returns the io->status */
	return _io_exec(ios);
}
/*
 * Write entry point of the I/O engine: map the page list of @ol_state onto
 * the per-device state (allocating with GFP_NOFS) and start the write.
 * @stable is currently not used.  Returns the _io_rw_pagelist() error if
 * that step fails, otherwise the result of _write_exec().
 */
ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
{
	struct objio_state *ios;
	int err;

	ios = container_of(ol_state, struct objio_state, ol_state);

	/* TODO: ios->stable = stable; */
	err = _io_rw_pagelist(ios, GFP_NOFS);
	if (unlikely(err))
		return err;

	return _write_exec(ios);
}
/*
 * pg_test hook: @req may be coalesced after @prev only if the generic pNFS
 * test accepts it and the bytes gathered so far plus the new request do not
 * exceed the layout segment's max_io_size.
 */
static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
			  struct nfs_page *prev, struct nfs_page *req)
{
	bool within_limit;

	if (!pnfs_generic_pg_test(pgio, prev, req))
		return false;

	within_limit = pgio->pg_count + req->wb_bytes <=
			OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
	return within_limit;
}
/*
 * Layout driver descriptor for the LAYOUT_OSD2_OBJECTS layout type.  It is
 * passed to pnfs_register_layoutdriver() and pnfs_unregister_layoutdriver()
 * by the module init/exit functions.  The objlayout_* methods are the
 * generic objects-layout entry points; the objio_* methods belong to the
 * OSD I/O engine.
 */
static struct pnfs_layoutdriver_type objlayout_type = {
.id = LAYOUT_OSD2_OBJECTS,
.name = "LAYOUT_OSD2_OBJECTS",
.flags = PNFS_LAYOUTRET_ON_SETATTR,
/* per-inode layout header and per-range layout segment lifetime */
.alloc_layout_hdr = objlayout_alloc_layout_hdr,
.free_layout_hdr = objlayout_free_layout_hdr,
.alloc_lseg = objlayout_alloc_lseg,
.free_lseg = objlayout_free_lseg,
/* data path */
.read_pagelist = objlayout_read_pagelist,
.write_pagelist = objlayout_write_pagelist,
.pg_test = objio_pg_test,
.free_deviceid_node = objio_free_deviceid_node,
/* layout-type specific payloads of LAYOUTCOMMIT and LAYOUTRETURN */
.encode_layoutcommit = objlayout_encode_layoutcommit,
.encode_layoutreturn = objlayout_encode_layoutreturn,
};
/* Module metadata */
MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
MODULE_LICENSE("GPL");
/*
 * Module load: register the objects layout driver with the pNFS core and
 * log the outcome.  Returns whatever pnfs_register_layoutdriver() returned.
 */
static int __init
objlayout_init(void)
{
	int ret = pnfs_register_layoutdriver(&objlayout_type);

	if (!ret) {
		printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
		       __func__);
		return ret;
	}

	printk(KERN_INFO
	       "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
	       __func__, ret);
	return ret;
}
/*
 * Module unload: unregister the objects layout driver from the pNFS core
 * and log that it is gone.
 */
static void __exit
objlayout_exit(void)
{
pnfs_unregister_layoutdriver(&objlayout_type);
printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
__func__);
}
/* Hook the two functions above into module load and unload. */
module_init(objlayout_init);
module_exit(objlayout_exit);
/*
* pNFS Objects layout driver high level definitions
*
* Copyright (C) 2007 Panasas Inc. [year of first publication]
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
* Boaz Harrosh <bharrosh@panasas.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* See the file COPYING included with this distribution for more details.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the Panasas company nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <scsi/osd_initiator.h>
#include "objlayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
* Create an objlayout layout structure for the given inode and return it.
*/
/*
 * Allocate and initialise the per-inode objects layout.
 *
 * @inode:     inode the layout is created for (not used here)
 * @gfp_flags: allocation flags for the layout structure
 *
 * Returns the embedded pnfs_layout_hdr of the zeroed, initialised layout,
 * or NULL if the allocation failed.  The failure case is returned
 * explicitly rather than as "&objlay->pnfs_layout" of a NULL pointer, which
 * is only NULL while pnfs_layout happens to be the first member.
 */
struct pnfs_layout_hdr *
objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
{
	struct objlayout *objlay;

	objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
	if (unlikely(!objlay)) {
		dprintk("%s: Return %p\n", __func__, objlay);
		return NULL;
	}

	spin_lock_init(&objlay->lock);
	INIT_LIST_HEAD(&objlay->err_list);
	dprintk("%s: Return %p\n", __func__, objlay);
	return &objlay->pnfs_layout;
}
/*
* Free an objlayout layout structure
*/
/*
 * Release the objlayout that embeds @lo.  Warns if I/O error records are
 * still queued on its err_list at this point.
 */
void
objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct objlayout *objlay;

	objlay = OBJLAYOUT(lo);
	dprintk("%s: objlay %p\n", __func__, objlay);

	WARN_ON(!list_empty(&objlay->err_list));
	kfree(objlay);
}
/*
* Unmarshall layout and store it in pnfslay.
*/
/*
 * Decode the layout returned by LAYOUTGET (@lgr) into a new layout segment.
 *
 * An xdr stream is set up over the reply pages, with a temporary scratch
 * page, and handed to the I/O engine's objio_alloc_lseg().  Returns the new
 * segment, or an ERR_PTR() carrying -ENOMEM (no scratch page) or the
 * engine's error.
 */
struct pnfs_layout_segment *
objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
		     struct nfs4_layoutget_res *lgr,
		     gfp_t gfp_flags)
{
	struct pnfs_layout_segment *lseg;
	struct xdr_stream stream;
	struct xdr_buf buf = {
		.pages = lgr->layoutp->pages,
		.page_len = lgr->layoutp->len,
		.buflen = lgr->layoutp->len,
		.len = lgr->layoutp->len,
	};
	struct page *scratch;
	int status;

	dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);

	scratch = alloc_page(gfp_flags);
	if (!scratch) {
		status = -ENOMEM;
		goto fail;
	}

	xdr_init_decode(&stream, &buf, NULL);
	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);

	status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
	if (likely(!status)) {
		/* the scratch page is only needed while decoding */
		__free_page(scratch);
		dprintk("%s: Return %p\n", __func__, lseg);
		return lseg;
	}

	dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
		status);
	__free_page(scratch);
fail:
	dprintk("%s: Err Return=>%d\n", __func__, status);
	return ERR_PTR(status);
}
/*
* Free a layout segment
*/
/*
 * Hand @lseg back to the I/O engine for freeing.  A NULL segment is
 * tolerated and ignored.
 */
void
objlayout_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s: freeing layout segment %p\n", __func__, lseg);

	if (likely(lseg))
		objio_free_lseg(lseg);
}
/*
* I/O Operations
*/
/*
 * First offset past the range [start, start + len).  If the sum wraps
 * around 64 bits the result saturates at NFS4_MAX_UINT64.
 */
static inline u64
end_offset(u64 start, u64 len)
{
	u64 end = start + len;

	if (end < start)	/* wrapped */
		return NFS4_MAX_UINT64;
	return end;
}
/* last octet in a range */
/*
 * Offset of the last byte of the non-empty range [start, start + len).
 * A zero @len is a bug.  If the sum does not exceed @start (wrap-around)
 * the result saturates at NFS4_MAX_UINT64.
 */
static inline u64
last_byte_offset(u64 start, u64 len)
{
	u64 end;

	BUG_ON(!len);

	end = start + len;
	if (end <= start)
		return NFS4_MAX_UINT64;
	return end - 1;
}
/*
 * Allocate an io_state through the I/O engine and describe the request in
 * it.  @count is clipped so the I/O does not run past the end of @lseg, and
 * whole pages covered by @pgbase are skipped in @pages.  An @offset outside
 * the segment is a bug.  Returns NULL when the engine cannot allocate the
 * state.
 */
static struct objlayout_io_state *
objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
			struct page **pages,
			unsigned pgbase,
			loff_t offset,
			size_t count,
			struct pnfs_layout_segment *lseg,
			void *rpcdata,
			gfp_t gfp_flags)
{
	struct objlayout_io_state *ios;
	u64 lseg_end;

	dprintk("%s: allocating io_state\n", __func__);
	if (objio_alloc_io_state(lseg, &ios, gfp_flags))
		return NULL;

	/* the request has to start inside the layout segment */
	BUG_ON(offset < lseg->pls_range.offset);
	lseg_end = end_offset(lseg->pls_range.offset,
			      lseg->pls_range.length);
	BUG_ON(offset >= lseg_end);

	if (offset + count > lseg_end) {
		count = lseg->pls_range.length -
				(offset - lseg->pls_range.offset);
		dprintk("%s: truncated count %Zd\n", __func__, count);
	}

	if (pgbase > PAGE_SIZE) {
		pages += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;
	}

	ios->lseg = lseg;
	ios->rpcdata = rpcdata;
	ios->offset = offset;
	ios->count = count;
	ios->sync = 0;
	ios->pages = pages;
	ios->pgbase = pgbase;
	ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
	INIT_LIST_HEAD(&ios->err_list);

	return ios;
}
/*
 * Hand @state back to the I/O engine for freeing.  A NULL state is
 * tolerated and ignored.
 */
static void
objlayout_free_io_state(struct objlayout_io_state *state)
{
	dprintk("%s: freeing io_state\n", __func__);

	if (likely(state))
		objio_free_io_state(state);
}
/*
* I/O done common code
*/
/*
 * Completion bookkeeping shared by reads and writes.
 *
 * A successful I/O (status >= 0) has nothing to report, so its state is
 * released at once.  A failed I/O keeps its state: it is queued on the
 * per-inode err_list, where objlayout_encode_layoutreturn() later reports
 * and frees it, and the pending delta-space-used figure is marked invalid.
 *
 * The caller must not touch @state after this returns.
 */
static void
objlayout_iodone(struct objlayout_io_state *state)
{
	struct objlayout *objlay;

	dprintk("%s: state %p status\n", __func__, state);

	if (likely(state->status >= 0)) {
		objlayout_free_io_state(state);
		return;
	}

	objlay = OBJLAYOUT(state->lseg->pls_layout);

	spin_lock(&objlay->lock);
	objlay->delta_space_valid = OBJ_DSU_INVALID;
	/*
	 * list_add(new, head): the state is the new entry and the layout's
	 * err_list is the head.  With the arguments swapped the head itself
	 * gets re-linked onto the state, so every failure after the first
	 * silently drops (and leaks) the states queued before it.
	 */
	list_add(&state->err_list, &objlay->err_list);
	spin_unlock(&objlay->lock);
}
/*
* objlayout_io_set_result - Set an osd_error code on a specific osd comp.
*
* The @index component IO failed (error returned from target). Register
* the error for later reporting at layout-return.
*/
/*
 * Record the outcome of the I/O on component @index of @state.
 *
 * A non-zero @osd_error fills the component's error descriptor with the
 * object id, the failed range, the direction and the error code.  A zero
 * @osd_error just clears the descriptor's errno.  @index must be below
 * state->num_comps.
 */
void
objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
			struct pnfs_osd_objid *pooid, int osd_error,
			u64 offset, u64 length, bool is_write)
{
	struct pnfs_osd_ioerr *ioerr;

	BUG_ON(index >= state->num_comps);
	ioerr = &state->ioerrs[index];

	if (!osd_error) {
		/* User need not call if no error is reported */
		ioerr->oer_errno = 0;
		return;
	}

	ioerr->oer_component = *pooid;
	ioerr->oer_comp_offset = offset;
	ioerr->oer_comp_length = length;
	ioerr->oer_iswrite = is_write;
	ioerr->oer_errno = osd_error;

	dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
		"par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
		__func__, index, ioerr->oer_errno,
		ioerr->oer_iswrite,
		_DEVID_LO(&ioerr->oer_component.oid_device_id),
		_DEVID_HI(&ioerr->oer_component.oid_device_id),
		ioerr->oer_component.oid_partition_id,
		ioerr->oer_component.oid_object_id,
		ioerr->oer_comp_offset,
		ioerr->oer_comp_length);
}
/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
* This is because the osd completion is called with ints-off from
* the block layer
*/
/*
 * Work item body: recover the nfs_read_data that embeds @work (through its
 * rpc_task) and complete the read with pnfs_ld_read_done().
 */
static void _rpc_read_complete(struct work_struct *work)
{
	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
	struct nfs_read_data *rdata =
		container_of(task, struct nfs_read_data, task);

	dprintk("%s enter\n", __func__);
	pnfs_ld_read_done(rdata);
}
/*
 * Called by the I/O engine when a read finishes.  Stores @status in the
 * state and in the rpc task, fills in count/eof on success, runs the common
 * completion code and finally completes the read: directly when @sync is
 * set, otherwise from a scheduled work item.
 */
void
objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
{
	struct nfs_read_data *rdata = state->rpcdata;
	/* sampled now: state must not be used once objlayout_iodone() ran */
	int eof = state->eof;

	state->status = status;
	dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);

	rdata->task.tk_status = status;
	if (status >= 0) {
		rdata->res.count = status;
		rdata->res.eof = eof;
	}

	objlayout_iodone(state);
	/* must not use state after this point */

	if (sync) {
		pnfs_ld_read_done(rdata);
		return;
	}

	INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
	schedule_work(&rdata->task.u.tk_work);
}
/*
* Perform sync or async reads.
*/
/*
 * read_pagelist method of the layout driver.
 *
 * The request is clipped to the current inode size.  A read that starts at
 * or beyond EOF is answered immediately with count 0 and eof set.
 * Otherwise an io_state is allocated and the I/O engine is started.  The
 * outcome is stored in rdata->pnfs_error; the function itself always
 * returns PNFS_ATTEMPTED.
 */
enum pnfs_try_status
objlayout_read_pagelist(struct nfs_read_data *rdata)
{
	loff_t offset = rdata->args.offset;
	size_t count = rdata->args.count;
	struct objlayout_io_state *state;
	ssize_t status = 0;
	loff_t eof;

	dprintk("%s: Begin inode %p offset %llu count %d\n",
		__func__, rdata->inode, offset, (int)count);

	eof = i_size_read(rdata->inode);
	if (unlikely(offset + count > eof)) {
		if (offset >= eof) {
			/* nothing to read; status is still 0 here */
			rdata->res.eof = 1;
			rdata->res.count = 0;
			goto out;
		}
		count = eof - offset;
	}

	state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
					 rdata->args.pages, rdata->args.pgbase,
					 offset, count,
					 rdata->lseg, rdata,
					 GFP_KERNEL);
	if (likely(state)) {
		state->eof = state->offset + state->count >= eof;
		status = objio_read_pagelist(state);
	} else {
		status = -ENOMEM;
	}
out:
	dprintk("%s: Return status %Zd\n", __func__, status);
	rdata->pnfs_error = status;
	return PNFS_ATTEMPTED;
}
/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
* This is because the osd completion is called with ints-off from
* the block layer
*/
/*
 * Work item body: recover the nfs_write_data that embeds @work (through its
 * rpc_task) and complete the write with pnfs_ld_write_done().
 */
static void _rpc_write_complete(struct work_struct *work)
{
	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
	struct nfs_write_data *wdata =
		container_of(task, struct nfs_write_data, task);

	dprintk("%s enter\n", __func__);
	pnfs_ld_write_done(wdata);
}
/*
 * Called by the I/O engine when a write finishes.  Stores @status in the
 * state and in the rpc task, fills in count and the commit level on
 * success, runs the common completion code and finally completes the
 * write: directly when @sync is set, otherwise from a scheduled work item.
 */
void
objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
		     bool sync)
{
	struct nfs_write_data *wdata = state->rpcdata;

	dprintk("%s: Begin\n", __func__);

	state->status = status;
	wdata->task.tk_status = status;
	if (status < 0) {
		dprintk("%s: Return status %d\n",
			__func__, wdata->task.tk_status);
	} else {
		wdata->res.count = status;
		wdata->verf.committed = state->committed;
		dprintk("%s: Return status %d committed %d\n",
			__func__, wdata->task.tk_status,
			wdata->verf.committed);
	}

	objlayout_iodone(state);
	/* must not use state after this point */

	if (sync) {
		pnfs_ld_write_done(wdata);
		return;
	}

	INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
	schedule_work(&wdata->task.u.tk_work);
}
/*
* Perform sync or async writes.
*/
/*
 * write_pagelist method of the layout driver.
 *
 * Allocates an io_state for the request and starts the I/O engine.
 * FLUSH_SYNC in @how selects synchronous completion and FLUSH_STABLE is
 * passed on as the engine's "stable" argument.  The outcome is stored in
 * wdata->pnfs_error; the function itself always returns PNFS_ATTEMPTED.
 */
enum pnfs_try_status
objlayout_write_pagelist(struct nfs_write_data *wdata,
			 int how)
{
	struct objlayout_io_state *state;
	ssize_t status = -ENOMEM;

	dprintk("%s: Begin inode %p offset %llu count %u\n",
		__func__, wdata->inode, wdata->args.offset, wdata->args.count);

	state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
					 wdata->args.pages,
					 wdata->args.pgbase,
					 wdata->args.offset,
					 wdata->args.count,
					 wdata->lseg, wdata,
					 GFP_NOFS);
	if (likely(state)) {
		state->sync = how & FLUSH_SYNC;
		status = objio_write_pagelist(state, how & FLUSH_STABLE);
	}

	dprintk("%s: Return status %Zd\n", __func__, status);
	wdata->pnfs_error = status;
	return PNFS_ATTEMPTED;
}
/*
 * Encode the objects-layout specific part of LAYOUTCOMMIT into @xdr.
 *
 * Under the layout lock the accumulated delta-space-used figure is
 * snapshotted and reset (state back to OBJ_DSU_INIT) and the "I/O errors
 * pending" flag is sampled from err_list.  The update is then encoded
 * behind a 4-byte length word, which is back-patched once the payload size
 * is known.
 *
 * @args is not used.  Running out of xdr space is treated as a bug, both
 * for the length word (checked the same way as in
 * objlayout_encode_layoutreturn()) and for the payload.
 */
void
objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
			      struct xdr_stream *xdr,
			      const struct nfs4_layoutcommit_args *args)
{
	struct objlayout *objlay = OBJLAYOUT(pnfslay);
	struct pnfs_osd_layoutupdate lou;
	__be32 *start;

	dprintk("%s: Begin\n", __func__);

	spin_lock(&objlay->lock);
	lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
	lou.dsu_delta = objlay->delta_space_used;
	objlay->delta_space_used = 0;
	objlay->delta_space_valid = OBJ_DSU_INIT;
	lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
	spin_unlock(&objlay->lock);

	/* placeholder for the opaque length, filled in below */
	start = xdr_reserve_space(xdr, 4);
	BUG_ON(!start);

	BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));

	/* payload size in bytes: words written after the length word */
	*start = cpu_to_be32((xdr->p - start - 1) * 4);

	dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
		lou.dsu_delta, lou.olu_ioerr_flag);
}
/*
 * Map a pNFS-OSD error code to its reporting priority (OSD_ERR_PRI_*).
 * No error maps to 0.  An unknown code triggers a warning and is ranked
 * like PNFS_OSD_ERR_EIO.
 */
static int
err_prio(u32 oer_errno)
{
	int prio;

	switch (oer_errno) {
	case 0:
		prio = 0;
		break;
	case PNFS_OSD_ERR_RESOURCE:
		prio = OSD_ERR_PRI_RESOURCE;
		break;
	case PNFS_OSD_ERR_BAD_CRED:
		prio = OSD_ERR_PRI_BAD_CRED;
		break;
	case PNFS_OSD_ERR_NO_ACCESS:
		prio = OSD_ERR_PRI_NO_ACCESS;
		break;
	case PNFS_OSD_ERR_UNREACHABLE:
		prio = OSD_ERR_PRI_UNREACHABLE;
		break;
	case PNFS_OSD_ERR_NOT_FOUND:
		prio = OSD_ERR_PRI_NOT_FOUND;
		break;
	case PNFS_OSD_ERR_NO_SPACE:
		prio = OSD_ERR_PRI_NO_SPACE;
		break;
	case PNFS_OSD_ERR_EIO:
		prio = OSD_ERR_PRI_EIO;
		break;
	default:
		/* unexpected code: complain, then treat it as EIO */
		WARN_ON(1);
		prio = OSD_ERR_PRI_EIO;
		break;
	}

	return prio;
}
/*
 * Fold @src_err into the accumulated error @dest_err.
 *
 * - An empty accumulator (oer_errno == 0) becomes a copy of @src_err with
 *   the device id blanked.
 * - Partition and object ids are kept only while all merged errors agree
 *   on them; otherwise they are zeroed.
 * - The byte range becomes the union of both ranges.
 * - A write error takes precedence over a read error; between two errors
 *   of the same direction the higher err_prio() wins.
 */
static void
merge_ioerr(struct pnfs_osd_ioerr *dest_err,
	    const struct pnfs_osd_ioerr *src_err)
{
	u64 dest_end, src_end;

	if (!dest_err->oer_errno) {
		*dest_err = *src_err;
		/* accumulated device must be blank */
		memset(&dest_err->oer_component.oid_device_id, 0,
			sizeof(dest_err->oer_component.oid_device_id));
		return;
	}

	if (dest_err->oer_component.oid_partition_id !=
				src_err->oer_component.oid_partition_id)
		dest_err->oer_component.oid_partition_id = 0;

	if (dest_err->oer_component.oid_object_id !=
				src_err->oer_component.oid_object_id)
		dest_err->oer_component.oid_object_id = 0;

	/*
	 * Both end points are taken before the start offset is adjusted.
	 * Computing dest_end from an already lowered offset would shift the
	 * old range down and lose its tail from the union.
	 */
	dest_end = end_offset(dest_err->oer_comp_offset,
			      dest_err->oer_comp_length);
	src_end = end_offset(src_err->oer_comp_offset,
			     src_err->oer_comp_length);

	if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
		dest_err->oer_comp_offset = src_err->oer_comp_offset;
	if (dest_end < src_end)
		dest_end = src_end;

	dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;

	if (src_err->oer_iswrite == dest_err->oer_iswrite) {
		/* same direction: keep the more severe error */
		if (err_prio(src_err->oer_errno) >
		    err_prio(dest_err->oer_errno))
			dest_err->oer_errno = src_err->oer_errno;
	} else if (src_err->oer_iswrite) {
		/* a write error replaces an accumulated read error */
		dest_err->oer_iswrite = true;
		dest_err->oer_errno = src_err->oer_errno;
	}
}
/*
 * Merge every error descriptor still queued on objlay->err_list into one
 * accumulated descriptor, logging each one, and encode the result at @p.
 * All queued io_states are unlinked and freed along the way.
 */
static void
encode_accumulated_error(struct objlayout *objlay, __be32 *p)
{
	struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
	struct objlayout_io_state *state, *next;

	list_for_each_entry_safe(state, next, &objlay->err_list, err_list) {
		unsigned comp;

		for (comp = 0; comp < state->num_comps; comp++) {
			struct pnfs_osd_ioerr *ioerr = &state->ioerrs[comp];

			if (ioerr->oer_errno) {
				printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
					"dev(%llx:%llx) par=0x%llx obj=0x%llx "
					"offset=0x%llx length=0x%llx\n",
					__func__, comp, ioerr->oer_errno,
					ioerr->oer_iswrite,
					_DEVID_LO(&ioerr->oer_component.oid_device_id),
					_DEVID_HI(&ioerr->oer_component.oid_device_id),
					ioerr->oer_component.oid_partition_id,
					ioerr->oer_component.oid_object_id,
					ioerr->oer_comp_offset,
					ioerr->oer_comp_length);

				merge_ioerr(&accumulated_err, ioerr);
			}
		}

		list_del(&state->err_list);
		objlayout_free_io_state(state);
	}

	pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
}
/*
 * Encode the objects-layout specific part of LAYOUTRETURN into @xdr: the
 * I/O error descriptors collected on the layout's err_list.
 *
 * A 4-byte length word is reserved first and back-patched at the end.  With
 * the layout lock held, every queued io_state has each of its non-zero
 * error descriptors encoded and is then unlinked and freed.  If xdr space
 * runs out, the last slot reserved for the current io_state is overwritten
 * with the accumulated (merged) error of everything still on the list, see
 * encode_accumulated_error(), which also frees all remaining io_states.
 *
 * @args is not used.
 */
void
objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args)
{
struct objlayout *objlay = OBJLAYOUT(pnfslay);
struct objlayout_io_state *state, *tmp;
__be32 *start;
dprintk("%s: Begin\n", __func__);
/* placeholder for the opaque length, filled in at loop_done */
start = xdr_reserve_space(xdr, 4);
BUG_ON(!start);
spin_lock(&objlay->lock);
list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
/* NOTE(review): last_xdr starts at NULL for every io_state, so running
 * out of space on the first descriptor of a later io_state trips the
 * BUG_ON() below even though earlier slots were reserved - confirm
 * this is intended.
 */
__be32 *last_xdr = NULL, *p;
unsigned i;
int res = 0;
for (i = 0; i < state->num_comps; i++) {
struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
/* components without an error are not reported */
if (!ioerr->oer_errno)
continue;
dprintk("%s: err[%d]: errno=%d is_write=%d "
"dev(%llx:%llx) par=0x%llx obj=0x%llx "
"offset=0x%llx length=0x%llx\n",
__func__, i, ioerr->oer_errno,
ioerr->oer_iswrite,
_DEVID_LO(&ioerr->oer_component.oid_device_id),
_DEVID_HI(&ioerr->oer_component.oid_device_id),
ioerr->oer_component.oid_partition_id,
ioerr->oer_component.oid_object_id,
ioerr->oer_comp_offset,
ioerr->oer_comp_length);
p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
if (unlikely(!p)) {
res = -E2BIG;
break; /* accumulated_error */
}
/* remember the most recent slot in case it has to be reused */
last_xdr = p;
pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
}
/* TODO: use xdr_write_pages */
if (unlikely(res)) {
/* no space for even one error descriptor */
BUG_ON(!last_xdr);
/* we've encountered a situation with lots and lots of
* errors and no space to encode them all. Use the last
* available slot to report the union of all the
* remaining errors.
*/
encode_accumulated_error(objlay, last_xdr);
/* the whole list has been consumed; stop iterating it */
goto loop_done;
}
list_del(&state->err_list);
objlayout_free_io_state(state);
}
loop_done:
spin_unlock(&objlay->lock);
/* payload size in bytes: words written after the length word */
*start = cpu_to_be32((xdr->p - start - 1) * 4);
dprintk("%s: Return\n", __func__);
}
/*
* Get Device Info API for io engines
*/
/*
 * Wrapper allocated by objlayout_get_deviceinfo().  Callers only see the
 * embedded @da; objlayout_put_deviceinfo() gets back to the wrapper with
 * container_of().
 */
struct objlayout_deviceinfo {
struct page *page; /* page passed to nfs4_proc_getdeviceinfo(); freed on put */
struct pnfs_osd_deviceaddr da; /* This must be last */
};
/* Initialize and call nfs_getdeviceinfo, then decode and return a
* "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
* should be called.
*/
/*
 * Fetch and decode the device address for @d_id.
 *
 * @pnfslay:    layout whose inode selects the NFS server to query
 * @d_id:       device id to look up
 * @deviceaddr: on success, set to the decoded device address
 * @gfp_flags:  allocation flags for the reply page and the wrapper
 *
 * One page is allocated as the reply buffer for nfs4_proc_getdeviceinfo()
 * and decoded into a newly allocated objlayout_deviceinfo.  The page is
 * kept together with the decoded address until objlayout_put_deviceinfo()
 * is called on *@deviceaddr.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error from
 * nfs4_proc_getdeviceinfo().  On failure nothing is left allocated and
 * *@deviceaddr is not written.
 */
int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
	gfp_t gfp_flags)
{
	struct objlayout_deviceinfo *odi;
	struct pnfs_device pd;
	struct page *page;
	int err;

	page = alloc_page(gfp_flags);
	if (!page)
		return -ENOMEM;

	memcpy(&pd.dev_id, d_id, sizeof(*d_id));
	pd.layout_type = LAYOUT_OSD2_OBJECTS;
	pd.pages = &page;
	pd.pgbase = 0;
	pd.pglen = PAGE_SIZE;
	pd.mincount = 0;

	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
	if (err)
		goto err_out;

	odi = kzalloc(sizeof(*odi), gfp_flags);
	if (!odi) {
		err = -ENOMEM;
		goto err_out;
	}

	pnfs_osd_xdr_decode_deviceaddr(&odi->da, page_address(page));
	/* the page travels with odi and is freed by objlayout_put_deviceinfo() */
	odi->page = page;
	*deviceaddr = &odi->da;
	return 0;

err_out:
	__free_page(page);
	return err;
}
/*
 * Release a device address obtained from objlayout_get_deviceinfo(): free
 * the page stored with it and then the enclosing wrapper.
 */
void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
{
	struct objlayout_deviceinfo *odi;

	odi = container_of(deviceaddr, struct objlayout_deviceinfo, da);

	__free_page(odi->page);
	kfree(odi);
}
/*
* Data types and function declarations for interfacing with the
* pNFS standard object layout driver.
*
* Copyright (C) 2007 Panasas Inc. [year of first publication]
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
* Boaz Harrosh <bharrosh@panasas.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* See the file COPYING included with this distribution for more details.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the Panasas company nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _OBJLAYOUT_H
#define _OBJLAYOUT_H
#include <linux/nfs_fs.h>
#include <linux/pnfs_osd_xdr.h>
#include "../pnfs.h"
/*
* per-inode layout
*/
struct objlayout {
/* generic pNFS layout header; OBJLAYOUT() maps it back to this struct */
struct pnfs_layout_hdr pnfs_layout;
/* for layout_commit */
enum osd_delta_space_valid_enum {
OBJ_DSU_INIT = 0, /* nothing accumulated since the last layoutcommit */
OBJ_DSU_VALID, /* delta_space_used holds a reportable value */
OBJ_DSU_INVALID, /* an I/O failed; the value must not be reported */
} delta_space_valid;
s64 delta_space_used; /* consumed by write ops */
/* for layout_return */
spinlock_t lock; /* taken around delta_space_* and err_list accesses */
struct list_head err_list; /* failed io_states awaiting layoutreturn */
};
/* Map a generic pNFS layout header back to the objlayout that embeds it. */
static inline struct objlayout *
OBJLAYOUT(struct pnfs_layout_hdr *lo)
{
	struct objlayout *objlay;

	objlay = container_of(lo, struct objlayout, pnfs_layout);
	return objlay;
}
/*
* per-I/O operation state
* embedded in objects provider io_state data structure
*/
struct objlayout_io_state {
struct pnfs_layout_segment *lseg; /* layout segment the I/O runs against */
struct page **pages; /* first page of the I/O */
unsigned pgbase; /* byte offset of the data within the first page */
unsigned nr_pages; /* pages spanned by pgbase + count */
unsigned long count; /* bytes to transfer */
loff_t offset; /* file offset of the I/O */
bool sync; /* complete in the caller's context, not via a work item */
void *rpcdata; /* struct nfs_read_data or struct nfs_write_data */
int status; /* res */
int eof; /* res */
int committed; /* res */
/* Error reporting (layout_return) */
struct list_head err_list;
unsigned num_comps;
/* Pointer to array of error descriptors of size num_comps.
* It should contain as many entries as devices in the osd_layout
* that participate in the I/O. It is up to the io_engine to allocate
* needed space and set num_comps.
*/
struct pnfs_osd_ioerr *ioerrs;
};
/*
* Raid engine I/O API
*/
/* Decode a layout from @xdr into a new segment returned through @outp.
 * A non-zero return value is treated as an error code by the caller.
 */
extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
struct pnfs_layout_hdr *pnfslay,
struct pnfs_layout_range *range,
struct xdr_stream *xdr,
gfp_t gfp_flags);
extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
/* Allocate the engine's I/O state for @lseg and return the embedded
 * objlayout_io_state through @outp; non-zero means failure.
 */
extern int objio_alloc_io_state(
struct pnfs_layout_segment *lseg,
struct objlayout_io_state **outp,
gfp_t gfp_flags);
extern void objio_free_io_state(struct objlayout_io_state *state);
/* Start the read or write described by @ol_state. */
extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
bool stable);
/*
* callback API
*/
/* Record the result of the I/O on component @index; a non-zero @osd_error
 * is kept for reporting at layout-return.
 */
extern void objlayout_io_set_result(struct objlayout_io_state *state,
unsigned index, struct pnfs_osd_objid *pooid,
int osd_error, u64 offset, u64 length, bool is_write);
/*
 * Add @space_used to the layout's running delta-space-used total and mark
 * the total valid, unless it has already been invalidated.
 */
static inline void
objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
{
	struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);

	spin_lock(&objlay->lock);

	/* If one of the I/Os errored out and the delta_space_used was
	 * invalid we render the complete report as invalid. Protocol mandate
	 * the DSU be accurate or not reported.
	 */
	if (objlay->delta_space_valid == OBJ_DSU_INVALID)
		goto unlock;

	objlay->delta_space_used += space_used;
	objlay->delta_space_valid = OBJ_DSU_VALID;
unlock:
	spin_unlock(&objlay->lock);
}
/* I/O completion: called with the final @status of a read or write.  @sync
 * selects completion in the caller's context instead of a work item.
 */
extern void objlayout_read_done(struct objlayout_io_state *state,
ssize_t status, bool sync);
extern void objlayout_write_done(struct objlayout_io_state *state,
ssize_t status, bool sync);
/* Fetch and decode the device address for @d_id; every successful get must
 * be paired with objlayout_put_deviceinfo().
 */
extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
gfp_t gfp_flags);
extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
/*
* exported generic objects function vectors
*/
/* layout header and layout segment lifetime */
extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
extern struct pnfs_layout_segment *objlayout_alloc_lseg(
struct pnfs_layout_hdr *,
struct nfs4_layoutget_res *,
gfp_t gfp_flags);
extern void objlayout_free_lseg(struct pnfs_layout_segment *);
/* data path */
extern enum pnfs_try_status objlayout_read_pagelist(
struct nfs_read_data *);
extern enum pnfs_try_status objlayout_write_pagelist(
struct nfs_write_data *,
int how);
/* layout-type specific payloads of LAYOUTCOMMIT and LAYOUTRETURN */
extern void objlayout_encode_layoutcommit(
struct pnfs_layout_hdr *,
struct xdr_stream *,
const struct nfs4_layoutcommit_args *);
extern void objlayout_encode_layoutreturn(
struct pnfs_layout_hdr *,
struct xdr_stream *,
const struct nfs4_layoutreturn_args *);
#endif /* _OBJLAYOUT_H */
/*
* Object-Based pNFS Layout XDR layer
*
* Copyright (C) 2007 Panasas Inc. [year of first publication]
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
* Boaz Harrosh <bharrosh@panasas.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* See the file COPYING included with this distribution for more details.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the Panasas company nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/pnfs_osd_xdr.h>
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
* The following implementation is based on RFC5664
*/
/*
* struct pnfs_osd_objid {
* struct nfs4_deviceid oid_device_id;
* u64 oid_partition_id;
* u64 oid_object_id;
* }; // xdr size 32 bytes
*/
/*
 * Decode an object id (device id, partition id, object id) starting at @p.
 * Returns the position just past the decoded fields.
 */
static __be32 *
_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
{
	__be32 *cur;

	cur = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
				      sizeof(objid->oid_device_id.data));
	cur = xdr_decode_hyper(cur, &objid->oid_partition_id);
	return xdr_decode_hyper(cur, &objid->oid_object_id);
}
/*
* struct pnfs_osd_opaque_cred {
* u32 cred_len;
* void *cred;
* }; // xdr size [variable]
* The return pointers are from the xdr buffer
*/
/*
 * Decode a variable-length opaque credential from @xdr: a 32-bit length
 * followed by that many bytes of data.
 *
 * On success opaque_cred->cred points into the xdr buffer (nothing is
 * copied) and 0 is returned.  Returns -EINVAL if the stream is too short
 * for either the length word or the data.
 */
static int
_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
			    struct xdr_stream *xdr)
{
	__be32 *p;

	/* xdr_inline_decode() takes a byte count: ask for the whole word */
	p = xdr_inline_decode(xdr, 4);
	if (!p)
		return -EINVAL;

	opaque_cred->cred_len = be32_to_cpup(p);

	p = xdr_inline_decode(xdr, opaque_cred->cred_len);
	if (!p)
		return -EINVAL;

	opaque_cred->cred = p;
	return 0;
}
/*
* struct pnfs_osd_object_cred {
* struct pnfs_osd_objid oc_object_id;
* u32 oc_osd_version;
* u32 oc_cap_key_sec;
* struct pnfs_osd_opaque_cred oc_cap_key
* struct pnfs_osd_opaque_cred oc_cap;
* }; // xdr size 32 + 4 + 4 + [variable] + [variable]
*/
/*
 * Decode one object credential from @xdr into @comp: the fixed 40-byte
 * part (object id, osd version, cap-key security) followed by the two
 * variable-length opaque credentials.  Returns 0, -EIO if the fixed part
 * is not available, or the error from decoding an opaque credential.
 */
static int
_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
			    struct xdr_stream *xdr)
{
	__be32 *p;
	int ret;

	p = xdr_inline_decode(xdr, 32 + 4 + 4);
	if (!p)
		return -EIO;

	p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
	comp->oc_osd_version = be32_to_cpup(&p[0]);
	comp->oc_cap_key_sec = be32_to_cpup(&p[1]);

	ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
	if (unlikely(ret))
		return ret;

	return _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
}
/*
* struct pnfs_osd_data_map {
* u32 odm_num_comps;
* u64 odm_stripe_unit;
* u32 odm_group_width;
* u32 odm_group_depth;
* u32 odm_mirror_cnt;
* u32 odm_raid_algorithm;
* }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
*/
/* XDR size in bytes of struct pnfs_osd_data_map. */
static inline int
_osd_data_map_xdr_sz(void)
{
	/* five u32 fields plus the u64 odm_stripe_unit */
	return 5 * 4 + 8;
}
/*
 * Decode a pnfs_osd_data_map starting at @p and log its fields.  Returns
 * the position just past the decoded map.
 */
static __be32 *
_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
{
	data_map->odm_num_comps = be32_to_cpup(p);
	p = xdr_decode_hyper(p + 1, &data_map->odm_stripe_unit);

	/* four consecutive u32 fields follow the stripe unit */
	data_map->odm_group_width = be32_to_cpup(&p[0]);
	data_map->odm_group_depth = be32_to_cpup(&p[1]);
	data_map->odm_mirror_cnt = be32_to_cpup(&p[2]);
	data_map->odm_raid_algorithm = be32_to_cpup(&p[3]);

	dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
		"odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
		__func__,
		data_map->odm_num_comps,
		(unsigned long long)data_map->odm_stripe_unit,
		data_map->odm_group_width,
		data_map->odm_group_depth,
		data_map->odm_mirror_cnt,
		data_map->odm_raid_algorithm);

	return p + 4;
}
/*
 * Decode the fixed head of an objects layout from @xdr: the data map, the
 * index of the first component and the component count.  @iter is reset
 * and primed with the component count for later
 * pnfs_osd_xdr_decode_layout_comp() calls.  Returns 0, or -EINVAL if the
 * stream is too short.
 */
int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
{
	__be32 *p;

	memset(iter, 0, sizeof(*iter));

	p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
	if (unlikely(!p))
		return -EINVAL;

	p = _osd_xdr_decode_data_map(p, &layout->olo_map);
	layout->olo_comps_index = be32_to_cpup(&p[0]);
	layout->olo_num_comps = be32_to_cpup(&p[1]);

	iter->total_comps = layout->olo_num_comps;
	return 0;
}
/*
 * Decode the next layout component from @xdr into @comp.
 *
 * Returns true when a component was decoded.  Returns false when all
 * components announced in @iter have been consumed (*@err is left
 * untouched) or when decoding failed (*@err holds the error), so it can
 * drive a loop directly.
 */
bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
	int *err)
{
	BUG_ON(iter->decoded_comps > iter->total_comps);

	if (iter->decoded_comps == iter->total_comps)
		return false;

	*err = _osd_xdr_decode_object_cred(comp, xdr);
	if (likely(!*err)) {
		dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
			"key_len=%u cap_len=%u\n",
			__func__,
			_DEVID_LO(&comp->oc_object_id.oid_device_id),
			_DEVID_HI(&comp->oc_object_id.oid_device_id),
			comp->oc_object_id.oid_partition_id,
			comp->oc_object_id.oid_object_id,
			comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);

		iter->decoded_comps++;
		return true;
	}

	dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
		"total_comps=%d\n", __func__, *err,
		iter->decoded_comps, iter->total_comps);
	return false; /* stop the loop */
}
/*
* Get Device Information Decoding
*
* Note: since Device Information is currently done synchronously, all
* variable strings fields are left inside the rpc buffer and are only
* pointed to by the pnfs_osd_deviceaddr members. So the read buffer
* should not be freed while the returned information is in use.
*/
/*
 * Read a variable-length opaque: a 32-bit byte count followed by the
 * bytes themselves, padded to a whole number of 32-bit words.
 *
 *	struct nfs4_string {
 *		unsigned int len;
 *		char *data;
 *	}; // size [variable]
 *
 * NOTE: no copy is made, str->data points into the XDR buffer at @p.
 * Returns the position of the word following the padded data.
 */
static __be32 *
__read_u8_opaque(__be32 *p, struct nfs4_string *str)
{
	__be32 *payload = p + 1;

	str->len = be32_to_cpup(p);
	str->data = (char *)payload;

	return payload + XDR_QUADLEN(str->len);
}
/*
 * Decode a target id: a 32-bit type word which, for the two SCSI
 * flavours, is followed by a variable-length device id.
 *
 *	struct pnfs_osd_targetid {
 *		u32			oti_type;
 *		struct nfs4_string	oti_scsi_device_id;
 *	}; // size 4 + [variable]
 *
 * For any other type only the type word is consumed and
 * oti_scsi_device_id is left untouched.
 */
static __be32 *
__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
{
	u32 type = be32_to_cpup(p++);

	targetid->oti_type = type;

	if (type == OBJ_TARGET_SCSI_NAME ||
	    type == OBJ_TARGET_SCSI_DEVICE_ID)
		p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);

	return p;
}
/*
 * Decode a network address: two consecutive variable-length strings,
 * r_netid first and r_addr second.
 *
 *	struct pnfs_osd_net_addr {
 *		struct nfs4_string	r_netid;
 *		struct nfs4_string	r_addr;
 *	};
 *
 * Returns the position following r_addr.
 */
static __be32 *
__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
{
	__be32 *after_netid = __read_u8_opaque(p, &netaddr->r_netid);

	return __read_u8_opaque(after_netid, &netaddr->r_addr);
}
/*
 * Decode an optional target address: a 32-bit "available" word which,
 * when non-zero, is followed by a pnfs_osd_net_addr.
 *
 *	struct pnfs_osd_targetaddr {
 *		u32				ota_available;
 *		struct pnfs_osd_net_addr	ota_netaddr;
 *	};
 *
 * When the address is not available ota_netaddr is left untouched.
 */
static __be32 *
__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
{
	u32 available = be32_to_cpup(p++);

	targetaddr->ota_available = available;
	if (!available)
		return p;

	return __read_net_addr(p, &targetaddr->ota_netaddr);
}
/*
* struct pnfs_osd_deviceaddr {
* struct pnfs_osd_targetid oda_targetid;
* struct pnfs_osd_targetaddr oda_targetaddr;
* u8 oda_lun[8];
* struct nfs4_string oda_systemid;
* struct pnfs_osd_object_cred oda_root_obj_cred;
* struct nfs4_string oda_osdname;
* };
*/
/*
 * Raw-pointer variant of the opaque credential decoder, needed by
 * pnfs_osd_xdr_decode_deviceaddr which has no xdr_stream to work with.
 *
 * Reads the 32-bit length, points opaque_cred->cred at the bytes that
 * follow it inside the buffer (no copy) and returns the position after
 * the word-padded credential.
 */
static __be32 *
__read_opaque_cred(__be32 *p,
		   struct pnfs_osd_opaque_cred *opaque_cred)
{
	__be32 *body = &p[1];

	opaque_cred->cred_len = be32_to_cpu(p[0]);
	opaque_cred->cred = body;

	return body + XDR_QUADLEN(opaque_cred->cred_len);
}
/*
 * Raw-pointer decoder for a pnfs_osd_object_cred: object id, OSD
 * version, capability-key security method, then the capability key and
 * the capability as two opaque credentials.
 *
 * Returns the position following the last credential.
 */
static __be32 *
__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
{
	p = _osd_xdr_decode_objid(p, &comp->oc_object_id);

	/* two fixed 32-bit words precede the variable-length parts */
	comp->oc_osd_version = be32_to_cpup(&p[0]);
	comp->oc_cap_key_sec = be32_to_cpup(&p[1]);

	p = __read_opaque_cred(&p[2], &comp->oc_cap_key);
	return __read_opaque_cred(p, &comp->oc_cap);
}
/*
 * Decode a complete pnfs_osd_deviceaddr from the raw XDR words at @p.
 *
 * Fields are read in wire order: target id, target address, the fixed
 * size LUN, system id, root object credential and finally the OSD name.
 * The string members end up pointing into the buffer at @p (see
 * __read_u8_opaque), so that buffer has to outlive @deviceaddr.
 *
 * Side effect: one zero byte is written into the buffer right after the
 * OSD name.
 */
void pnfs_osd_xdr_decode_deviceaddr(
	struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
{
	struct nfs4_string *osdname = &deviceaddr->oda_osdname;

	p = __read_targetid(p, &deviceaddr->oda_targetid);
	p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
	p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
				    sizeof(deviceaddr->oda_lun));
	p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
	p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
	p = __read_u8_opaque(p, osdname);

	/* libosd likes this terminated in dbg. It's last, so no problems */
	osdname->data[osdname->len] = 0;
}
/*
 * struct pnfs_osd_layoutupdate {
 *	u32	dsu_valid;
 *	s64	dsu_delta;
 *	u32	olu_ioerr_flag;
 * }; xdr size 4 + [8 only if dsu_valid] + 4
 *
 * Encode @lou into @xdr. dsu_delta is only put on the wire when
 * dsu_valid is set, so the amount of space reserved has to follow the
 * same rule: reserving the full 16 bytes unconditionally would leave 8
 * unwritten bytes in the stream whenever dsu_valid is 0.
 *
 * Returns 0 on success, -E2BIG if @xdr has no room left.
 */
int
pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
				 struct pnfs_osd_layoutupdate *lou)
{
	/* dsu_valid word + optional dsu_delta + olu_ioerr_flag word */
	__be32 *p = xdr_reserve_space(xdr,
				      4 + (lou->dsu_valid ? 8 : 0) + 4);

	if (!p)
		return -E2BIG;

	*p++ = cpu_to_be32(lou->dsu_valid);
	if (lou->dsu_valid)
		p = xdr_encode_hyper(p, lou->dsu_delta);
	*p = cpu_to_be32(lou->olu_ioerr_flag);
	return 0;
}
/*
 * Encode an object id at @p: the fixed-size device id followed by the
 * 64-bit partition id and the 64-bit object id.
 *
 *	struct pnfs_osd_objid {
 *		struct nfs4_deviceid	oid_device_id;
 *		u64			oid_partition_id;
 *		u64			oid_object_id;
 *	}; // xdr size 32 bytes
 *
 * The caller provides the space. Returns the position after the
 * encoded id.
 */
static inline __be32 *
pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
{
	__be32 *out;

	out = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
				      sizeof(object_id->oid_device_id.data));
	out = xdr_encode_hyper(out, object_id->oid_partition_id);

	return xdr_encode_hyper(out, object_id->oid_object_id);
}
/*
 * Encode one I/O error record at @p.
 *
 *	struct pnfs_osd_ioerr {
 *		struct pnfs_osd_objid	oer_component;
 *		u64			oer_comp_offset;
 *		u64			oer_comp_length;
 *		u32			oer_iswrite;
 *		u32			oer_errno;
 *	}; // xdr size 32 + 24 bytes
 *
 * @p must point at space large enough for the whole record, such as
 * what pnfs_osd_xdr_ioerr_reserve_space() hands out.
 */
void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
{
	p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
	p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
	p = xdr_encode_hyper(p, ioerr->oer_comp_length);

	/* trailing pair of 32-bit words */
	p[0] = cpu_to_be32(ioerr->oer_iswrite);
	p[1] = cpu_to_be32(ioerr->oer_errno);
}
__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
{
__be32 *p;
p = xdr_reserve_space(xdr, 32 + 24);
if (unlikely(!p))
dprintk("%s: out of xdr space\n", __func__);
return p;
}
...@@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req) ...@@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req)
TASK_UNINTERRUPTIBLE); TASK_UNINTERRUPTIBLE);
} }
/*
 * Default coalescing test: may @req be added to the requests already
 * gathered in @desc?
 *
 * True only when the descriptor's block size is at least one page and
 * the bytes gathered so far plus @req still fit in one block. @prev is
 * not looked at.
 */
static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
{
	/*
	 * FIXME: ideally we should be able to coalesce all requests
	 * that are not block boundary aligned, but currently this
	 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
	 * since nfs_flush_multi and nfs_pagein_multi assume you
	 * can have only one struct nfs_page.
	 */
	return desc->pg_bsize >= PAGE_SIZE &&
	       desc->pg_count + req->wb_bytes <= desc->pg_bsize;
}
/** /**
* nfs_pageio_init - initialise a page io descriptor * nfs_pageio_init - initialise a page io descriptor
* @desc: pointer to descriptor * @desc: pointer to descriptor
...@@ -229,6 +244,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, ...@@ -229,6 +244,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
desc->pg_ioflags = io_flags; desc->pg_ioflags = io_flags;
desc->pg_error = 0; desc->pg_error = 0;
desc->pg_lseg = NULL; desc->pg_lseg = NULL;
desc->pg_test = nfs_generic_pg_test;
pnfs_pageio_init(desc, inode);
} }
/** /**
...@@ -242,29 +259,23 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, ...@@ -242,29 +259,23 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
* *
* Return 'true' if this is the case, else return 'false'. * Return 'true' if this is the case, else return 'false'.
*/ */
static int nfs_can_coalesce_requests(struct nfs_page *prev, static bool nfs_can_coalesce_requests(struct nfs_page *prev,
struct nfs_page *req, struct nfs_page *req,
struct nfs_pageio_descriptor *pgio) struct nfs_pageio_descriptor *pgio)
{ {
if (req->wb_context->cred != prev->wb_context->cred) if (req->wb_context->cred != prev->wb_context->cred)
return 0; return false;
if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
return 0; return false;
if (req->wb_context->state != prev->wb_context->state) if (req->wb_context->state != prev->wb_context->state)
return 0; return false;
if (req->wb_index != (prev->wb_index + 1)) if (req->wb_index != (prev->wb_index + 1))
return 0; return false;
if (req->wb_pgbase != 0) if (req->wb_pgbase != 0)
return 0; return false;
if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
return 0; return false;
/* return pgio->pg_test(pgio, prev, req);
* Non-whole file layouts need to check that req is inside of
* pgio->pg_lseg.
*/
if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
return 0;
return 1;
} }
/** /**
...@@ -278,31 +289,18 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev, ...@@ -278,31 +289,18 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req) struct nfs_page *req)
{ {
size_t newlen = req->wb_bytes;
if (desc->pg_count != 0) { if (desc->pg_count != 0) {
struct nfs_page *prev; struct nfs_page *prev;
/*
* FIXME: ideally we should be able to coalesce all requests
* that are not block boundary aligned, but currently this
* is problematic for the case of bsize < PAGE_CACHE_SIZE,
* since nfs_flush_multi and nfs_pagein_multi assume you
* can have only one struct nfs_page.
*/
if (desc->pg_bsize < PAGE_SIZE)
return 0;
newlen += desc->pg_count;
if (newlen > desc->pg_bsize)
return 0;
prev = nfs_list_entry(desc->pg_list.prev); prev = nfs_list_entry(desc->pg_list.prev);
if (!nfs_can_coalesce_requests(prev, req, desc)) if (!nfs_can_coalesce_requests(prev, req, desc))
return 0; return 0;
} else } else {
desc->pg_base = req->wb_pgbase; desc->pg_base = req->wb_pgbase;
}
nfs_list_remove_request(req); nfs_list_remove_request(req);
nfs_list_add_request(req, &desc->pg_list); nfs_list_add_request(req, &desc->pg_list);
desc->pg_count = newlen; desc->pg_count += req->wb_bytes;
return 1; return 1;
} }
......
...@@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo) ...@@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
atomic_inc(&lo->plh_refcount); atomic_inc(&lo->plh_refcount);
} }
/*
 * Allocate a layout header for @ino.
 *
 * The inode's current layout driver gets to allocate it when it
 * provides an alloc_layout_hdr method; otherwise a zeroed bare
 * struct pnfs_layout_hdr is allocated with @gfp_flags.
 */
static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;

	if (ld->alloc_layout_hdr)
		return ld->alloc_layout_hdr(ino, gfp_flags);

	return kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
}
/*
 * Free a layout header obtained from pnfs_alloc_layout_hdr().
 *
 * The decision mirrors the allocation side on purpose: if the layout
 * driver has an alloc_layout_hdr method, the header came from the driver
 * and goes back through its free_layout_hdr method; otherwise it was a
 * plain kzalloc() and is released with kfree().
 *
 * Written as if/else rather than "return cond ? f() : g();" because a
 * void function may not return an expression in ISO C.
 */
static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;

	if (ld->alloc_layout_hdr)
		ld->free_layout_hdr(lo);
	else
		kfree(lo);
}
static void static void
destroy_layout_hdr(struct pnfs_layout_hdr *lo) destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{ {
dprintk("%s: freeing layout cache %p\n", __func__, lo); dprintk("%s: freeing layout cache %p\n", __func__, lo);
BUG_ON(!list_empty(&lo->plh_layouts)); BUG_ON(!list_empty(&lo->plh_layouts));
NFS_I(lo->plh_inode)->layout = NULL; NFS_I(lo->plh_inode)->layout = NULL;
kfree(lo); pnfs_free_layout_hdr(lo);
} }
static void static void
...@@ -228,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg) ...@@ -228,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
{ {
struct inode *inode = lseg->pls_layout->plh_inode; struct inode *inode = lseg->pls_layout->plh_inode;
BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
list_del_init(&lseg->pls_list); list_del_init(&lseg->pls_list);
if (list_empty(&lseg->pls_layout->plh_segs)) { if (list_empty(&lseg->pls_layout->plh_segs)) {
set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
...@@ -261,11 +276,72 @@ put_lseg(struct pnfs_layout_segment *lseg) ...@@ -261,11 +276,72 @@ put_lseg(struct pnfs_layout_segment *lseg)
} }
EXPORT_SYMBOL_GPL(put_lseg); EXPORT_SYMBOL_GPL(put_lseg);
/*
 * Exclusive end of the range [start, start + len).
 * If the sum wraps around, NFS4_MAX_UINT64 is returned instead.
 */
static inline u64
end_offset(u64 start, u64 len)
{
	u64 end = start + len;

	if (end < start)	/* wrapped */
		return NFS4_MAX_UINT64;
	return end;
}
/*
 * Offset of the last octet in the range [start, start + len).
 * @len must not be zero. If start + len does not end up above @start
 * (wrap-around), NFS4_MAX_UINT64 is returned.
 */
static inline u64
last_byte_offset(u64 start, u64 len)
{
	u64 end;

	BUG_ON(len == 0);

	end = start + len;
	if (end <= start)
		return NFS4_MAX_UINT64;
	return end - 1;
}
/*
 * Is l2 fully contained in l1?
 *
 *   l1:  [----------------------------------)
 *   l2:        [----------------)
 *
 * Returns 1 when l2 starts no earlier than l1 and ends no later than
 * l1 (ends computed with end_offset()), 0 otherwise.
 */
static inline int
lo_seg_contained(struct pnfs_layout_range *l1,
		 struct pnfs_layout_range *l2)
{
	u64 outer_start = l1->offset;
	u64 inner_start = l2->offset;

	if (outer_start > inner_start)
		return 0;

	return end_offset(outer_start, l1->length) >=
	       end_offset(inner_start, l2->length);
}
/*
 * Do l1 and l2 intersect?
 *
 *   l1:  [----------------------------------)
 *   l2:                    [----------------)
 *
 * Each range must reach past the start of the other one. An end equal
 * to NFS4_MAX_UINT64 always counts as reaching past it.
 */
static inline int
lo_seg_intersecting(struct pnfs_layout_range *l1,
		    struct pnfs_layout_range *l2)
{
	u64 start1 = l1->offset;
	u64 end1 = end_offset(start1, l1->length);
	u64 start2 = l2->offset;
	u64 end2 = end_offset(start2, l2->length);

	/* l1 stops at or before the start of l2 */
	if (end1 != NFS4_MAX_UINT64 && end1 <= start2)
		return 0;
	/* l2 stops at or before the start of l1 */
	if (end2 != NFS4_MAX_UINT64 && end2 <= start1)
		return 0;
	return 1;
}
static bool static bool
should_free_lseg(u32 lseg_iomode, u32 recall_iomode) should_free_lseg(struct pnfs_layout_range *lseg_range,
struct pnfs_layout_range *recall_range)
{ {
return (recall_iomode == IOMODE_ANY || return (recall_range->iomode == IOMODE_ANY ||
lseg_iomode == recall_iomode); lseg_range->iomode == recall_range->iomode) &&
lo_seg_intersecting(lseg_range, recall_range);
} }
/* Returns 1 if lseg is removed from list, 0 otherwise */ /* Returns 1 if lseg is removed from list, 0 otherwise */
...@@ -296,7 +372,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, ...@@ -296,7 +372,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
int int
mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list, struct list_head *tmp_list,
u32 iomode) struct pnfs_layout_range *recall_range)
{ {
struct pnfs_layout_segment *lseg, *next; struct pnfs_layout_segment *lseg, *next;
int invalid = 0, removed = 0; int invalid = 0, removed = 0;
...@@ -309,7 +385,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, ...@@ -309,7 +385,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return 0; return 0;
} }
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (should_free_lseg(lseg->pls_range.iomode, iomode)) { if (!recall_range ||
should_free_lseg(&lseg->pls_range, recall_range)) {
dprintk("%s: freeing lseg %p iomode %d " dprintk("%s: freeing lseg %p iomode %d "
"offset %llu length %llu\n", __func__, "offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
...@@ -358,7 +435,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) ...@@ -358,7 +435,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
lo = nfsi->layout; lo = nfsi->layout;
if (lo) { if (lo) {
lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
} }
spin_unlock(&nfsi->vfs_inode.i_lock); spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list); pnfs_free_lseg_list(&tmp_list);
...@@ -467,7 +544,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, ...@@ -467,7 +544,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
static struct pnfs_layout_segment * static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo, send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx, struct nfs_open_context *ctx,
u32 iomode, struct pnfs_layout_range *range,
gfp_t gfp_flags) gfp_t gfp_flags)
{ {
struct inode *ino = lo->plh_inode; struct inode *ino = lo->plh_inode;
...@@ -499,11 +576,11 @@ send_layoutget(struct pnfs_layout_hdr *lo, ...@@ -499,11 +576,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
goto out_err_free; goto out_err_free;
} }
lgp->args.minlength = NFS4_MAX_UINT64; lgp->args.minlength = PAGE_CACHE_SIZE;
if (lgp->args.minlength > range->length)
lgp->args.minlength = range->length;
lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
lgp->args.range.iomode = iomode; lgp->args.range = *range;
lgp->args.range.offset = 0;
lgp->args.range.length = NFS4_MAX_UINT64;
lgp->args.type = server->pnfs_curr_ld->id; lgp->args.type = server->pnfs_curr_ld->id;
lgp->args.inode = ino; lgp->args.inode = ino;
lgp->args.ctx = get_nfs_open_context(ctx); lgp->args.ctx = get_nfs_open_context(ctx);
...@@ -518,7 +595,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, ...@@ -518,7 +595,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
nfs4_proc_layoutget(lgp); nfs4_proc_layoutget(lgp);
if (!lseg) { if (!lseg) {
/* remember that LAYOUTGET failed and suspend trying */ /* remember that LAYOUTGET failed and suspend trying */
set_bit(lo_fail_bit(iomode), &lo->plh_flags); set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
} }
/* free xdr pages */ /* free xdr pages */
...@@ -542,6 +619,51 @@ send_layoutget(struct pnfs_layout_hdr *lo, ...@@ -542,6 +619,51 @@ send_layoutget(struct pnfs_layout_hdr *lo,
return NULL; return NULL;
} }
/*
 * Initiates a LAYOUTRETURN(FILE)
 *
 * Under the inode's i_lock all layout segments are marked invalid (a
 * NULL range is passed to mark_matching_lsegs_invalid()). If the inode
 * has no layout, or that call returns 0, nothing is sent and 0 is
 * returned. Otherwise the stateid is snapshotted, a reference on the
 * layout header is taken and nfs4_proc_layoutreturn() is called.
 *
 * Returns 0 when there was nothing to return, -ENOMEM if the request
 * could not be allocated, otherwise the status of
 * nfs4_proc_layoutreturn().
 */
int
_pnfs_return_layout(struct inode *ino)
{
	struct pnfs_layout_hdr *lo = NULL;
	struct nfs_inode *nfsi = NFS_I(ino);
	LIST_HEAD(tmp_list);
	struct nfs4_layoutreturn *lrp;
	nfs4_stateid stateid;
	int status = 0;

	dprintk("--> %s\n", __func__);

	spin_lock(&ino->i_lock);
	lo = nfsi->layout;
	if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) {
		spin_unlock(&ino->i_lock);
		dprintk("%s: no layout segments to return\n", __func__);
		goto out;
	}
	stateid = nfsi->layout->plh_stateid;
	/* Reference matched in nfs4_layoutreturn_release */
	get_layout_hdr(lo);
	spin_unlock(&ino->i_lock);
	/* the collected segments are freed outside of i_lock */
	pnfs_free_lseg_list(&tmp_list);

	WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));

	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
	if (unlikely(lrp == NULL)) {
		status = -ENOMEM;
		/*
		 * No LAYOUTRETURN will be issued, so the release callback
		 * that normally drops the reference taken above never
		 * runs; drop it here or the layout header is leaked.
		 * NOTE(review): put_layout_hdr() is assumed to be the
		 * existing counterpart of get_layout_hdr() in this file.
		 */
		put_layout_hdr(lo);
		goto out;
	}

	lrp->args.stateid = stateid;
	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
	lrp->args.inode = ino;
	lrp->clp = NFS_SERVER(ino)->nfs_client;

	status = nfs4_proc_layoutreturn(lrp);
out:
	dprintk("<-- %s status: %d\n", __func__, status);
	return status;
}
bool pnfs_roc(struct inode *ino) bool pnfs_roc(struct inode *ino)
{ {
struct pnfs_layout_hdr *lo; struct pnfs_layout_hdr *lo;
...@@ -625,10 +747,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier) ...@@ -625,10 +747,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
* are seen first. * are seen first.
*/ */
static s64 static s64
cmp_layout(u32 iomode1, u32 iomode2) cmp_layout(struct pnfs_layout_range *l1,
struct pnfs_layout_range *l2)
{ {
s64 d;
/* high offset > low offset */
d = l1->offset - l2->offset;
if (d)
return d;
/* short length > long length */
d = l2->length - l1->length;
if (d)
return d;
/* read > read/write */ /* read > read/write */
return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
} }
static void static void
...@@ -636,13 +771,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo, ...@@ -636,13 +771,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg) struct pnfs_layout_segment *lseg)
{ {
struct pnfs_layout_segment *lp; struct pnfs_layout_segment *lp;
int found = 0;
dprintk("%s:Begin\n", __func__); dprintk("%s:Begin\n", __func__);
assert_spin_locked(&lo->plh_inode->i_lock); assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lp, &lo->plh_segs, pls_list) { list_for_each_entry(lp, &lo->plh_segs, pls_list) {
if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0) if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
continue; continue;
list_add_tail(&lseg->pls_list, &lp->pls_list); list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p " dprintk("%s: inserted lseg %p "
...@@ -652,16 +786,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo, ...@@ -652,16 +786,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
lseg->pls_range.offset, lseg->pls_range.length, lseg->pls_range.offset, lseg->pls_range.length,
lp, lp->pls_range.iomode, lp->pls_range.offset, lp, lp->pls_range.iomode, lp->pls_range.offset,
lp->pls_range.length); lp->pls_range.length);
found = 1; goto out;
break;
}
if (!found) {
list_add_tail(&lseg->pls_list, &lo->plh_segs);
dprintk("%s: inserted lseg %p "
"iomode %d offset %llu length %llu at tail\n",
__func__, lseg, lseg->pls_range.iomode,
lseg->pls_range.offset, lseg->pls_range.length);
} }
list_add_tail(&lseg->pls_list, &lo->plh_segs);
dprintk("%s: inserted lseg %p "
"iomode %d offset %llu length %llu at tail\n",
__func__, lseg, lseg->pls_range.iomode,
lseg->pls_range.offset, lseg->pls_range.length);
out:
get_layout_hdr(lo); get_layout_hdr(lo);
dprintk("%s:Return\n", __func__); dprintk("%s:Return\n", __func__);
...@@ -672,7 +804,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) ...@@ -672,7 +804,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{ {
struct pnfs_layout_hdr *lo; struct pnfs_layout_hdr *lo;
lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
if (!lo) if (!lo)
return NULL; return NULL;
atomic_set(&lo->plh_refcount, 1); atomic_set(&lo->plh_refcount, 1);
...@@ -705,7 +837,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) ...@@ -705,7 +837,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
if (likely(nfsi->layout == NULL)) /* Won the race? */ if (likely(nfsi->layout == NULL)) /* Won the race? */
nfsi->layout = new; nfsi->layout = new;
else else
kfree(new); pnfs_free_layout_hdr(new);
return nfsi->layout; return nfsi->layout;
} }
...@@ -721,16 +853,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) ...@@ -721,16 +853,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
* READ RW true * READ RW true
*/ */
static int static int
is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) is_matching_lseg(struct pnfs_layout_range *ls_range,
struct pnfs_layout_range *range)
{ {
return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW); struct pnfs_layout_range range1;
if ((range->iomode == IOMODE_RW &&
ls_range->iomode != IOMODE_RW) ||
!lo_seg_intersecting(ls_range, range))
return 0;
/* range1 covers only the first byte in the range */
range1 = *range;
range1.length = 1;
return lo_seg_contained(ls_range, &range1);
} }
/* /*
* lookup range in layout * lookup range in layout
*/ */
static struct pnfs_layout_segment * static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) pnfs_find_lseg(struct pnfs_layout_hdr *lo,
struct pnfs_layout_range *range)
{ {
struct pnfs_layout_segment *lseg, *ret = NULL; struct pnfs_layout_segment *lseg, *ret = NULL;
...@@ -739,11 +883,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) ...@@ -739,11 +883,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
assert_spin_locked(&lo->plh_inode->i_lock); assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lseg, &lo->plh_segs, pls_list) { list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
is_matching_lseg(lseg, iomode)) { is_matching_lseg(&lseg->pls_range, range)) {
ret = get_lseg(lseg); ret = get_lseg(lseg);
break; break;
} }
if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) if (cmp_layout(range, &lseg->pls_range) > 0)
break; break;
} }
...@@ -759,9 +903,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) ...@@ -759,9 +903,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
struct pnfs_layout_segment * struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx, struct nfs_open_context *ctx,
loff_t pos,
u64 count,
enum pnfs_iomode iomode, enum pnfs_iomode iomode,
gfp_t gfp_flags) gfp_t gfp_flags)
{ {
struct pnfs_layout_range arg = {
.iomode = iomode,
.offset = pos,
.length = count,
};
unsigned pg_offset;
struct nfs_inode *nfsi = NFS_I(ino); struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
struct pnfs_layout_hdr *lo; struct pnfs_layout_hdr *lo;
...@@ -789,7 +941,7 @@ pnfs_update_layout(struct inode *ino, ...@@ -789,7 +941,7 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock; goto out_unlock;
/* Check to see if the layout for the given range already exists */ /* Check to see if the layout for the given range already exists */
lseg = pnfs_find_lseg(lo, iomode); lseg = pnfs_find_lseg(lo, &arg);
if (lseg) if (lseg)
goto out_unlock; goto out_unlock;
...@@ -811,7 +963,14 @@ pnfs_update_layout(struct inode *ino, ...@@ -811,7 +963,14 @@ pnfs_update_layout(struct inode *ino,
spin_unlock(&clp->cl_lock); spin_unlock(&clp->cl_lock);
} }
lseg = send_layoutget(lo, ctx, iomode, gfp_flags); pg_offset = arg.offset & ~PAGE_CACHE_MASK;
if (pg_offset) {
arg.offset -= pg_offset;
arg.length += pg_offset;
}
arg.length = PAGE_CACHE_ALIGN(arg.length);
lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
if (!lseg && first) { if (!lseg && first) {
spin_lock(&clp->cl_lock); spin_lock(&clp->cl_lock);
list_del_init(&lo->plh_layouts); list_del_init(&lo->plh_layouts);
...@@ -838,17 +997,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) ...@@ -838,17 +997,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
int status = 0; int status = 0;
/* Verify we got what we asked for.
* Note that because the xdr parsing only accepts a single
* element array, this can fail even if the server is behaving
* correctly.
*/
if (lgp->args.range.iomode > res->range.iomode ||
res->range.offset != 0 ||
res->range.length != NFS4_MAX_UINT64) {
status = -EINVAL;
goto out;
}
/* Inject layout blob into I/O device driver */ /* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
if (!lseg || IS_ERR(lseg)) { if (!lseg || IS_ERR(lseg)) {
...@@ -895,51 +1043,64 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) ...@@ -895,51 +1043,64 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out; goto out;
} }
static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio, bool
struct nfs_page *prev, pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req) struct nfs_page *req)
{ {
enum pnfs_iomode access_type;
gfp_t gfp_flags;
/* We assume that pg_ioflags == 0 iff we're reading a page */
if (pgio->pg_ioflags == 0) {
access_type = IOMODE_READ;
gfp_flags = GFP_KERNEL;
} else {
access_type = IOMODE_RW;
gfp_flags = GFP_NOFS;
}
if (pgio->pg_count == prev->wb_bytes) { if (pgio->pg_count == prev->wb_bytes) {
/* This is first coelesce call for a series of nfs_pages */ /* This is first coelesce call for a series of nfs_pages */
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
prev->wb_context, prev->wb_context,
IOMODE_READ, req_offset(req),
GFP_KERNEL); pgio->pg_count,
access_type,
gfp_flags);
return true;
} }
return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
}
void if (pgio->pg_lseg &&
pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
{ pgio->pg_lseg->pls_range.length))
struct pnfs_layoutdriver_type *ld; return false;
ld = NFS_SERVER(inode)->pnfs_curr_ld; return true;
pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
} }
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio, /*
struct nfs_page *prev, * Called by non rpc-based layout drivers
struct nfs_page *req) */
int
pnfs_ld_write_done(struct nfs_write_data *data)
{ {
if (pgio->pg_count == prev->wb_bytes) { int status;
/* This is first coelesce call for a series of nfs_pages */
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
prev->wb_context,
IOMODE_RW,
GFP_NOFS);
}
return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
}
void if (!data->pnfs_error) {
pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode) pnfs_set_layoutcommit(data);
{ data->mds_ops->rpc_call_done(&data->task, data);
struct pnfs_layoutdriver_type *ld; data->mds_ops->rpc_release(data);
return 0;
}
ld = NFS_SERVER(inode)->pnfs_curr_ld; dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL; data->pnfs_error);
status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
data->mds_ops, NFS_FILE_SYNC);
return status ? : -EAGAIN;
} }
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
enum pnfs_try_status enum pnfs_try_status
pnfs_try_to_write_data(struct nfs_write_data *wdata, pnfs_try_to_write_data(struct nfs_write_data *wdata,
...@@ -965,6 +1126,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, ...@@ -965,6 +1126,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
return trypnfs; return trypnfs;
} }
/*
 * Called by non rpc-based layout drivers when a read has finished.
 *
 * Without an error (data->pnfs_error == 0) the regular completion path
 * is run by hand: __nfs4_read_done_cb(), then the rpc_call_done and
 * rpc_release methods of data->mds_ops, and 0 is returned.
 *
 * With an error the read is re-issued through nfs_initiate_read(). The
 * return value is then that call's status if it is non-zero, and
 * -EAGAIN otherwise.
 */
int
pnfs_ld_read_done(struct nfs_read_data *data)
{
	int status;

	if (data->pnfs_error) {
		dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
			data->pnfs_error);
		status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
					   data->mds_ops);
		if (status)
			return status;
		return -EAGAIN;
	}

	__nfs4_read_done_cb(data);
	data->mds_ops->rpc_call_done(&data->task, data);
	data->mds_ops->rpc_release(data);
	return 0;
}
EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
/* /*
* Call the appropriate parallel I/O subsystem read function. * Call the appropriate parallel I/O subsystem read function.
*/ */
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#ifndef FS_NFS_PNFS_H #ifndef FS_NFS_PNFS_H
#define FS_NFS_PNFS_H #define FS_NFS_PNFS_H
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h> #include <linux/nfs_page.h>
enum { enum {
...@@ -64,17 +65,29 @@ enum { ...@@ -64,17 +65,29 @@ enum {
NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
}; };
/*
 * Policy bits a layout driver can request; each enumerator is a distinct
 * bit so several can be OR-ed together.
 * NOTE(review): presumably stored in pnfs_layoutdriver_type.flags --
 * confirm against the users of that field.
 */
enum layoutdriver_policy_flags {
/* Should the pNFS client commit and return the layout upon a setattr */
PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
};
struct nfs4_deviceid_node;
/* Per-layout driver specific registration structure */ /* Per-layout driver specific registration structure */
struct pnfs_layoutdriver_type { struct pnfs_layoutdriver_type {
struct list_head pnfs_tblid; struct list_head pnfs_tblid;
const u32 id; const u32 id;
const char *name; const char *name;
struct module *owner; struct module *owner;
unsigned flags;
struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
void (*free_layout_hdr) (struct pnfs_layout_hdr *);
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
void (*free_lseg) (struct pnfs_layout_segment *lseg); void (*free_lseg) (struct pnfs_layout_segment *lseg);
/* test for nfs page cache coalescing */ /* test for nfs page cache coalescing */
int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
/* Returns true if layoutdriver wants to divert this request to /* Returns true if layoutdriver wants to divert this request to
* driver's commit routine. * driver's commit routine.
...@@ -89,6 +102,16 @@ struct pnfs_layoutdriver_type { ...@@ -89,6 +102,16 @@ struct pnfs_layoutdriver_type {
*/ */
enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
void (*free_deviceid_node) (struct nfs4_deviceid_node *);
void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args);
void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *args);
}; };
struct pnfs_layout_hdr { struct pnfs_layout_hdr {
...@@ -120,21 +143,22 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); ...@@ -120,21 +143,22 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev); struct pnfs_device *dev);
extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
/* pnfs.c */ /* pnfs.c */
void get_layout_hdr(struct pnfs_layout_hdr *lo); void get_layout_hdr(struct pnfs_layout_hdr *lo);
void put_lseg(struct pnfs_layout_segment *lseg); void put_lseg(struct pnfs_layout_segment *lseg);
struct pnfs_layout_segment * struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type, gfp_t gfp_flags); loff_t pos, u64 count, enum pnfs_iomode access_type,
gfp_t gfp_flags);
void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *); void unset_pnfs_layoutdriver(struct nfs_server *);
enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
const struct rpc_call_ops *, int); const struct rpc_call_ops *, int);
enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
const struct rpc_call_ops *); const struct rpc_call_ops *);
void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
int pnfs_layout_process(struct nfs4_layoutget *lgp); int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_layout(struct nfs_inode *);
...@@ -148,13 +172,37 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, ...@@ -148,13 +172,37 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct nfs4_state *open_state); struct nfs4_state *open_state);
int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list, struct list_head *tmp_list,
u32 iomode); struct pnfs_layout_range *recall_range);
bool pnfs_roc(struct inode *ino); bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino); void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
int _pnfs_return_layout(struct inode *);
int pnfs_ld_write_done(struct nfs_write_data *);
int pnfs_ld_read_done(struct nfs_read_data *);
/* pnfs_dev.c */
/*
 * One entry of the device ID cache implemented in pnfs_dev.c.  Lookups match
 * on the (ld, nfs_client, deviceid) triple.  Layout drivers hand a pointer to
 * this node to nfs4_init_deviceid_node() / nfs4_insert_deviceid_node() and get
 * it back through their ->free_deviceid_node() method.
 */
struct nfs4_deviceid_node {
/* linkage in one nfs4_deviceid_cache[] hash bucket */
struct hlist_node node;
/* driver the entry belongs to; its ->free_deviceid_node() releases it */
const struct pnfs_layoutdriver_type *ld;
const struct nfs_client *nfs_client;
struct nfs4_deviceid deviceid;
/* reference count; nfs4_init_deviceid_node() starts it at 1 */
atomic_t ref;
};
void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
const struct pnfs_layoutdriver_type *,
const struct nfs_client *,
const struct nfs4_deviceid *);
struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
void nfs4_deviceid_purge_client(const struct nfs_client *);
static inline int lo_fail_bit(u32 iomode) static inline int lo_fail_bit(u32 iomode)
{ {
...@@ -223,6 +271,36 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req) ...@@ -223,6 +271,36 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
put_lseg(req->wb_commit_lseg); put_lseg(req->wb_commit_lseg);
} }
/* Should the pNFS client commit and return the layout upon a setattr */
static inline bool
pnfs_ld_layoutret_on_setattr(struct inode *inode)
{
if (!pnfs_enabled_sb(NFS_SERVER(inode)))
return false;
return NFS_SERVER(inode)->pnfs_curr_ld->flags &
PNFS_LAYOUTRET_ON_SETATTR;
}
static inline int pnfs_return_layout(struct inode *ino)
{
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_server *nfss = NFS_SERVER(ino);
if (pnfs_enabled_sb(nfss) && nfsi->layout)
return _pnfs_return_layout(ino);
return 0;
}
/*
 * Install the current layout driver's page coalescing test into @pgio.
 * @pgio->pg_test is left untouched when the server of @inode has no layout
 * driver set.
 */
static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
				    struct inode *inode)
{
	struct pnfs_layoutdriver_type *driver = NFS_SERVER(inode)->pnfs_curr_ld;

	if (driver == NULL)
		return;
	pgio->pg_test = driver->pg_test;
}
#else /* CONFIG_NFS_V4_1 */ #else /* CONFIG_NFS_V4_1 */
static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
...@@ -245,7 +323,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg) ...@@ -245,7 +323,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
static inline struct pnfs_layout_segment * static inline struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type, gfp_t gfp_flags) loff_t pos, u64 count, enum pnfs_iomode access_type,
gfp_t gfp_flags)
{ {
return NULL; return NULL;
} }
...@@ -264,6 +343,17 @@ pnfs_try_to_write_data(struct nfs_write_data *data, ...@@ -264,6 +343,17 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
return PNFS_NOT_ATTEMPTED; return PNFS_NOT_ATTEMPTED;
} }
/* !CONFIG_NFS_V4_1 stub: there is never a layout to return, report success */
static inline int pnfs_return_layout(struct inode *ino)
{
return 0;
}
/* !CONFIG_NFS_V4_1 stub: no layout driver, so never return-on-setattr */
static inline bool
pnfs_ld_layoutret_on_setattr(struct inode *inode)
{
return false;
}
static inline bool static inline bool
pnfs_roc(struct inode *ino) pnfs_roc(struct inode *ino)
{ {
...@@ -294,16 +384,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) ...@@ -294,16 +384,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
{ {
} }
static inline void static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino) struct inode *inode)
{
pgio->pg_test = NULL;
}
static inline void
pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
{ {
pgio->pg_test = NULL;
} }
static inline void static inline void
...@@ -331,6 +414,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) ...@@ -331,6 +414,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{ {
return 0; return 0;
} }
/*
 * !CONFIG_NFS_V4_1 stub: there is no device ID cache to purge.
 *
 * Takes a const pointer so the stub has the same signature as the real
 * nfs4_deviceid_purge_client() and callers build identically either way.
 */
static inline void nfs4_deviceid_purge_client(const struct nfs_client *ncl)
{
}
#endif /* CONFIG_NFS_V4_1 */ #endif /* CONFIG_NFS_V4_1 */
#endif /* FS_NFS_PNFS_H */ #endif /* FS_NFS_PNFS_H */
/*
* Device operations for the pnfs client.
*
* Copyright (c) 2002
* The Regents of the University of Michigan
* All Rights Reserved
*
* Dean Hildebrand <dhildebz@umich.edu>
* Garth Goodson <Garth.Goodson@netapp.com>
*
* Permission is granted to use, copy, create derivative works, and
* redistribute this software and such derivative works for any purpose,
* so long as the name of the University of Michigan is not used in
* any advertising or publicity pertaining to the use or distribution
* of this software without specific, written prior authorization. If
* the above copyright notice or any other identification of the
* University of Michigan is included in any copy of any portion of
* this software, then the disclaimer below must also be included.
*
* This software is provided as is, without representation or warranty
* of any kind either express or implied, including without limitation
* the implied warranties of merchantability, fitness for a particular
* purpose, or noninfringement. The Regents of the University of
* Michigan shall not be liable for any damages, including special,
* indirect, incidental, or consequential damages, with respect to any
* claim arising out of or in connection with the use of the software,
* even if it has been or is hereafter advised of the possibility of
* such damages.
*/
#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
/*
* Device ID RCU cache. A device ID is unique per server and layout type.
*/
#define NFS4_DEVICE_ID_HASH_BITS 5
#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
static DEFINE_SPINLOCK(nfs4_deviceid_lock);
/*
 * Emit the first four 32-bit words of a device ID to the debug log.
 *
 * Every word is zero-padded to eight hex digits so the concatenated string
 * is unambiguous (unpadded, the words 0x1,0x23 and 0x12,0x3 print the same).
 * The ID is only read, so it is accessed through a const pointer.
 */
void
nfs4_print_deviceid(const struct nfs4_deviceid *id)
{
	const u32 *p = (const u32 *)id;

	dprintk("%s: device id= [%08x%08x%08x%08x]\n", __func__,
		p[0], p[1], p[2], p[3]);
}
EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
/*
 * Hash the NFS4_DEVICEID4_SIZE bytes of @id into a bucket index for
 * nfs4_deviceid_cache[]: a multiply-by-37 rolling sum over the bytes,
 * reduced with NFS4_DEVICE_ID_HASH_MASK.
 */
static inline u32
nfs4_deviceid_hash(const struct nfs4_deviceid *id)
{
	unsigned char *bytes = (unsigned char *)id->data;
	unsigned int i;
	u32 hash = 0;

	for (i = 0; i < NFS4_DEVICEID4_SIZE; i++)
		hash = hash * 37 + bytes[i];

	return hash & NFS4_DEVICE_ID_HASH_MASK;
}
static struct nfs4_deviceid_node *
_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
const struct nfs_client *clp, const struct nfs4_deviceid *id,
long hash)
{
struct nfs4_deviceid_node *d;
struct hlist_node *n;
hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
if (d->ld == ld && d->nfs_client == clp &&
!memcmp(&d->deviceid, id, sizeof(*id))) {
if (atomic_read(&d->ref))
return d;
else
continue;
}
return NULL;
}
/*
 * Look a deviceid up in the cache and take a reference on it if found.
 *
 * @ld   layout driver the deviceid belongs to
 * @clp  nfs_client associated with deviceid
 * @id   deviceid to look up
 * @hash bucket index of @id in the cache
 *
 * @ret  the referenced node, or NULL if there is no match or its reference
 *       count had already reached zero.
 */
struct nfs4_deviceid_node *
_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
		   const struct nfs_client *clp, const struct nfs4_deviceid *id,
		   long hash)
{
	struct nfs4_deviceid_node *found;

	rcu_read_lock();
	found = _lookup_deviceid(ld, clp, id, hash);
	if (found != NULL) {
		if (!atomic_inc_not_zero(&found->ref))
			found = NULL;
	}
	rcu_read_unlock();

	return found;
}
/*
 * Exported lookup: hash @id, then find the matching cache entry for
 * (@ld, @clp, @id) and return it with a reference held, or NULL.
 */
struct nfs4_deviceid_node *
nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
		       const struct nfs_client *clp,
		       const struct nfs4_deviceid *id)
{
	long bucket = nfs4_deviceid_hash(id);

	return _find_get_deviceid(ld, clp, id, bucket);
}
EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
/*
 * Unhash and put deviceid
 *
 * @ld  layout driver the deviceid belongs to
 * @clp nfs_client associated with deviceid
 * @id the deviceid to unhash
 *
 * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
 *      A returned node has not been freed; nfs4_delete_deviceid() passes it
 *      to the driver's ->free_deviceid_node().
 */
struct nfs4_deviceid_node *
nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
const struct nfs_client *clp, const struct nfs4_deviceid *id)
{
struct nfs4_deviceid_node *d;
/* the lookup and the unhash happen under the cache lock */
spin_lock(&nfs4_deviceid_lock);
rcu_read_lock();
d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
rcu_read_unlock();
if (!d) {
spin_unlock(&nfs4_deviceid_lock);
return NULL;
}
hlist_del_init_rcu(&d->node);
spin_unlock(&nfs4_deviceid_lock);
/* called only after the spinlock has been dropped */
synchronize_rcu();
/* balance the initial ref set in nfs4_init_deviceid_node */
if (atomic_dec_and_test(&d->ref))
return d;
return NULL;
}
EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
/*
 * Delete a deviceid from cache
 *
 * @ld  layout driver the deviceid belongs to
 * @clp struct nfs_client qualifying the deviceid
 * @id  deviceid to delete
 *
 * The entry is unhashed and its initial reference dropped; the driver's
 * ->free_deviceid_node() is called only if that was the last reference.
 */
void
nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
		     const struct nfs_client *clp,
		     const struct nfs4_deviceid *id)
{
	struct nfs4_deviceid_node *node = nfs4_unhash_put_deviceid(ld, clp, id);

	if (node != NULL)
		node->ld->free_deviceid_node(node);
}
EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
/*
 * Prepare a deviceid node for insertion into the cache: record the lookup
 * key (@ld, @nfs_client, a copy of @id), mark the node as not hashed and
 * start its reference count at one.
 */
void
nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
			const struct pnfs_layoutdriver_type *ld,
			const struct nfs_client *nfs_client,
			const struct nfs4_deviceid *id)
{
	/* lookup key */
	d->ld = ld;
	d->nfs_client = nfs_client;
	d->deviceid = *id;

	/* cache bookkeeping */
	INIT_HLIST_NODE(&d->node);
	atomic_set(&d->ref, 1);
}
EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
/*
 * Insert a deviceid node into the cache unless an equal one is already there
 *
 * @new new deviceid node
 *      Note that the caller must set up the following members:
 *        new->ld
 *        new->nfs_client
 *        new->deviceid
 *
 * @ret @new if it was inserted; otherwise the already cached entry, on
 *      which a reference has been taken (@new is then left unhashed).
 */
struct nfs4_deviceid_node *
nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
{
	struct nfs4_deviceid_node *result;
	long hash;

	spin_lock(&nfs4_deviceid_lock);
	hash = nfs4_deviceid_hash(&new->deviceid);
	result = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid,
				    hash);
	if (result == NULL) {
		hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
		result = new;
	}
	spin_unlock(&nfs4_deviceid_lock);

	return result;
}
EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
/*
 * Dereference a deviceid node and delete it when its reference count drops
 * to zero.
 *
 * @d deviceid node to put
 *
 * @ret true iff the node was deleted
 */
bool
nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
{
/* on the final put this returns with nfs4_deviceid_lock held */
if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock))
return false;
hlist_del_init_rcu(&d->node);
spin_unlock(&nfs4_deviceid_lock);
/* called after the lock is released and before the node is freed */
synchronize_rcu();
/* the owning layout driver releases the node */
d->ld->free_deviceid_node(d);
return true;
}
EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
static void
_deviceid_purge_client(const struct nfs_client *clp, long hash)
{
struct nfs4_deviceid_node *d;
struct hlist_node *n, *next;
HLIST_HEAD(tmp);
rcu_read_lock();
hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
if (d->nfs_client == clp && atomic_read(&d->ref)) {
hlist_del_init_rcu(&d->node);
hlist_add_head(&d->node, &tmp);
}
rcu_read_unlock();
if (hlist_empty(&tmp))
return;
synchronize_rcu();
hlist_for_each_entry_safe(d, n, next, &tmp, node)
if (atomic_dec_and_test(&d->ref))
d->ld->free_deviceid_node(d);
}
void
nfs4_deviceid_purge_client(const struct nfs_client *clp)
{
long h;
spin_lock(&nfs4_deviceid_lock);
for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
_deviceid_purge_client(clp, h);
spin_unlock(&nfs4_deviceid_lock);
}
...@@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) ...@@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
atomic_set(&req->wb_complete, requests); atomic_set(&req->wb_complete, requests);
BUG_ON(desc->pg_lseg != NULL); BUG_ON(desc->pg_lseg != NULL);
lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
req_offset(req), desc->pg_count,
IOMODE_READ, GFP_KERNEL);
ClearPageError(page); ClearPageError(page);
offset = 0; offset = 0;
nbytes = desc->pg_count; nbytes = desc->pg_count;
...@@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) ...@@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
} }
req = nfs_list_entry(data->pages.next); req = nfs_list_entry(data->pages.next);
if ((!lseg) && list_is_singular(&data->pages)) if ((!lseg) && list_is_singular(&data->pages))
lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
req_offset(req), desc->pg_count,
IOMODE_READ, GFP_KERNEL);
ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
0, lseg); 0, lseg);
...@@ -660,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, ...@@ -660,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
if (ret == 0) if (ret == 0)
goto read_complete; /* all pages were read */ goto read_complete; /* all pages were read */
pnfs_pageio_init_read(&pgio, inode);
if (rsize < PAGE_CACHE_SIZE) if (rsize < PAGE_CACHE_SIZE)
nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
else else
......
...@@ -63,6 +63,7 @@ ...@@ -63,6 +63,7 @@
#include "iostat.h" #include "iostat.h"
#include "internal.h" #include "internal.h"
#include "fscache.h" #include "fscache.h"
#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_VFS #define NFSDBG_FACILITY NFSDBG_VFS
...@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) ...@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
return 0; return 0;
} }
#ifdef CONFIG_NFS_V4_1
/* Append ",sessions" to @m when nfs4_has_session() is true for the client */
void show_sessions(struct seq_file *m, struct nfs_server *server)
{
	if (!nfs4_has_session(server->nfs_client))
		return;
	seq_printf(m, ",sessions");
}
#else
/* Without CONFIG_NFS_V4_1 there is nothing to report */
void show_sessions(struct seq_file *m, struct nfs_server *server)
{
}
#endif
#ifdef CONFIG_NFS_V4_1
/*
 * Append ",pnfs=<name>" to @m, where <name> is the name of the server's
 * current layout driver, or "not configured" when none is set.
 */
void show_pnfs(struct seq_file *m, struct nfs_server *server)
{
	seq_printf(m, ",pnfs=");
	if (!server->pnfs_curr_ld) {
		seq_printf(m, "not configured");
		return;
	}
	seq_printf(m, "%s", server->pnfs_curr_ld->name);
}
#else  /* CONFIG_NFS_V4_1 */
/* Without CONFIG_NFS_V4_1 there is nothing to report */
void show_pnfs(struct seq_file *m, struct nfs_server *server)
{
}
#endif /* CONFIG_NFS_V4_1 */
static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
{ {
...@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) ...@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
show_sessions(m, nfss);
show_pnfs(m, nfss);
} }
#endif #endif
......
...@@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) ...@@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
atomic_set(&req->wb_complete, requests); atomic_set(&req->wb_complete, requests);
BUG_ON(desc->pg_lseg); BUG_ON(desc->pg_lseg);
lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
req_offset(req), desc->pg_count,
IOMODE_RW, GFP_NOFS);
ClearPageError(page); ClearPageError(page);
offset = 0; offset = 0;
nbytes = desc->pg_count; nbytes = desc->pg_count;
...@@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) ...@@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
} }
req = nfs_list_entry(data->pages.next); req = nfs_list_entry(data->pages.next);
if ((!lseg) && list_is_singular(&data->pages)) if ((!lseg) && list_is_singular(&data->pages))
lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
req_offset(req), desc->pg_count,
IOMODE_RW, GFP_NOFS);
if ((desc->pg_ioflags & FLUSH_COND_STABLE) && if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
...@@ -1032,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, ...@@ -1032,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
{ {
size_t wsize = NFS_SERVER(inode)->wsize; size_t wsize = NFS_SERVER(inode)->wsize;
pnfs_pageio_init_write(pgio, inode);
if (wsize < PAGE_CACHE_SIZE) if (wsize < PAGE_CACHE_SIZE)
nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
else else
......
...@@ -562,6 +562,7 @@ enum { ...@@ -562,6 +562,7 @@ enum {
NFSPROC4_CLNT_LAYOUTGET, NFSPROC4_CLNT_LAYOUTGET,
NFSPROC4_CLNT_GETDEVICEINFO, NFSPROC4_CLNT_GETDEVICEINFO,
NFSPROC4_CLNT_LAYOUTCOMMIT, NFSPROC4_CLNT_LAYOUTCOMMIT,
NFSPROC4_CLNT_LAYOUTRETURN,
}; };
/* nfs41 types */ /* nfs41 types */
......
...@@ -68,7 +68,7 @@ struct nfs_pageio_descriptor { ...@@ -68,7 +68,7 @@ struct nfs_pageio_descriptor {
int pg_ioflags; int pg_ioflags;
int pg_error; int pg_error;
struct pnfs_layout_segment *pg_lseg; struct pnfs_layout_segment *pg_lseg;
int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
}; };
#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags))
......
...@@ -269,6 +269,27 @@ struct nfs4_layoutcommit_data { ...@@ -269,6 +269,27 @@ struct nfs4_layoutcommit_data {
struct nfs4_layoutcommit_res res; struct nfs4_layoutcommit_res res;
}; };
/*
 * Arguments of a layoutreturn request.  A const pointer to this structure is
 * what a layout driver's ->encode_layoutreturn() method receives.
 */
struct nfs4_layoutreturn_args {
__u32 layout_type;
struct inode *inode;
nfs4_stateid stateid;
struct nfs4_sequence_args seq_args;
};
/* Result of a layoutreturn request */
struct nfs4_layoutreturn_res {
struct nfs4_sequence_res seq_res;
/* NOTE(review): presumably non-zero when @stateid below is valid - confirm */
u32 lrs_present;
nfs4_stateid stateid;
};
/*
 * Everything belonging to one layoutreturn call: its arguments, its result
 * and per-call state.  This is the object passed to nfs4_proc_layoutreturn().
 */
struct nfs4_layoutreturn {
struct nfs4_layoutreturn_args args;
struct nfs4_layoutreturn_res res;
struct rpc_cred *cred;
struct nfs_client *clp;
int rpc_status;
};
/* /*
* Arguments to the open call. * Arguments to the open call.
*/ */
...@@ -1087,6 +1108,7 @@ struct nfs_read_data { ...@@ -1087,6 +1108,7 @@ struct nfs_read_data {
const struct rpc_call_ops *mds_ops; const struct rpc_call_ops *mds_ops;
int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data); int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
__u64 mds_offset; __u64 mds_offset;
int pnfs_error;
struct page *page_array[NFS_PAGEVEC_SIZE]; struct page *page_array[NFS_PAGEVEC_SIZE];
}; };
...@@ -1112,6 +1134,7 @@ struct nfs_write_data { ...@@ -1112,6 +1134,7 @@ struct nfs_write_data {
unsigned long timestamp; /* For lease renewal */ unsigned long timestamp; /* For lease renewal */
#endif #endif
__u64 mds_offset; /* Filelayout dense stripe */ __u64 mds_offset; /* Filelayout dense stripe */
int pnfs_error;
struct page *page_array[NFS_PAGEVEC_SIZE]; struct page *page_array[NFS_PAGEVEC_SIZE];
}; };
......
/*
* pNFS-osd on-the-wire data structures
*
* Copyright (C) 2007 Panasas Inc. [year of first publication]
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
* Boaz Harrosh <bharrosh@panasas.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* See the file COPYING included with this distribution for more details.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the Panasas company nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __PNFS_OSD_XDR_H__
#define __PNFS_OSD_XDR_H__
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <scsi/osd_protocol.h>
#define PNFS_OSD_OSDNAME_MAXSIZE 256
/*
* draft-ietf-nfsv4-minorversion-22
* draft-ietf-nfsv4-pnfs-obj-12
*/
/* Layout Structure */
/* Values carried in the odm_raid_algorithm field of the data map */
enum pnfs_osd_raid_algorithm4 {
	PNFS_OSD_RAID_0		= 1,
	PNFS_OSD_RAID_4		= 2,
	PNFS_OSD_RAID_5		= 3,
	/* Reed-Solomon P+Q */
	PNFS_OSD_RAID_PQ	= 4
};
/* struct pnfs_osd_data_map4 {
* uint32_t odm_num_comps;
* length4 odm_stripe_unit;
* uint32_t odm_group_width;
* uint32_t odm_group_depth;
* uint32_t odm_mirror_cnt;
* pnfs_osd_raid_algorithm4 odm_raid_algorithm;
* };
*/
/* In-memory form of pnfs_osd_data_map4; fields keep the XDR names and order */
struct pnfs_osd_data_map {
u32 odm_num_comps;
/* length4 in the XDR definition */
u64 odm_stripe_unit;
u32 odm_group_width;
u32 odm_group_depth;
u32 odm_mirror_cnt;
/* pnfs_osd_raid_algorithm4 in the XDR definition */
u32 odm_raid_algorithm;
};
/* struct pnfs_osd_objid4 {
* deviceid4 oid_device_id;
* uint64_t oid_partition_id;
* uint64_t oid_object_id;
* };
*/
/* In-memory form of pnfs_osd_objid4: device ID, partition ID and object ID */
struct pnfs_osd_objid {
struct nfs4_deviceid oid_device_id;
u64 oid_partition_id;
u64 oid_object_id;
};
/*
 * For printout of a device ID as two 64-bit values, e.g. with a
 * "dev(%llx:%llx)" format and _DEVID_LO(pointer), _DEVID_HI(pointer) as the
 * arguments.  BE style: _DEVID_LO() converts the first 8 bytes of
 * pointer->data from big endian, _DEVID_HI() the following 8 bytes.
 *
 * Each expansion is fully parenthesized so it behaves as a single operand
 * wherever it is used, and ->data is read through a const-qualified pointer
 * so the macros do not cast away constness.
 */
#define _DEVID_LO(oid_device_id) \
	((unsigned long long)be64_to_cpup( \
		(const __be64 *)(oid_device_id)->data))

#define _DEVID_HI(oid_device_id) \
	((unsigned long long)be64_to_cpup( \
		((const __be64 *)(oid_device_id)->data) + 1))
/*
 * Size of an encoded pnfs_osd_objid4, in 32-bit XDR words: the device ID
 * (NFS4_DEVICEID4_SIZE bytes) followed by two 64-bit IDs of two words each.
 */
static inline int
pnfs_osd_objid_xdr_sz(void)
{
	const int deviceid_words = NFS4_DEVICEID4_SIZE / 4;
	const int partition_id_words = 2;
	const int object_id_words = 2;

	return deviceid_words + partition_id_words + object_id_words;
}
/* Values of oc_osd_version (pnfs_osd_version4 in the XDR definition) */
enum pnfs_osd_version {
	PNFS_OSD_MISSING	= 0,
	PNFS_OSD_VERSION_1	= 1,
	PNFS_OSD_VERSION_2	= 2
};
/*
 * Holder for the opaque<> members of pnfs_osd_object_cred4.
 * NOTE(review): presumably @cred points at @cred_len bytes - confirm against
 * the decoder.
 */
struct pnfs_osd_opaque_cred {
u32 cred_len;
void *cred;
};
/* Values of oc_cap_key_sec (pnfs_osd_cap_key_sec4 in the XDR definition) */
enum pnfs_osd_cap_key_sec {
	PNFS_OSD_CAP_KEY_SEC_NONE	= 0,
	PNFS_OSD_CAP_KEY_SEC_SSV	= 1,
};
/* struct pnfs_osd_object_cred4 {
* pnfs_osd_objid4 oc_object_id;
* pnfs_osd_version4 oc_osd_version;
* pnfs_osd_cap_key_sec4 oc_cap_key_sec;
* opaque oc_capability_key<>;
* opaque oc_capability<>;
* };
*/
/* In-memory form of pnfs_osd_object_cred4 */
struct pnfs_osd_object_cred {
struct pnfs_osd_objid oc_object_id;
/* pnfs_osd_version4 in the XDR definition */
u32 oc_osd_version;
/* pnfs_osd_cap_key_sec4 in the XDR definition */
u32 oc_cap_key_sec;
/* oc_capability_key<> */
struct pnfs_osd_opaque_cred oc_cap_key;
/* oc_capability<> */
struct pnfs_osd_opaque_cred oc_cap;
};
/* struct pnfs_osd_layout4 {
* pnfs_osd_data_map4 olo_map;
* uint32_t olo_comps_index;
* pnfs_osd_object_cred4 olo_components<>;
* };
*/
/* In-memory form of pnfs_osd_layout4 */
struct pnfs_osd_layout {
struct pnfs_osd_data_map olo_map;
u32 olo_comps_index;
/* NOTE(review): presumably the element count of olo_components<> - confirm */
u32 olo_num_comps;
/* the olo_components<> array; see the decode notes under "Layout helpers" */
struct pnfs_osd_object_cred *olo_comps;
};
/* Device Address */
/* Discriminant (oti_type) of the pnfs_osd_targetid4 union */
enum pnfs_osd_targetid_type {
	OBJ_TARGET_ANON			= 1,
	OBJ_TARGET_SCSI_NAME		= 2,
	OBJ_TARGET_SCSI_DEVICE_ID	= 3,
};
/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) {
* case OBJ_TARGET_SCSI_NAME:
* string oti_scsi_name<>;
*
* case OBJ_TARGET_SCSI_DEVICE_ID:
* opaque oti_scsi_device_id<>;
*
* default:
* void;
* };
*
* union pnfs_osd_targetaddr4 switch (bool ota_available) {
* case TRUE:
* netaddr4 ota_netaddr;
* case FALSE:
* void;
* };
*
* struct pnfs_osd_deviceaddr4 {
* pnfs_osd_targetid4 oda_targetid;
* pnfs_osd_targetaddr4 oda_targetaddr;
* uint64_t oda_lun;
* opaque oda_systemid<>;
* pnfs_osd_object_cred4 oda_root_obj_cred;
* opaque oda_osdname<>;
* };
*/
/*
 * In-memory form of the pnfs_osd_targetid4 union.
 * NOTE(review): the XDR union has both oti_scsi_name and oti_scsi_device_id
 * arms; presumably either one is stored in oti_scsi_device_id - confirm.
 */
struct pnfs_osd_targetid {
/* discriminant of the union (pnfs_osd_targetid_type4) */
u32 oti_type;
struct nfs4_string oti_scsi_device_id;
};
/*
 * Upper bound for an encoded target ID.
 * NOTE(review): presumably in 32-bit XDR words (one word plus the maximum
 * OSD name length in bytes divided by 4) - confirm.
 */
enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 };
/* struct netaddr4 {
* // see struct rpcb in RFC1833
* string r_netid<>; // network id
* string r_addr<>; // universal address
* };
*/
/* In-memory form of netaddr4 */
struct pnfs_osd_net_addr {
/* network id */
struct nfs4_string r_netid;
/* universal address */
struct nfs4_string r_addr;
};
/* In-memory form of the pnfs_osd_targetaddr4 union */
struct pnfs_osd_targetaddr {
/* discriminant of the union; ota_netaddr is present on the wire only when TRUE */
u32 ota_available;
struct pnfs_osd_net_addr ota_netaddr;
};
/*
 * Size limits for an encoded target address.
 * NOTE(review): presumably counted in 32-bit XDR words (byte sizes divided
 * by 4) - confirm.
 */
enum {
	NETWORK_ID_MAX		= 16 / 4,
	UNIVERSAL_ADDRESS_MAX	= 64 / 4,
	PNFS_OSD_TARGETADDR_MAX	= 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX,
};
/* In-memory form of pnfs_osd_deviceaddr4 */
struct pnfs_osd_deviceaddr {
struct pnfs_osd_targetid oda_targetid;
struct pnfs_osd_targetaddr oda_targetaddr;
/* uint64_t in the XDR definition, kept here as 8 raw bytes */
u8 oda_lun[8];
struct nfs4_string oda_systemid;
struct pnfs_osd_object_cred oda_root_obj_cred;
struct nfs4_string oda_osdname;
};
/*
 * Upper bound for an encoded pnfs_osd_deviceaddr, built from the bounds of
 * its parts.
 * NOTE(review): the sum has no term for oda_root_obj_cred, and
 * OSD_SYSTEMID_LEN is added as is while the other terms are divided by 4 -
 * confirm the intended unit.
 */
enum {
ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4,
PNFS_OSD_DEVICEADDR_MAX =
PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX +
2 /*oda_lun*/ +
1 + OSD_SYSTEMID_LEN +
1 + ODA_OSDNAME_MAX,
};
/* LAYOUTCOMMIT: layoutupdate */
/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) {
* case TRUE:
* int64_t dsu_delta;
* case FALSE:
* void;
* };
*
* struct pnfs_osd_layoutupdate4 {
* pnfs_osd_deltaspaceused4 olu_delta_space_used;
* bool olu_ioerr_flag;
* };
*/
/*
 * In-memory form of pnfs_osd_layoutupdate4, with the
 * pnfs_osd_deltaspaceused4 union flattened into its first two members.
 */
struct pnfs_osd_layoutupdate {
/* discriminant of the union; dsu_delta is present on the wire only when TRUE */
u32 dsu_valid;
s64 dsu_delta;
u32 olu_ioerr_flag;
};
/* LAYOUTRETURN: I/O Error Report */
/* Error codes reported in oer_errno (pnfs_osd_errno4 in the XDR definition) */
enum pnfs_osd_errno {
	PNFS_OSD_ERR_EIO		= 1,
	PNFS_OSD_ERR_NOT_FOUND		= 2,
	PNFS_OSD_ERR_NO_SPACE		= 3,
	PNFS_OSD_ERR_BAD_CRED		= 4,
	PNFS_OSD_ERR_NO_ACCESS		= 5,
	PNFS_OSD_ERR_UNREACHABLE	= 6,
	PNFS_OSD_ERR_RESOURCE		= 7
};
/* struct pnfs_osd_ioerr4 {
* pnfs_osd_objid4 oer_component;
* length4 oer_comp_offset;
* length4 oer_comp_length;
* bool oer_iswrite;
* pnfs_osd_errno4 oer_errno;
* };
*/
/* In-memory form of pnfs_osd_ioerr4 */
struct pnfs_osd_ioerr {
struct pnfs_osd_objid oer_component;
/* length4 in the XDR definition */
u64 oer_comp_offset;
/* length4 in the XDR definition */
u64 oer_comp_length;
/* bool in the XDR definition */
u32 oer_iswrite;
/* pnfs_osd_errno4 in the XDR definition */
u32 oer_errno;
};
/* OSD XDR API */
/* Layout helpers */
/* Layout decoding is done in two parts:
* 1. First Call pnfs_osd_xdr_decode_layout_map to read in only the header part
* of the layout. @iter members need not be initialized.
* Returned:
* @layout members are set. (@layout->olo_comps set to NULL).
*
* Zero on success, or negative error if passed xdr is broken.
*
* 2. 2nd Call pnfs_osd_xdr_decode_layout_comp() in a loop until it returns
* false, to decode the next component.
* Returned:
* true if there is more to decode or false if we are done or error.
*
* Example:
* struct pnfs_osd_xdr_decode_layout_iter iter;
* struct pnfs_osd_layout layout;
* struct pnfs_osd_object_cred comp;
* int status;
*
* status = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
* if (unlikely(status))
* goto err;
* while(pnfs_osd_xdr_decode_layout_comp(&comp, &iter, xdr, &status)) {
* // All of @comp strings point to inside the xdr_buffer
 *		// or scratch buffer. Copy them out to user memory, e.g.
* copy_single_comp(dest_comp++, &comp);
* }
* if (unlikely(status))
* goto err;
*/
/*
 * Iteration state shared by pnfs_osd_xdr_decode_layout_map() and
 * pnfs_osd_xdr_decode_layout_comp(); the caller need not initialize it.
 * NOTE(review): presumably total_comps is the number of components in the
 * layout and decoded_comps how many have been returned so far - confirm.
 */
struct pnfs_osd_xdr_decode_layout_iter {
unsigned total_comps;
unsigned decoded_comps;
};
extern int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr);
extern bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
int *err);
/* Device Info helpers */
/* Note: All strings inside @deviceaddr point to space inside @p.
* @p should stay valid while @deviceaddr is in use.
*/
extern void pnfs_osd_xdr_decode_deviceaddr(
struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p);
/* layoutupdate (layout_commit) xdr helpers */
extern int
pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
struct pnfs_osd_layoutupdate *lou);
/* osd_ioerror encoding/decoding (layout_return) */
/* Client */
extern __be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr);
extern void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr);
#endif /* __PNFS_OSD_XDR_H__ */
...@@ -216,6 +216,8 @@ extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); ...@@ -216,6 +216,8 @@ extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
unsigned int base, unsigned int len); unsigned int base, unsigned int len);
extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
/* Set up @buf to describe @len bytes held in @pages, then start decoding on @xdr. */
extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
struct page **pages, unsigned int len);
extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen); extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen);
extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
......
...@@ -638,6 +638,25 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) ...@@ -638,6 +638,25 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
} }
EXPORT_SYMBOL_GPL(xdr_init_decode); EXPORT_SYMBOL_GPL(xdr_init_decode);
/**
 * xdr_init_decode_pages - Initialize an xdr_stream for decoding data from pages.
 * @xdr: pointer to xdr_stream struct
 * @buf: pointer to XDR buffer from which to decode data
 * @pages: list of pages to decode into
 * @len: length in bytes of buffer in pages
 *
 * @buf is cleared and then filled in so that it describes only the @len
 * bytes held in @pages, after which decoding is started on it through
 * xdr_init_decode().
 */
void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
			   struct page **pages, unsigned int len)
{
	/* Start from an all-zero buffer; only the page fields are populated. */
	memset(buf, 0, sizeof(*buf));
	buf->pages = pages;
	buf->page_len = len;
	buf->buflen = len;
	buf->len = len;
	/* NULL is passed for the @p argument of xdr_init_decode(). */
	xdr_init_decode(xdr, buf, NULL);
}
EXPORT_SYMBOL_GPL(xdr_init_decode_pages);
static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
{ {
__be32 *p = xdr->p; __be32 *p = xdr->p;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment