From c93407d03c3ccf60b33a309e5fcd41cd98969901 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 22 May 2011 19:49:06 +0300 Subject: pnfs-obj: objlayoutdriver module skeleton * Define the PNFS_OBJLAYOUT Kconfig option in the nfs master Kconfig file. * Add the objlayout driver to the Kernel's Kbuild system. * Add the fs/nfs/objlayout/Kbuild file for building the objlayoutdriver.ko driver * Define fs/nfs/objlayout/objio_osd.c, register the driver on module initialization and unregister on exit. [pnfs-obj: remove of CONFIG_PNFS fallout] Signed-off-by: Boaz Harrosh [added "unsure" clause] [depend on NFS_V4_1] Signed-off-by: Benny Halevy --- fs/nfs/objlayout/Kbuild | 5 +++ fs/nfs/objlayout/objio_osd.c | 76 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 fs/nfs/objlayout/Kbuild create mode 100644 fs/nfs/objlayout/objio_osd.c (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild new file mode 100644 index 00000000000..2e5b9a44773 --- /dev/null +++ b/fs/nfs/objlayout/Kbuild @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module +# +objlayoutdriver-y := objio_osd.o +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c new file mode 100644 index 00000000000..379595f2d1c --- /dev/null +++ b/fs/nfs/objlayout/objio_osd.c @@ -0,0 +1,76 @@ +/* + * pNFS Objects layout implementation over open-osd initiator library + * + * Copyright (C) 2009 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "../pnfs.h" + +static struct pnfs_layoutdriver_type objlayout_type = { + .id = LAYOUT_OSD2_OBJECTS, + .name = "LAYOUT_OSD2_OBJECTS", +}; + +MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); +MODULE_AUTHOR("Benny Halevy "); +MODULE_LICENSE("GPL"); + +static int __init +objlayout_init(void) +{ + int ret = pnfs_register_layoutdriver(&objlayout_type); + + if (ret) + printk(KERN_INFO + "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", + __func__, ret); + else + printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", + __func__); + return ret; +} + +static void __exit +objlayout_exit(void) +{ + pnfs_unregister_layoutdriver(&objlayout_type); + printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", + __func__); +} + +module_init(objlayout_init); +module_exit(objlayout_exit); -- cgit v1.2.3 From f1bc893a89d012649e4e7f43575b2c290e08ee42 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 22 May 2011 19:49:57 +0300 Subject: pnfs-obj: pnfs_osd XDR client implementation * Add the fs/nfs/objlayout/pnfs_osd_xdr_cli.c file, which will include the XDR encode/decode implementations for the pNFS client objlayout driver. [Wrong type in comments] Signed-off-by: Boaz Harrosh Signed-off-by: Benny Halevy --- fs/nfs/objlayout/Kbuild | 2 +- fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 412 ++++++++++++++++++++++++++++++++++++ 2 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 fs/nfs/objlayout/pnfs_osd_xdr_cli.c (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild index 2e5b9a44773..7b2a5a24065 100644 --- a/fs/nfs/objlayout/Kbuild +++ b/fs/nfs/objlayout/Kbuild @@ -1,5 +1,5 @@ # # Makefile for the pNFS Objects Layout Driver kernel module # -objlayoutdriver-y := objio_osd.o +objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c new file mode 100644 index 00000000000..16fc758e912 --- /dev/null +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -0,0 +1,412 @@ +/* + * Object-Based pNFS Layout XDR layer + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * The following implementation is based on RFC5664 + */ + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static __be32 * +_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) +{ + p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, + sizeof(objid->oid_device_id.data)); + + p = xdr_decode_hyper(p, &objid->oid_partition_id); + p = xdr_decode_hyper(p, &objid->oid_object_id); + return p; +} +/* + * struct pnfs_osd_opaque_cred { + * u32 cred_len; + * void *cred; + * }; // xdr size [variable] + * The return pointers are from the xdr buffer + */ +static int +_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 1); + + if (!p) + return -EINVAL; + + opaque_cred->cred_len = be32_to_cpu(*p++); + + p = xdr_inline_decode(xdr, opaque_cred->cred_len); + if (!p) + return -EINVAL; + + opaque_cred->cred = p; + return 0; +} + +/* + * struct pnfs_osd_object_cred { + * struct pnfs_osd_objid oc_object_id; + * u32 oc_osd_version; + * u32 oc_cap_key_sec; + * struct pnfs_osd_opaque_cred oc_cap_key + * struct pnfs_osd_opaque_cred oc_cap; + * }; // xdr size 32 + 4 + 4 + [variable] + [variable] + */ +static int +_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); + int ret; + + if (!p) + return -EIO; + + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p); + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); + if (unlikely(ret)) + return ret; + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); + return ret; +} + +/* + * struct pnfs_osd_data_map { + * u32 odm_num_comps; + * u64 odm_stripe_unit; + * u32 odm_group_width; + * u32 odm_group_depth; + * u32 odm_mirror_cnt; + * u32 odm_raid_algorithm; + * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 + */ +static inline int +_osd_data_map_xdr_sz(void) +{ + return 4 + 8 + 4 + 4 + 4 + 4; +} + +static __be32 * +_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) +{ + data_map->odm_num_comps = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); + data_map->odm_group_width = be32_to_cpup(p++); + data_map->odm_group_depth = be32_to_cpup(p++); + data_map->odm_mirror_cnt = be32_to_cpup(p++); + data_map->odm_raid_algorithm = be32_to_cpup(p++); + dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " + "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", + __func__, + data_map->odm_num_comps, + (unsigned long long)data_map->odm_stripe_unit, + data_map->odm_group_width, + data_map->odm_group_depth, + data_map->odm_mirror_cnt, + data_map->odm_raid_algorithm); + return p; +} + +int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, + struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) +{ + __be32 *p; + + memset(iter, 0, sizeof(*iter)); + + p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); + if (unlikely(!p)) + return -EINVAL; + + p = _osd_xdr_decode_data_map(p, &layout->olo_map); + layout->olo_comps_index = be32_to_cpup(p++); + layout->olo_num_comps = be32_to_cpup(p++); + iter->total_comps = layout->olo_num_comps; + return 0; +} + +bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, + struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, + int *err) +{ + BUG_ON(iter->decoded_comps > iter->total_comps); + if (iter->decoded_comps == iter->total_comps) + return false; + + *err = _osd_xdr_decode_object_cred(comp, xdr); + if (unlikely(*err)) { + dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d " + "total_comps=%d\n", __func__, *err, + iter->decoded_comps, iter->total_comps); + return false; /* stop the loop */ + } + dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " + "key_len=%u cap_len=%u\n", + __func__, + _DEVID_LO(&comp->oc_object_id.oid_device_id), + _DEVID_HI(&comp->oc_object_id.oid_device_id), + comp->oc_object_id.oid_partition_id, + comp->oc_object_id.oid_object_id, + comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); + + iter->decoded_comps++; + return true; +} + +/* + * Get Device Information Decoding + * + * Note: since Device Information is currently done synchronously, all + * variable strings fields are left inside the rpc buffer and are only + * pointed to by the pnfs_osd_deviceaddr members. So the read buffer + * should not be freed while the returned information is in use. + */ +/* + *struct nfs4_string { + * unsigned int len; + * char *data; + *}; // size [variable] + * NOTE: Returned string points to inside the XDR buffer + */ +static __be32 * +__read_u8_opaque(__be32 *p, struct nfs4_string *str) +{ + str->len = be32_to_cpup(p++); + str->data = (char *)p; + + p += XDR_QUADLEN(str->len); + return p; +} + +/* + * struct pnfs_osd_targetid { + * u32 oti_type; + * struct nfs4_string oti_scsi_device_id; + * };// size 4 + [variable] + */ +static __be32 * +__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) +{ + u32 oti_type; + + oti_type = be32_to_cpup(p++); + targetid->oti_type = oti_type; + + switch (oti_type) { + case OBJ_TARGET_SCSI_NAME: + case OBJ_TARGET_SCSI_DEVICE_ID: + p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); + } + + return p; +} + +/* + * struct pnfs_osd_net_addr { + * struct nfs4_string r_netid; + * struct nfs4_string r_addr; + * }; + */ +static __be32 * +__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) +{ + p = __read_u8_opaque(p, &netaddr->r_netid); + p = __read_u8_opaque(p, &netaddr->r_addr); + + return p; +} + +/* + * struct pnfs_osd_targetaddr { + * u32 ota_available; + * struct pnfs_osd_net_addr ota_netaddr; + * }; + */ +static __be32 * +__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) +{ + u32 ota_available; + + ota_available = be32_to_cpup(p++); + targetaddr->ota_available = ota_available; + + if (ota_available) + p = __read_net_addr(p, &targetaddr->ota_netaddr); + + + return p; +} + +/* + * struct pnfs_osd_deviceaddr { + * struct pnfs_osd_targetid oda_targetid; + * struct pnfs_osd_targetaddr oda_targetaddr; + * u8 oda_lun[8]; + * struct nfs4_string oda_systemid; + * struct pnfs_osd_object_cred oda_root_obj_cred; + * struct nfs4_string oda_osdname; + * }; + */ + +/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does + * not have an xdr_stream + */ +static __be32 * +__read_opaque_cred(__be32 *p, + struct pnfs_osd_opaque_cred *opaque_cred) +{ + opaque_cred->cred_len = be32_to_cpu(*p++); + opaque_cred->cred = p; + return p + XDR_QUADLEN(opaque_cred->cred_len); +} + +static __be32 * +__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) +{ + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p++); + + p = __read_opaque_cred(p, &comp->oc_cap_key); + p = __read_opaque_cred(p, &comp->oc_cap); + return p; +} + +void pnfs_osd_xdr_decode_deviceaddr( + struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) +{ + p = __read_targetid(p, &deviceaddr->oda_targetid); + + p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); + + p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, + sizeof(deviceaddr->oda_lun)); + + p = __read_u8_opaque(p, &deviceaddr->oda_systemid); + + p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); + + p = __read_u8_opaque(p, &deviceaddr->oda_osdname); + + /* libosd likes this terminated in dbg. It's last, so no problems */ + deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; +} + +/* + * struct pnfs_osd_layoutupdate { + * u32 dsu_valid; + * s64 dsu_delta; + * u32 olu_ioerr_flag; + * }; xdr size 4 + 8 + 4 + */ +int +pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, + struct pnfs_osd_layoutupdate *lou) +{ + __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4); + + if (!p) + return -E2BIG; + + *p++ = cpu_to_be32(lou->dsu_valid); + if (lou->dsu_valid) + p = xdr_encode_hyper(p, lou->dsu_delta); + *p++ = cpu_to_be32(lou->olu_ioerr_flag); + return 0; +} + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static inline __be32 * +pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) +{ + p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, + sizeof(object_id->oid_device_id.data)); + p = xdr_encode_hyper(p, object_id->oid_partition_id); + p = xdr_encode_hyper(p, object_id->oid_object_id); + + return p; +} + +/* + * struct pnfs_osd_ioerr { + * struct pnfs_osd_objid oer_component; + * u64 oer_comp_offset; + * u64 oer_comp_length; + * u32 oer_iswrite; + * u32 oer_errno; + * }; // xdr size 32 + 24 bytes + */ +void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) +{ + p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); + p = xdr_encode_hyper(p, ioerr->oer_comp_offset); + p = xdr_encode_hyper(p, ioerr->oer_comp_length); + *p++ = cpu_to_be32(ioerr->oer_iswrite); + *p = cpu_to_be32(ioerr->oer_errno); +} + +__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 32 + 24); + if (unlikely(!p)) + dprintk("%s: out of xdr space\n", __func__); + + return p; +} -- cgit v1.2.3 From 09f5bf4e6d0607399c16ec7a2d8d166f31086686 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 22 May 2011 19:50:20 +0300 Subject: pnfs-obj: decode layout, alloc/free lseg objlayout_alloc_lseg prepares an xdr_stream and calls the raid engins objio_alloc_lseg() to allocate a private pnfs_layout_segment. objio_osd.c::objio_alloc_lseg() uses passed xdr_stream to decode and store the layout_segment information in an objio_segment struct, using the pnfs_osd_xdr.h API for the actual parsing the layout xdr. objlayout_free_lseg calls objio_free_lseg() to free the allocated space. Signed-off-by: Boaz Harrosh [gfp_flags] [removed "extern" from function definitions] Signed-off-by: Benny Halevy --- fs/nfs/objlayout/Kbuild | 2 +- fs/nfs/objlayout/objio_osd.c | 170 ++++++++++++++++++++++++++++++++++++++++++- fs/nfs/objlayout/objlayout.c | 104 ++++++++++++++++++++++++++ fs/nfs/objlayout/objlayout.h | 67 +++++++++++++++++ 4 files changed, 341 insertions(+), 2 deletions(-) create mode 100644 fs/nfs/objlayout/objlayout.c create mode 100644 fs/nfs/objlayout/objlayout.h (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild index 7b2a5a24065..ed30ea072bb 100644 --- a/fs/nfs/objlayout/Kbuild +++ b/fs/nfs/objlayout/Kbuild @@ -1,5 +1,5 @@ # # Makefile for the pNFS Objects Layout Driver kernel module # -objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o +objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 379595f2d1c..08f1d90b4ce 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -38,11 +38,179 @@ */ #include -#include "../pnfs.h" +#include + +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +#define _LLU(x) ((unsigned long long)x) + +struct caps_buffers { + u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; + u8 creds[OSD_CAP_LEN]; +}; + +struct objio_segment { + struct pnfs_layout_segment lseg; + + struct pnfs_osd_object_cred *comps; + + unsigned mirrors_p1; + unsigned stripe_unit; + unsigned group_width; /* Data stripe_units without integrity comps */ + u64 group_depth; + unsigned group_count; + + unsigned comps_index; + unsigned num_comps; + /* variable length */ + struct objio_dev_ent *ods[]; +}; + +static inline struct objio_segment * +OBJIO_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, struct objio_segment, lseg); +} + +static int _verify_data_map(struct pnfs_osd_layout *layout) +{ + struct pnfs_osd_data_map *data_map = &layout->olo_map; + u64 stripe_length; + u32 group_width; + +/* FIXME: Only raid0 for now. if not go through MDS */ + if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { + printk(KERN_ERR "Only RAID_0 for now\n"); + return -ENOTSUPP; + } + if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { + printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", + data_map->odm_num_comps, data_map->odm_mirror_cnt); + return -EINVAL; + } + + if (data_map->odm_group_width) + group_width = data_map->odm_group_width; + else + group_width = data_map->odm_num_comps / + (data_map->odm_mirror_cnt + 1); + + stripe_length = (u64)data_map->odm_stripe_unit * group_width; + if (stripe_length >= (1ULL << 32)) { + printk(KERN_ERR "Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -ENOTSUPP; + } + + if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { + printk(KERN_ERR "Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(data_map->odm_stripe_unit), PAGE_SIZE); + return -ENOTSUPP; + } + + return 0; +} + +static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, + struct pnfs_osd_object_cred *src_comp, + struct caps_buffers *caps_p) +{ + WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); + WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); + + *cur_comp = *src_comp; + + memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, + sizeof(caps_p->caps_key)); + cur_comp->oc_cap_key.cred = caps_p->caps_key; + + memcpy(caps_p->creds, src_comp->oc_cap.cred, + sizeof(caps_p->creds)); + cur_comp->oc_cap.cred = caps_p->creds; +} + +int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags) +{ + struct objio_segment *objio_seg; + struct pnfs_osd_xdr_decode_layout_iter iter; + struct pnfs_osd_layout layout; + struct pnfs_osd_object_cred *cur_comp, src_comp; + struct caps_buffers *caps_p; + int err; + + err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); + if (unlikely(err)) + return err; + + err = _verify_data_map(&layout); + if (unlikely(err)) + return err; + + objio_seg = kzalloc(sizeof(*objio_seg) + + sizeof(objio_seg->ods[0]) * layout.olo_num_comps + + sizeof(*objio_seg->comps) * layout.olo_num_comps + + sizeof(struct caps_buffers) * layout.olo_num_comps, + gfp_flags); + if (!objio_seg) + return -ENOMEM; + + objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); + cur_comp = objio_seg->comps; + caps_p = (void *)(cur_comp + layout.olo_num_comps); + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) + copy_single_comp(cur_comp++, &src_comp, caps_p++); + if (unlikely(err)) + goto err; + + objio_seg->num_comps = layout.olo_num_comps; + objio_seg->comps_index = layout.olo_comps_index; + + objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; + if (layout.olo_map.odm_group_width) { + objio_seg->group_width = layout.olo_map.odm_group_width; + objio_seg->group_depth = layout.olo_map.odm_group_depth; + objio_seg->group_count = layout.olo_map.odm_num_comps / + objio_seg->mirrors_p1 / + objio_seg->group_width; + } else { + objio_seg->group_width = layout.olo_map.odm_num_comps / + objio_seg->mirrors_p1; + objio_seg->group_depth = -1; + objio_seg->group_count = 1; + } + + *outp = &objio_seg->lseg; + return 0; + +err: + kfree(objio_seg); + dprintk("%s: Error: return %d\n", __func__, err); + *outp = NULL; + return err; +} + +void objio_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + + kfree(objio_seg); +} + static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", + + .alloc_lseg = objlayout_alloc_lseg, + .free_lseg = objlayout_free_lseg, }; MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c new file mode 100644 index 00000000000..946526763d3 --- /dev/null +++ b/fs/nfs/objlayout/objlayout.c @@ -0,0 +1,104 @@ +/* + * pNFS Objects layout driver high level definitions + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD +/* + * Unmarshall layout and store it in pnfslay. + */ +struct pnfs_layout_segment * +objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + int status = -ENOMEM; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = lgr->layoutp->pages, + .page_len = lgr->layoutp->len, + .buflen = lgr->layoutp->len, + .len = lgr->layoutp->len, + }; + struct page *scratch; + struct pnfs_layout_segment *lseg; + + dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); + + scratch = alloc_page(gfp_flags); + if (!scratch) + goto err_nofree; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); + if (unlikely(status)) { + dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, + status); + goto err; + } + + __free_page(scratch); + + dprintk("%s: Return %p\n", __func__, lseg); + return lseg; + +err: + __free_page(scratch); +err_nofree: + dprintk("%s: Err Return=>%d\n", __func__, status); + return ERR_PTR(status); +} + +/* + * Free a layout segement + */ +void +objlayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s: freeing layout segment %p\n", __func__, lseg); + + if (unlikely(!lseg)) + return; + + objio_free_lseg(lseg); +} + diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h new file mode 100644 index 00000000000..066280a07fa --- /dev/null +++ b/fs/nfs/objlayout/objlayout.h @@ -0,0 +1,67 @@ +/* + * Data types and function declerations for interfacing with the + * pNFS standard object layout driver. + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _OBJLAYOUT_H +#define _OBJLAYOUT_H + +#include +#include +#include "../pnfs.h" + +/* + * Raid engine I/O API + */ +extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags); +extern void objio_free_lseg(struct pnfs_layout_segment *lseg); + +/* + * exported generic objects function vectors + */ +extern struct pnfs_layout_segment *objlayout_alloc_lseg( + struct pnfs_layout_hdr *, + struct nfs4_layoutget_res *, + gfp_t gfp_flags); +extern void objlayout_free_lseg(struct pnfs_layout_segment *); + +#endif /* _OBJLAYOUT_H */ -- cgit v1.2.3 From b6c05f1693115164c7b797152ac7ea3ef8e5d296 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 26 May 2011 21:45:34 +0300 Subject: pnfs-obj: objio_osd device information retrieval and caching When a new layout is received in objio_alloc_lseg all device_ids referenced are retrieved. The device information is queried for from MDS and then the osd_device is looked-up from the osd-initiator library. The devices are cached in a per-mount-point list, for later use. At unmount all devices are "put" back to the library. objlayout_get_deviceinfo(), objlayout_put_deviceinfo() middleware API for retrieving device information given a device_id. TODO: The device cache can get big. Cap its size. Keep an LRU and start to return devices which were not used, when list gets to big, or when new entries allocation fail. [pnfs-obj: Bugs in new global-device-cache code] Signed-off-by: Boaz Harrosh [gfp_flags] [use global device cache] [use layout driver in global device cache] Signed-off-by: Benny Halevy --- fs/nfs/objlayout/objio_osd.c | 157 +++++++++++++++++++++++++++++++++++++++++++ fs/nfs/objlayout/objlayout.c | 68 +++++++++++++++++++ fs/nfs/objlayout/objlayout.h | 8 +++ 3 files changed, 233 insertions(+) (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 08f1d90b4ce..2255e2d22d0 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -46,6 +46,68 @@ #define _LLU(x) ((unsigned long long)x) +struct objio_dev_ent { + struct nfs4_deviceid_node id_node; + struct osd_dev *od; +}; + +static void +objio_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); + + dprintk("%s: free od=%p\n", __func__, de->od); + osduld_put_device(de->od); + kfree(de); +} + +static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss, + const struct nfs4_deviceid *d_id) +{ + struct nfs4_deviceid_node *d; + struct objio_dev_ent *de; + + d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id); + if (!d) + return NULL; + + de = container_of(d, struct objio_dev_ent, id_node); + return de; +} + +static struct objio_dev_ent * +_dev_list_add(const struct nfs_server *nfss, + const struct nfs4_deviceid *d_id, struct osd_dev *od, + gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags); + struct objio_dev_ent *n; + + if (!de) { + dprintk("%s: -ENOMEM od=%p\n", __func__, od); + return NULL; + } + + dprintk("%s: Adding od=%p\n", __func__, od); + nfs4_init_deviceid_node(&de->id_node, + nfss->pnfs_curr_ld, + nfss->nfs_client, + d_id); + de->od = od; + + d = nfs4_insert_deviceid_node(&de->id_node); + n = container_of(d, struct objio_dev_ent, id_node); + if (n != de) { + dprintk("%s: Race with other n->od=%p\n", __func__, n->od); + objio_free_deviceid_node(&de->id_node); + de = n; + } + + atomic_inc(&de->id_node.ref); + return de; +} + struct caps_buffers { u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; u8 creds[OSD_CAP_LEN]; @@ -74,6 +136,90 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) return container_of(lseg, struct objio_segment, lseg); } +/* Send and wait for a get_device_info of devices in the layout, + then look them up with the osd_initiator library */ +static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned comp, + gfp_t gfp_flags) +{ + struct pnfs_osd_deviceaddr *deviceaddr; + struct nfs4_deviceid *d_id; + struct objio_dev_ent *ode; + struct osd_dev *od; + struct osd_dev_info odi; + int err; + + d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; + + ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); + if (ode) + return ode; + + err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); + if (unlikely(err)) { + dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", + __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); + return ERR_PTR(err); + } + + odi.systemid_len = deviceaddr->oda_systemid.len; + if (odi.systemid_len > sizeof(odi.systemid)) { + err = -EINVAL; + goto out; + } else if (odi.systemid_len) + memcpy(odi.systemid, deviceaddr->oda_systemid.data, + odi.systemid_len); + odi.osdname_len = deviceaddr->oda_osdname.len; + odi.osdname = (u8 *)deviceaddr->oda_osdname.data; + + if (!odi.osdname_len && !odi.systemid_len) { + dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", + __func__); + err = -ENODEV; + goto out; + } + + od = osduld_info_lookup(&odi); + if (unlikely(IS_ERR(od))) { + err = PTR_ERR(od); + dprintk("%s: osduld_info_lookup => %d\n", __func__, err); + goto out; + } + + ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, + gfp_flags); + +out: + dprintk("%s: return=%d\n", __func__, err); + objlayout_put_deviceinfo(deviceaddr); + return err ? ERR_PTR(err) : ode; +} + +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, + gfp_t gfp_flags) +{ + unsigned i; + int err; + + /* lookup all devices */ + for (i = 0; i < objio_seg->num_comps; i++) { + struct objio_dev_ent *ode; + + ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); + if (unlikely(IS_ERR(ode))) { + err = PTR_ERR(ode); + goto out; + } + objio_seg->ods[i] = ode; + } + err = 0; + +out: + dprintk("%s: return=%d\n", __func__, err); + return err; +} + static int _verify_data_map(struct pnfs_osd_layout *layout) { struct pnfs_osd_data_map *data_map = &layout->olo_map; @@ -171,6 +317,9 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, objio_seg->num_comps = layout.olo_num_comps; objio_seg->comps_index = layout.olo_comps_index; + err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); + if (err) + goto err; objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; @@ -199,8 +348,14 @@ err: void objio_free_lseg(struct pnfs_layout_segment *lseg) { + int i; struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + for (i = 0; i < objio_seg->num_comps; i++) { + if (!objio_seg->ods[i]) + break; + nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); + } kfree(objio_seg); } @@ -211,6 +366,8 @@ static struct pnfs_layoutdriver_type objlayout_type = { .alloc_lseg = objlayout_alloc_lseg, .free_lseg = objlayout_free_lseg, + + .free_deviceid_node = objio_free_deviceid_node, }; MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 946526763d3..10e5fca3b7f 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -102,3 +102,71 @@ objlayout_free_lseg(struct pnfs_layout_segment *lseg) objio_free_lseg(lseg); } +/* + * Get Device Info API for io engines + */ +struct objlayout_deviceinfo { + struct page *page; + struct pnfs_osd_deviceaddr da; /* This must be last */ +}; + +/* Initialize and call nfs_getdeviceinfo, then decode and return a + * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() + * should be called. + */ +int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, + struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, + gfp_t gfp_flags) +{ + struct objlayout_deviceinfo *odi; + struct pnfs_device pd; + struct super_block *sb; + struct page *page, **pages; + u32 *p; + int err; + + page = alloc_page(gfp_flags); + if (!page) + return -ENOMEM; + + pages = &page; + pd.pages = pages; + + memcpy(&pd.dev_id, d_id, sizeof(*d_id)); + pd.layout_type = LAYOUT_OSD2_OBJECTS; + pd.pages = &page; + pd.pgbase = 0; + pd.pglen = PAGE_SIZE; + pd.mincount = 0; + + sb = pnfslay->plh_inode->i_sb; + err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd); + dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); + if (err) + goto err_out; + + p = page_address(page); + odi = kzalloc(sizeof(*odi), gfp_flags); + if (!odi) { + err = -ENOMEM; + goto err_out; + } + pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); + odi->page = page; + *deviceaddr = &odi->da; + return 0; + +err_out: + __free_page(page); + return err; +} + +void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) +{ + struct objlayout_deviceinfo *odi = container_of(deviceaddr, + struct objlayout_deviceinfo, + da); + + __free_page(odi->page); + kfree(odi); +} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 066280a07fa..0814271bb9b 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -55,6 +55,14 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); +/* + * callback API + */ +extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, + struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, + gfp_t gfp_flags); +extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); + /* * exported generic objects function vectors */ -- cgit v1.2.3 From e51b841dd0be9ff53f740c44c32c32679edcb7c8 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 22 May 2011 19:51:48 +0300 Subject: pnfs-obj: define per-inode private structure allocate and deallocate per-inode private pnfs_layout_hdr in preparation for I/O implementation. Signed-off-by: Boaz Harrosh Signed-off-by: Benny Halevy --- fs/nfs/objlayout/objio_osd.c | 3 +++ fs/nfs/objlayout/objlayout.c | 26 ++++++++++++++++++++++++++ fs/nfs/objlayout/objlayout.h | 17 +++++++++++++++++ 3 files changed, 46 insertions(+) (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 2255e2d22d0..353821f7937 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -364,6 +364,9 @@ static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", + .alloc_layout_hdr = objlayout_alloc_layout_hdr, + .free_layout_hdr = objlayout_free_layout_hdr, + .alloc_lseg = objlayout_alloc_lseg, .free_lseg = objlayout_free_lseg, diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 10e5fca3b7f..f14b4da3405 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -41,6 +41,32 @@ #include "objlayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD +/* + * Create a objlayout layout structure for the given inode and return it. + */ +struct pnfs_layout_hdr * +objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct objlayout *objlay; + + objlay = kzalloc(sizeof(struct objlayout), gfp_flags); + dprintk("%s: Return %p\n", __func__, objlay); + return &objlay->pnfs_layout; +} + +/* + * Free an objlayout layout structure + */ +void +objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct objlayout *objlay = OBJLAYOUT(lo); + + dprintk("%s: objlay %p\n", __func__, objlay); + + kfree(objlay); +} + /* * Unmarshall layout and store it in pnfslay. */ diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 0814271bb9b..fa0262149f5 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -45,6 +45,19 @@ #include #include "../pnfs.h" +/* + * per-inode layout + */ +struct objlayout { + struct pnfs_layout_hdr pnfs_layout; +}; + +static inline struct objlayout * +OBJLAYOUT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct objlayout, pnfs_layout); +} + /* * Raid engine I/O API */ @@ -66,6 +79,10 @@ extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); /* * exported generic objects function vectors */ + +extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); +extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); + extern struct pnfs_layout_segment *objlayout_alloc_lseg( struct pnfs_layout_hdr *, struct nfs4_layoutget_res *, -- cgit v1.2.3 From 04f83450388e87d86b387cf4a27b81eb7e69de7d Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 22 May 2011 19:52:19 +0300 Subject: pnfs-obj: osd raid engine read/write implementation With the use of the in-kernel osd library. Implement read/write of data from/to osd-objects according to information specified in the objects-layout. Support for stripping over mirrors with a received stripe_unit. There are however a few constrains which are not supported: 1. Stripe Unit must be a multiple of PAGE_SIZE 2. stripe length (stripe_unit * number_of_stripes) can not be bigger then 32bit. Also support raid-groups and partial-layout. Partial-layout is when not all the groups are received on the line, addressing only a partial range of the file. TODO: Only raid0! raid 4/5/6 support will come at later stage A none supported layout will send IO through the MDS [Important fallout from the last rebase] Signed-off-by: Boaz Harrosh [gfp_flags] Signed-off-by: Benny Halevy --- fs/nfs/objlayout/objio_osd.c | 605 +++++++++++++++++++++++++++++++++++++++++++ fs/nfs/objlayout/objlayout.c | 254 ++++++++++++++++++ fs/nfs/objlayout/objlayout.h | 42 +++ 3 files changed, 901 insertions(+) (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 353821f7937..cc92d3b3dc3 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -46,6 +46,10 @@ #define _LLU(x) ((unsigned long long)x) +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), +}; + struct objio_dev_ent { struct nfs4_deviceid_node id_node; struct osd_dev *od; @@ -136,6 +140,31 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) return container_of(lseg, struct objio_segment, lseg); } +struct objio_state; +typedef ssize_t (*objio_done_fn)(struct objio_state *ios); + +struct objio_state { + /* Generic layer */ + struct objlayout_io_state ol_state; + + struct objio_segment *layout; + + struct kref kref; + objio_done_fn done; + void *private; + + unsigned long length; + unsigned numdevs; /* Actually used devs in this IO */ + /* A per-device variable array of size numdevs */ + struct _objio_per_comp { + struct bio *bio; + struct osd_request *or; + unsigned long length; + u64 offset; + unsigned dev; + } per_dev[]; +}; + /* Send and wait for a get_device_info of devices in the layout, then look them up with the osd_initiator library */ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, @@ -359,6 +388,578 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) kfree(objio_seg); } +int objio_alloc_io_state(struct pnfs_layout_segment *lseg, + struct objlayout_io_state **outp, + gfp_t gfp_flags) +{ + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + struct objio_state *ios; + const unsigned first_size = sizeof(*ios) + + objio_seg->num_comps * sizeof(ios->per_dev[0]); + + ios = kzalloc(first_size, gfp_flags); + if (unlikely(!ios)) + return -ENOMEM; + + ios->layout = objio_seg; + + *outp = &ios->ol_state; + return 0; +} + +void objio_free_io_state(struct objlayout_io_state *ol_state) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + + kfree(ios); +} + +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + __bio_for_each_segment(bv, bio, i, 0) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + +static int _io_check(struct objio_state *ios, bool is_write) +{ + enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; + int lin_ret = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + struct osd_request *or = ios->per_dev[i].or; + unsigned dev; + int ret; + + if (!or) + continue; + + ret = osd_req_decode_sense(or, &osi); + if (likely(!ret)) + continue; + + if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { + /* start read offset passed endof file */ + BUG_ON(is_write); + _clear_bio(ios->per_dev[i].bio); + dprintk("%s: start read offset passed end of file " + "offset=0x%llx, length=0x%lx\n", __func__, + _LLU(ios->per_dev[i].offset), + ios->per_dev[i].length); + + continue; /* we recovered */ + } + dev = ios->per_dev[i].dev; + + if (osi.osd_err_pri >= oep) { + oep = osi.osd_err_pri; + lin_ret = ret; + } + } + + return lin_ret; +} + +/* + * Common IO state helpers. + */ +static void _io_free(struct objio_state *ios) +{ + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct _objio_per_comp *per_dev = &ios->per_dev[i]; + + if (per_dev->or) { + osd_end_request(per_dev->or); + per_dev->or = NULL; + } + + if (per_dev->bio) { + bio_put(per_dev->bio); + per_dev->bio = NULL; + } + } +} + +struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) +{ + unsigned min_dev = ios->layout->comps_index; + unsigned max_dev = min_dev + ios->layout->num_comps; + + BUG_ON(dev < min_dev || max_dev <= dev); + return ios->layout->ods[dev - min_dev]->od; +} + +struct _striping_info { + u64 obj_offset; + u64 group_length; + unsigned dev; + unsigned unit_off; +}; + +static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, + struct _striping_info *si) +{ + u32 stripe_unit = ios->layout->stripe_unit; + u32 group_width = ios->layout->group_width; + u64 group_depth = ios->layout->group_depth; + u32 U = stripe_unit * group_width; + + u64 T = U * group_depth; + u64 S = T * ios->layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodU = file_offset - M * S; + u32 G = div64_u64(LmodU, T); + u64 H = LmodU - G * T; + + u32 N = div_u64(H, U); + + div_u64_rem(file_offset, stripe_unit, &si->unit_off); + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; + si->dev *= ios->layout->mirrors_p1; + + si->group_length = T - H; +} + +static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, + unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, + gfp_t gfp_flags) +{ + unsigned pg = *cur_pg; + struct request_queue *q = + osd_request_queue(_io_od(ios, per_dev->dev)); + + per_dev->length += cur_len; + + if (per_dev->bio == NULL) { + unsigned stripes = ios->layout->num_comps / + ios->layout->mirrors_p1; + unsigned pages_in_stripe = stripes * + (ios->layout->stripe_unit / PAGE_SIZE); + unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / + stripes; + + if (BIO_MAX_PAGES_KMALLOC < bio_size) + bio_size = BIO_MAX_PAGES_KMALLOC; + + per_dev->bio = bio_kmalloc(gfp_flags, bio_size); + if (unlikely(!per_dev->bio)) { + dprintk("Faild to allocate BIO size=%u\n", bio_size); + return -ENOMEM; + } + } + + while (cur_len > 0) { + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; + + BUG_ON(ios->ol_state.nr_pages <= pg); + cur_len -= pglen; + + added_len = bio_add_pc_page(q, per_dev->bio, + ios->ol_state.pages[pg], pglen, pgbase); + if (unlikely(pglen != added_len)) + return -ENOMEM; + pgbase = 0; + ++pg; + } + BUG_ON(cur_len); + + *cur_pg = pg; + return 0; +} + +static int _prepare_one_group(struct objio_state *ios, u64 length, + struct _striping_info *si, unsigned *last_pg, + gfp_t gfp_flags) +{ + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned dev = si->dev; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; + unsigned cur_pg = *last_pg; + int ret = 0; + + while (length) { + struct _objio_per_comp *per_dev = &ios->per_dev[dev]; + unsigned cur_len, page_off = 0; + + if (!per_dev->length) { + per_dev->dev = dev; + if (dev < si->dev) { + per_dev->offset = si->obj_offset + stripe_unit - + si->unit_off; + cur_len = stripe_unit; + } else if (dev == si->dev) { + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && + (page_off != ios->ol_state.pgbase)); + } else { /* dev > si->dev */ + per_dev->offset = si->obj_offset - si->unit_off; + cur_len = stripe_unit; + } + + if (max_comp < dev) + max_comp = dev; + } else { + cur_len = stripe_unit; + } + if (cur_len >= length) + cur_len = length; + + ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, + cur_len, gfp_flags); + if (unlikely(ret)) + goto out; + + dev += mirrors_p1; + dev = (dev % devs_in_group) + first_dev; + + length -= cur_len; + ios->length += cur_len; + } +out: + ios->numdevs = max_comp + mirrors_p1; + *last_pg = cur_pg; + return ret; +} + +static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) +{ + u64 length = ios->ol_state.count; + u64 offset = ios->ol_state.offset; + struct _striping_info si; + unsigned last_pg = 0; + int ret = 0; + + while (length) { + _calc_stripe_info(ios, offset, &si); + + if (length < si.group_length) + si.group_length = length; + + ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); + if (unlikely(ret)) + goto out; + + offset += si.group_length; + length -= si.group_length; + } + +out: + if (!ios->length) + return ret; + + return 0; +} + +static ssize_t _sync_done(struct objio_state *ios) +{ + struct completion *waiting = ios->private; + + complete(waiting); + return 0; +} + +static void _last_io(struct kref *kref) +{ + struct objio_state *ios = container_of(kref, struct objio_state, kref); + + ios->done(ios); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct objio_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +static ssize_t _io_exec(struct objio_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + ssize_t status = 0; /* sync status */ + unsigned i; + objio_done_fn saved_done_fn = ios->done; + bool sync = ios->ol_state.sync; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + + if (!or) + continue; + + kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + + if (sync) { + wait_for_completion(&wait); + status = saved_done_fn(ios); + } + + return status; +} + +/* + * read + */ +static ssize_t _read_done(struct objio_state *ios) +{ + ssize_t status; + int ret = _io_check(ios, false); + + _io_free(ios); + + if (likely(!ret)) + status = ios->length; + else + status = ret; + + objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); + return status; +} + +static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) +{ + struct osd_request *or = NULL; + struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; + unsigned dev = per_dev->dev; + struct pnfs_osd_object_cred *cred = + &ios->layout->comps[dev]; + struct osd_obj_id obj = { + .partition = cred->oc_object_id.oid_partition_id, + .id = cred->oc_object_id.oid_object_id, + }; + int ret; + + or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); + if (unlikely(!or)) { + ret = -ENOMEM; + goto err; + } + per_dev->or = or; + + osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); + + ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + if (ret) { + dprintk("%s: Faild to osd_finalize_request() => %d\n", + __func__, ret); + goto err; + } + + dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", + __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), + per_dev->length); + +err: + return ret; +} + +static ssize_t _read_exec(struct objio_state *ios) +{ + unsigned i; + int ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + if (!ios->per_dev[i].length) + continue; + ret = _read_mirrors(ios, i); + if (unlikely(ret)) + goto err; + } + + ios->done = _read_done; + return _io_exec(ios); /* In sync mode exec returns the io status */ + +err: + _io_free(ios); + return ret; +} + +ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + int ret; + + ret = _io_rw_pagelist(ios, GFP_KERNEL); + if (unlikely(ret)) + return ret; + + return _read_exec(ios); +} + +/* + * write + */ +static ssize_t _write_done(struct objio_state *ios) +{ + ssize_t status; + int ret = _io_check(ios, true); + + _io_free(ios); + + if (likely(!ret)) { + /* FIXME: should be based on the OSD's persistence model + * See OSD2r05 Section 4.13 Data persistence model */ + ios->ol_state.committed = NFS_FILE_SYNC; + status = ios->length; + } else { + status = ret; + } + + objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); + return status; +} + +static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) +{ + struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret; + + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct osd_request *or = NULL; + struct pnfs_osd_object_cred *cred = + &ios->layout->comps[dev]; + struct osd_obj_id obj = { + .partition = cred->oc_object_id.oid_partition_id, + .id = cred->oc_object_id.oid_object_id, + }; + struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; + struct bio *bio; + + or = osd_start_request(_io_od(ios, dev), GFP_NOFS); + if (unlikely(!or)) { + ret = -ENOMEM; + goto err; + } + per_dev->or = or; + + if (per_dev != master_dev) { + bio = bio_kmalloc(GFP_NOFS, + master_dev->bio->bi_max_vecs); + if (unlikely(!bio)) { + dprintk("Faild to allocate BIO size=%u\n", + master_dev->bio->bi_max_vecs); + ret = -ENOMEM; + goto err; + } + + __bio_clone(bio, master_dev->bio); + bio->bi_bdev = NULL; + bio->bi_next = NULL; + per_dev->bio = bio; + per_dev->dev = dev; + per_dev->length = master_dev->length; + per_dev->offset = master_dev->offset; + } else { + bio = master_dev->bio; + bio->bi_rw |= REQ_WRITE; + } + + osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); + + ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + if (ret) { + dprintk("%s: Faild to osd_finalize_request() => %d\n", + __func__, ret); + goto err; + } + + dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", + __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), + per_dev->length); + } + +err: + return ret; +} + +static ssize_t _write_exec(struct objio_state *ios) +{ + unsigned i; + int ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + if (!ios->per_dev[i].length) + continue; + ret = _write_mirrors(ios, i); + if (unlikely(ret)) + goto err; + } + + ios->done = _write_done; + return _io_exec(ios); /* In sync mode exec returns the io->status */ + +err: + _io_free(ios); + return ret; +} + +ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + int ret; + + /* TODO: ios->stable = stable; */ + ret = _io_rw_pagelist(ios, GFP_NOFS); + if (unlikely(ret)) + return ret; + + return _write_exec(ios); +} + +/* + * objlayout_pg_test(). Called by nfs_can_coalesce_requests() + * + * return 1 : coalesce page + * return 0 : don't coalesce page + */ +int +objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + return 1; +} static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, @@ -370,6 +971,10 @@ static struct pnfs_layoutdriver_type objlayout_type = { .alloc_lseg = objlayout_alloc_lseg, .free_lseg = objlayout_free_lseg, + .read_pagelist = objlayout_read_pagelist, + .write_pagelist = objlayout_write_pagelist, + .pg_test = objlayout_pg_test, + .free_deviceid_node = objio_free_deviceid_node, }; diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index f14b4da3405..5157ef6d004 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -128,6 +128,260 @@ objlayout_free_lseg(struct pnfs_layout_segment *lseg) objio_free_lseg(lseg); } +/* + * I/O Operations + */ +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? end - 1 : NFS4_MAX_UINT64; +} + +static struct objlayout_io_state * +objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, + struct page **pages, + unsigned pgbase, + loff_t offset, + size_t count, + struct pnfs_layout_segment *lseg, + void *rpcdata, + gfp_t gfp_flags) +{ + struct objlayout_io_state *state; + u64 lseg_end_offset; + + dprintk("%s: allocating io_state\n", __func__); + if (objio_alloc_io_state(lseg, &state, gfp_flags)) + return NULL; + + BUG_ON(offset < lseg->pls_range.offset); + lseg_end_offset = end_offset(lseg->pls_range.offset, + lseg->pls_range.length); + BUG_ON(offset >= lseg_end_offset); + if (offset + count > lseg_end_offset) { + count = lseg->pls_range.length - + (offset - lseg->pls_range.offset); + dprintk("%s: truncated count %Zd\n", __func__, count); + } + + if (pgbase > PAGE_SIZE) { + pages += pgbase >> PAGE_SHIFT; + pgbase &= ~PAGE_MASK; + } + + state->lseg = lseg; + state->rpcdata = rpcdata; + state->pages = pages; + state->pgbase = pgbase; + state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; + state->offset = offset; + state->count = count; + state->sync = 0; + + return state; +} + +static void +objlayout_free_io_state(struct objlayout_io_state *state) +{ + dprintk("%s: freeing io_state\n", __func__); + if (unlikely(!state)) + return; + + objio_free_io_state(state); +} + +/* + * I/O done common code + */ +static void +objlayout_iodone(struct objlayout_io_state *state) +{ + dprintk("%s: state %p status\n", __func__, state); + + objlayout_free_io_state(state); +} + +/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). + * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_read_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_read_data *rdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_read_data, task); + + pnfs_ld_read_done(rdata); +} + +void +objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +{ + int eof = state->eof; + struct nfs_read_data *rdata; + + state->status = status; + dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); + rdata = state->rpcdata; + rdata->task.tk_status = status; + if (status >= 0) { + rdata->res.count = status; + rdata->res.eof = eof; + } + objlayout_iodone(state); + /* must not use state after this point */ + + if (sync) + pnfs_ld_read_done(rdata); + else { + INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); + schedule_work(&rdata->task.u.tk_work); + } +} + +/* + * Perform sync or async reads. + */ +enum pnfs_try_status +objlayout_read_pagelist(struct nfs_read_data *rdata) +{ + loff_t offset = rdata->args.offset; + size_t count = rdata->args.count; + struct objlayout_io_state *state; + ssize_t status = 0; + loff_t eof; + + dprintk("%s: Begin inode %p offset %llu count %d\n", + __func__, rdata->inode, offset, (int)count); + + eof = i_size_read(rdata->inode); + if (unlikely(offset + count > eof)) { + if (offset >= eof) { + status = 0; + rdata->res.count = 0; + rdata->res.eof = 1; + goto out; + } + count = eof - offset; + } + + state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, + rdata->args.pages, rdata->args.pgbase, + offset, count, + rdata->lseg, rdata, + GFP_KERNEL); + if (unlikely(!state)) { + status = -ENOMEM; + goto out; + } + + state->eof = state->offset + state->count >= eof; + + status = objio_read_pagelist(state); + out: + dprintk("%s: Return status %Zd\n", __func__, status); + rdata->pnfs_error = status; + return PNFS_ATTEMPTED; +} + +/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). + * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_write_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + + pnfs_ld_write_done(wdata); +} + +void +objlayout_write_done(struct objlayout_io_state *state, ssize_t status, + bool sync) +{ + struct nfs_write_data *wdata; + + dprintk("%s: Begin\n", __func__); + wdata = state->rpcdata; + state->status = status; + wdata->task.tk_status = status; + if (status >= 0) { + wdata->res.count = status; + wdata->verf.committed = state->committed; + dprintk("%s: Return status %d committed %d\n", + __func__, wdata->task.tk_status, + wdata->verf.committed); + } else + dprintk("%s: Return status %d\n", + __func__, wdata->task.tk_status); + objlayout_iodone(state); + /* must not use state after this point */ + + if (sync) + pnfs_ld_write_done(wdata); + else { + INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); + schedule_work(&wdata->task.u.tk_work); + } +} + +/* + * Perform sync or async writes. + */ +enum pnfs_try_status +objlayout_write_pagelist(struct nfs_write_data *wdata, + int how) +{ + struct objlayout_io_state *state; + ssize_t status; + + dprintk("%s: Begin inode %p offset %llu count %u\n", + __func__, wdata->inode, wdata->args.offset, wdata->args.count); + + state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, + wdata->args.pages, + wdata->args.pgbase, + wdata->args.offset, + wdata->args.count, + wdata->lseg, wdata, + GFP_NOFS); + if (unlikely(!state)) { + status = -ENOMEM; + goto out; + } + + state->sync = how & FLUSH_SYNC; + + status = objio_write_pagelist(state, how & FLUSH_STABLE); + out: + dprintk("%s: Return status %Zd\n", __func__, status); + wdata->pnfs_error = status; + return PNFS_ATTEMPTED; +} + /* * Get Device Info API for io engines */ diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index fa0262149f5..9a405e8069f 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -58,6 +58,26 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) return container_of(lo, struct objlayout, pnfs_layout); } +/* + * per-I/O operation state + * embedded in objects provider io_state data structure + */ +struct objlayout_io_state { + struct pnfs_layout_segment *lseg; + + struct page **pages; + unsigned pgbase; + unsigned nr_pages; + unsigned long count; + loff_t offset; + bool sync; + + void *rpcdata; + int status; /* res */ + int eof; /* res */ + int committed; /* res */ +}; + /* * Raid engine I/O API */ @@ -68,9 +88,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); +extern int objio_alloc_io_state( + struct pnfs_layout_segment *lseg, + struct objlayout_io_state **outp, + gfp_t gfp_flags); +extern void objio_free_io_state(struct objlayout_io_state *state); + +extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); +extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, + bool stable); + /* * callback API */ +extern void objlayout_read_done(struct objlayout_io_state *state, + ssize_t status, bool sync); +extern void objlayout_write_done(struct objlayout_io_state *state, + ssize_t status, bool sync); + extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, gfp_t gfp_flags); @@ -89,4 +124,11 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg( gfp_t gfp_flags); extern void objlayout_free_lseg(struct pnfs_layout_segment *); +extern enum pnfs_try_status objlayout_read_pagelist( + struct nfs_read_data *); + +extern enum pnfs_try_status objlayout_write_pagelist( + struct nfs_write_data *, + int how); + #endif /* _OBJLAYOUT_H */ -- cgit v1.2.3 From 8a1636c459cb7a4b32ba4024cd1b2ba21fba6aed Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 14 Jul 2010 15:43:57 -0400 Subject: pnfs: layoutret_on_setattr With the objects layout security model, we have object capabilities that are associated with the layout and we anticipate that the server will issue a cb_layoutrecall for any setattr that changes security related attributes (user/group/mode/acl) or truncates the file. Therefore, the layout is returned before issuing the setattr to avoid the anticipated cb_layoutrecall. Signed-off-by: Benny Halevy --- fs/nfs/objlayout/objio_osd.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index cc92d3b3dc3..4e8de3ec9a6 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -964,6 +964,7 @@ objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", + .flags = PNFS_LAYOUTRET_ON_SETATTR, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, -- cgit v1.2.3 From adb58535e604a564495a7d50dfb0afa0ddc21bcb Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 26 May 2011 21:49:46 +0300 Subject: pnfs-obj: report errors and .encode_layoutreturn Implementation. An io_state pre-allocates an error information structure for each possible osd-device that might error during IO. When IO is done if all was well the io_state is freed. (as today). If the I/O has ended with an error, the io_state is queued on a per-layout err_list. When eventually encode_layoutreturn() is called, each error is properly encoded on the XDR buffer and only then the io_state is removed from err_list and de-allocated. It is up to the io_engine to fill in the segment that fault and the type of osd_error that occurred. By calling objlayout_io_set_result() for each failing device. In objio_osd: * Allocate io-error descriptors space as part of io_state * Use generic objlayout error reporting at end of io. Signed-off-by: Boaz Harrosh Signed-off-by: Benny Halevy --- fs/nfs/objlayout/objio_osd.c | 44 +++++++- fs/nfs/objlayout/objlayout.c | 232 ++++++++++++++++++++++++++++++++++++++++++- fs/nfs/objlayout/objlayout.h | 23 +++++ 3 files changed, 297 insertions(+), 2 deletions(-) (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 4e8de3ec9a6..8bca5e13f3e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -396,12 +396,16 @@ int objio_alloc_io_state(struct pnfs_layout_segment *lseg, struct objio_state *ios; const unsigned first_size = sizeof(*ios) + objio_seg->num_comps * sizeof(ios->per_dev[0]); + const unsigned sec_size = objio_seg->num_comps * + sizeof(ios->ol_state.ioerrs[0]); - ios = kzalloc(first_size, gfp_flags); + ios = kzalloc(first_size + sec_size, gfp_flags); if (unlikely(!ios)) return -ENOMEM; ios->layout = objio_seg; + ios->ol_state.ioerrs = ((void *)ios) + first_size; + ios->ol_state.num_comps = objio_seg->num_comps; *outp = &ios->ol_state; return 0; @@ -415,6 +419,36 @@ void objio_free_io_state(struct objlayout_io_state *ol_state) kfree(ios); } +enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) +{ + switch (oep) { + case OSD_ERR_PRI_NO_ERROR: + return (enum pnfs_osd_errno)0; + + case OSD_ERR_PRI_CLEAR_PAGES: + BUG_ON(1); + return 0; + + case OSD_ERR_PRI_RESOURCE: + return PNFS_OSD_ERR_RESOURCE; + case OSD_ERR_PRI_BAD_CRED: + return PNFS_OSD_ERR_BAD_CRED; + case OSD_ERR_PRI_NO_ACCESS: + return PNFS_OSD_ERR_NO_ACCESS; + case OSD_ERR_PRI_UNREACHABLE: + return PNFS_OSD_ERR_UNREACHABLE; + case OSD_ERR_PRI_NOT_FOUND: + return PNFS_OSD_ERR_NOT_FOUND; + case OSD_ERR_PRI_NO_SPACE: + return PNFS_OSD_ERR_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case OSD_ERR_PRI_EIO: + return PNFS_OSD_ERR_EIO; + } +} + static void _clear_bio(struct bio *bio) { struct bio_vec *bv; @@ -461,6 +495,12 @@ static int _io_check(struct objio_state *ios, bool is_write) continue; /* we recovered */ } dev = ios->per_dev[i].dev; + objlayout_io_set_result(&ios->ol_state, dev, + &ios->layout->comps[dev].oc_object_id, + osd_pri_2_pnfs_err(osi.osd_err_pri), + ios->per_dev[i].offset, + ios->per_dev[i].length, + is_write); if (osi.osd_err_pri >= oep) { oep = osi.osd_err_pri; @@ -977,6 +1017,8 @@ static struct pnfs_layoutdriver_type objlayout_type = { .pg_test = objlayout_pg_test, .free_deviceid_node = objio_free_deviceid_node, + + .encode_layoutreturn = objlayout_encode_layoutreturn, }; MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 5157ef6d004..f7caecff6b4 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -50,6 +50,10 @@ objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct objlayout *objlay; objlay = kzalloc(sizeof(struct objlayout), gfp_flags); + if (objlay) { + spin_lock_init(&objlay->lock); + INIT_LIST_HEAD(&objlay->err_list); + } dprintk("%s: Return %p\n", __func__, objlay); return &objlay->pnfs_layout; } @@ -64,6 +68,7 @@ objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) dprintk("%s: objlay %p\n", __func__, objlay); + WARN_ON(!list_empty(&objlay->err_list)); kfree(objlay); } @@ -183,6 +188,7 @@ objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, pgbase &= ~PAGE_MASK; } + INIT_LIST_HEAD(&state->err_list); state->lseg = lseg; state->rpcdata = rpcdata; state->pages = pages; @@ -213,7 +219,52 @@ objlayout_iodone(struct objlayout_io_state *state) { dprintk("%s: state %p status\n", __func__, state); - objlayout_free_io_state(state); + if (likely(state->status >= 0)) { + objlayout_free_io_state(state); + } else { + struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + + spin_lock(&objlay->lock); + list_add(&objlay->err_list, &state->err_list); + spin_unlock(&objlay->lock); + } +} + +/* + * objlayout_io_set_result - Set an osd_error code on a specific osd comp. + * + * The @index component IO failed (error returned from target). Register + * the error for later reporting at layout-return. + */ +void +objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, + struct pnfs_osd_objid *pooid, int osd_error, + u64 offset, u64 length, bool is_write) +{ + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + + BUG_ON(index >= state->num_comps); + if (osd_error) { + ioerr->oer_component = *pooid; + ioerr->oer_comp_offset = offset; + ioerr->oer_comp_length = length; + ioerr->oer_iswrite = is_write; + ioerr->oer_errno = osd_error; + + dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " + "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", + __func__, index, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + } else { + /* User need not call if no error is reported */ + ioerr->oer_errno = 0; + } } /* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). @@ -382,6 +433,185 @@ objlayout_write_pagelist(struct nfs_write_data *wdata, return PNFS_ATTEMPTED; } +static int +err_prio(u32 oer_errno) +{ + switch (oer_errno) { + case 0: + return 0; + + case PNFS_OSD_ERR_RESOURCE: + return OSD_ERR_PRI_RESOURCE; + case PNFS_OSD_ERR_BAD_CRED: + return OSD_ERR_PRI_BAD_CRED; + case PNFS_OSD_ERR_NO_ACCESS: + return OSD_ERR_PRI_NO_ACCESS; + case PNFS_OSD_ERR_UNREACHABLE: + return OSD_ERR_PRI_UNREACHABLE; + case PNFS_OSD_ERR_NOT_FOUND: + return OSD_ERR_PRI_NOT_FOUND; + case PNFS_OSD_ERR_NO_SPACE: + return OSD_ERR_PRI_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case PNFS_OSD_ERR_EIO: + return OSD_ERR_PRI_EIO; + } +} + +static void +merge_ioerr(struct pnfs_osd_ioerr *dest_err, + const struct pnfs_osd_ioerr *src_err) +{ + u64 dest_end, src_end; + + if (!dest_err->oer_errno) { + *dest_err = *src_err; + /* accumulated device must be blank */ + memset(&dest_err->oer_component.oid_device_id, 0, + sizeof(dest_err->oer_component.oid_device_id)); + + return; + } + + if (dest_err->oer_component.oid_partition_id != + src_err->oer_component.oid_partition_id) + dest_err->oer_component.oid_partition_id = 0; + + if (dest_err->oer_component.oid_object_id != + src_err->oer_component.oid_object_id) + dest_err->oer_component.oid_object_id = 0; + + if (dest_err->oer_comp_offset > src_err->oer_comp_offset) + dest_err->oer_comp_offset = src_err->oer_comp_offset; + + dest_end = end_offset(dest_err->oer_comp_offset, + dest_err->oer_comp_length); + src_end = end_offset(src_err->oer_comp_offset, + src_err->oer_comp_length); + if (dest_end < src_end) + dest_end = src_end; + + dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; + + if ((src_err->oer_iswrite == dest_err->oer_iswrite) && + (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { + dest_err->oer_errno = src_err->oer_errno; + } else if (src_err->oer_iswrite) { + dest_err->oer_iswrite = true; + dest_err->oer_errno = src_err->oer_errno; + } +} + +static void +encode_accumulated_error(struct objlayout *objlay, __be32 *p) +{ + struct objlayout_io_state *state, *tmp; + struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; + + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + unsigned i; + + for (i = 0; i < state->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + merge_ioerr(&accumulated_err, ioerr); + } + list_del(&state->err_list); + objlayout_free_io_state(state); + } + + pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); +} + +void +objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct objlayout_io_state *state, *tmp; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + start = xdr_reserve_space(xdr, 4); + BUG_ON(!start); + + spin_lock(&objlay->lock); + + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + __be32 *last_xdr = NULL, *p; + unsigned i; + int res = 0; + + for (i = 0; i < state->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + dprintk("%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + p = pnfs_osd_xdr_ioerr_reserve_space(xdr); + if (unlikely(!p)) { + res = -E2BIG; + break; /* accumulated_error */ + } + + last_xdr = p; + pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); + } + + /* TODO: use xdr_write_pages */ + if (unlikely(res)) { + /* no space for even one error descriptor */ + BUG_ON(!last_xdr); + + /* we've encountered a situation with lots and lots of + * errors and no space to encode them all. Use the last + * available slot to report the union of all the + * remaining errors. + */ + encode_accumulated_error(objlay, last_xdr); + goto loop_done; + } + list_del(&state->err_list); + objlayout_free_io_state(state); + } +loop_done: + spin_unlock(&objlay->lock); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + dprintk("%s: Return\n", __func__); +} + + /* * Get Device Info API for io engines */ diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 9a405e8069f..b0bb975058e 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -50,6 +50,10 @@ */ struct objlayout { struct pnfs_layout_hdr pnfs_layout; + + /* for layout_return */ + spinlock_t lock; + struct list_head err_list; }; static inline struct objlayout * @@ -76,6 +80,16 @@ struct objlayout_io_state { int status; /* res */ int eof; /* res */ int committed; /* res */ + + /* Error reporting (layout_return) */ + struct list_head err_list; + unsigned num_comps; + /* Pointer to array of error descriptors of size num_comps. + * It should contain as many entries as devices in the osd_layout + * that participate in the I/O. It is up to the io_engine to allocate + * needed space and set num_comps. + */ + struct pnfs_osd_ioerr *ioerrs; }; /* @@ -101,6 +115,10 @@ extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, /* * callback API */ +extern void objlayout_io_set_result(struct objlayout_io_state *state, + unsigned index, struct pnfs_osd_objid *pooid, + int osd_error, u64 offset, u64 length, bool is_write); + extern void objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync); extern void objlayout_write_done(struct objlayout_io_state *state, @@ -131,4 +149,9 @@ extern enum pnfs_try_status objlayout_write_pagelist( struct nfs_write_data *, int how); +extern void objlayout_encode_layoutreturn( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutreturn_args *); + #endif /* _OBJLAYOUT_H */ -- cgit v1.2.3 From a0fe8bf427f4987d7b82678292ca03cfd7331467 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 22 May 2011 19:54:13 +0300 Subject: pnfs-obj: objlayout_encode_layoutcommit implementation * Define API for io-engines to report delta_space_used in IOs * Encode the osd-layout specific information of the layoutcommit XDR buffer. Signed-off-by: Boaz Harrosh Signed-off-by: Benny Halevy --- fs/nfs/objlayout/objio_osd.c | 1 + fs/nfs/objlayout/objlayout.c | 30 ++++++++++++++++++++++++++++++ fs/nfs/objlayout/objlayout.h | 30 ++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 8bca5e13f3e..3be124160e9 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -1018,6 +1018,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { .free_deviceid_node = objio_free_deviceid_node, + .encode_layoutcommit = objlayout_encode_layoutcommit, .encode_layoutreturn = objlayout_encode_layoutreturn, }; diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index f7caecff6b4..dc3956c0de8 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -225,6 +225,7 @@ objlayout_iodone(struct objlayout_io_state *state) struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); spin_lock(&objlay->lock); + objlay->delta_space_valid = OBJ_DSU_INVALID; list_add(&objlay->err_list, &state->err_list); spin_unlock(&objlay->lock); } @@ -433,6 +434,35 @@ objlayout_write_pagelist(struct nfs_write_data *wdata, return PNFS_ATTEMPTED; } +void +objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct pnfs_osd_layoutupdate lou; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + + spin_lock(&objlay->lock); + lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); + lou.dsu_delta = objlay->delta_space_used; + objlay->delta_space_used = 0; + objlay->delta_space_valid = OBJ_DSU_INIT; + lou.olu_ioerr_flag = !list_empty(&objlay->err_list); + spin_unlock(&objlay->lock); + + start = xdr_reserve_space(xdr, 4); + + BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + + dprintk("%s: Return delta_space_used %lld err %d\n", __func__, + lou.dsu_delta, lou.olu_ioerr_flag); +} + static int err_prio(u32 oer_errno) { diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index b0bb975058e..a8244c8e042 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -51,6 +51,14 @@ struct objlayout { struct pnfs_layout_hdr pnfs_layout; + /* for layout_commit */ + enum osd_delta_space_valid_enum { + OBJ_DSU_INIT = 0, + OBJ_DSU_VALID, + OBJ_DSU_INVALID, + } delta_space_valid; + s64 delta_space_used; /* consumed by write ops */ + /* for layout_return */ spinlock_t lock; struct list_head err_list; @@ -119,6 +127,23 @@ extern void objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write); +static inline void +objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) +{ + struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + + /* If one of the I/Os errored out and the delta_space_used was + * invalid we render the complete report as invalid. Protocol mandate + * the DSU be accurate or not reported. + */ + spin_lock(&objlay->lock); + if (objlay->delta_space_valid != OBJ_DSU_INVALID) { + objlay->delta_space_valid = OBJ_DSU_VALID; + objlay->delta_space_used += space_used; + } + spin_unlock(&objlay->lock); +} + extern void objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync); extern void objlayout_write_done(struct objlayout_io_state *state, @@ -149,6 +174,11 @@ extern enum pnfs_try_status objlayout_write_pagelist( struct nfs_write_data *, int how); +extern void objlayout_encode_layoutcommit( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutcommit_args *); + extern void objlayout_encode_layoutreturn( struct pnfs_layout_hdr *, struct xdr_stream *, -- cgit v1.2.3 From 18ad0a9f2ccd260d37dd6bc5fa04c7819def4c84 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 25 May 2011 21:03:56 +0300 Subject: NFSv4.1: change pg_test return type to bool Signed-off-by: Benny Halevy --- fs/nfs/objlayout/objio_osd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 3be124160e9..8c2bd3eb8e8 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -991,14 +991,13 @@ ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) /* * objlayout_pg_test(). Called by nfs_can_coalesce_requests() * - * return 1 : coalesce page - * return 0 : don't coalesce page + * return true iff coalesce page */ -int +bool objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - return 1; + return true; } static struct pnfs_layoutdriver_type objlayout_type = { -- cgit v1.2.3 From 89a58e32d9105c01022a757fb32ddc3b51bf0025 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 25 May 2011 20:54:40 +0300 Subject: NFSv4.1: use pnfs_generic_pg_test directly by layout driver Signed-off-by: Benny Halevy --- fs/nfs/objlayout/objio_osd.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 8c2bd3eb8e8..3b10d09e521 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -988,18 +988,6 @@ ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) return _write_exec(ios); } -/* - * objlayout_pg_test(). Called by nfs_can_coalesce_requests() - * - * return true iff coalesce page - */ -bool -objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, - struct nfs_page *req) -{ - return true; -} - static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", @@ -1013,7 +1001,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { .read_pagelist = objlayout_read_pagelist, .write_pagelist = objlayout_write_pagelist, - .pg_test = objlayout_pg_test, + .pg_test = pnfs_generic_pg_test, .free_deviceid_node = objio_free_deviceid_node, -- cgit v1.2.3 From 9342077011d54f42fa1b88b7bc1f7008dcf5fff9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 25 May 2011 21:25:29 +0300 Subject: pnfs-obj: pg_test check for max_io_size Implement pg_test vector to test for max IO sizes. We calculate a max_io_size member only once, and cache it in lseg so to not do so on every page insert. Signed-off-by: Boaz Harrosh [simplify logic] Signed-off-by: Benny Halevy --- fs/nfs/objlayout/objio_osd.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'fs/nfs/objlayout') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 3b10d09e521..9cf208df1f2 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -128,6 +128,8 @@ struct objio_segment { u64 group_depth; unsigned group_count; + unsigned max_io_size; + unsigned comps_index; unsigned num_comps; /* variable length */ @@ -365,6 +367,11 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, objio_seg->group_count = 1; } + /* Cache this calculation it will hit for every page */ + objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - + objio_seg->stripe_unit) * + objio_seg->group_width; + *outp = &objio_seg->lseg; return 0; @@ -988,6 +995,16 @@ ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) return _write_exec(ios); } +static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req) +{ + if (!pnfs_generic_pg_test(pgio, prev, req)) + return false; + + return pgio->pg_count + req->wb_bytes <= + OBJIO_LSEG(pgio->pg_lseg)->max_io_size; +} + static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", @@ -1001,7 +1018,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { .read_pagelist = objlayout_read_pagelist, .write_pagelist = objlayout_write_pagelist, - .pg_test = pnfs_generic_pg_test, + .pg_test = objio_pg_test, .free_deviceid_node = objio_free_deviceid_node, -- cgit v1.2.3