diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c index 898edd9edbd7..26472e59f615 100644 --- a/cmd/zinject/translate.c +++ b/cmd/zinject/translate.c @@ -75,11 +75,24 @@ compress_slashes(const char *src, char *dest) *dest = '\0'; } +static boolean_t +path_is_zvol(const char *inpath) +{ + char buf[MAXPATHLEN]; + + /* Resolve symlinks to /dev/zd* device */ + if (realpath(inpath, buf) != NULL) + if (strncmp(buf, "/dev/zd", 7) == 0) + return (B_TRUE); + + return (B_FALSE); +} + /* - * Given a full path to a file, translate into a dataset name and a relative - * path within the dataset. 'dataset' must be at least MAXNAMELEN characters, - * and 'relpath' must be at least MAXPATHLEN characters. We also pass a stat64 - * buffer, which we need later to get the object ID. + * Given a full path to a file or zvol device, translate into a dataset name and + * a relative path within the dataset. 'dataset' must be at least MAXNAMELEN + * characters, and 'relpath' must be at least MAXPATHLEN characters. We also + * pass a stat64 buffer, which we need later to get the object ID. */ static int parse_pathname(const char *inpath, char *dataset, char *relpath, @@ -98,6 +111,47 @@ parse_pathname(const char *inpath, char *dataset, char *relpath, return (-1); } + /* special case: inject errors into zvol */ + if (path_is_zvol(inpath)) { + int fd; + char *slash; + int rc; + if ((fd = open(inpath, O_RDONLY|O_CLOEXEC)) == -1 || + fstat64(fd, statbuf) != 0) { + return (-1); + } + + /* + * HACK: the zvol's inode will not contain its object number. + * However, it has long been the case that the zvol data is + * object number 1: + * + * Object lvl iblk dblk dsize lsize %full type + * 0 6 128K 16K 11K 16K 6.25 DMU dnode + * 1 2 128K 16K 20.1M 20M 100.00 zvol object + * 2 1 128K 512 0 512 100.00 zvol prop + * + * So we hardcode that in the statbuf inode field as workaround. 
+ */ + statbuf->st_ino = 1; + + rc = ioctl(fd, BLKZNAME, fullpath); + close(fd); + if (rc != 0) + return (-1); + + (void) strcpy(dataset, fullpath); + + /* + * fullpath contains string like 'tank/zvol'. Strip off the + * 'tank' and 'zvol' parts. + */ + slash = strchr(fullpath, '/'); + *slash = '\0'; + (void) strcpy(relpath, slash + 1); + return (0); + } + if (getextmntent(fullpath, &mp, statbuf) != 0) { (void) fprintf(stderr, "cannot find mountpoint for '%s'\n", fullpath); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index a6658a9c2800..67abbfa4df40 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -1050,6 +1050,9 @@ nice_num_str_nvlist(nvlist_t *item, const char *key, uint64_t value, case ZFS_NICENUM_BYTES: zfs_nicenum_format(value, buf, 256, ZFS_NICENUM_BYTES); break; + case ZFS_NICENUM_RAW: + zfs_nicenum_format(value, buf, 256, ZFS_NICENUM_RAW); + break; case ZFS_NICENUM_TIME: zfs_nicenum_format(value, buf, 256, ZFS_NICENUM_TIME); break; @@ -2590,7 +2593,7 @@ typedef struct status_cbdata { int cb_name_flags; int cb_namewidth; boolean_t cb_allpools; - boolean_t cb_verbose; + int cb_verbosity; boolean_t cb_literal; boolean_t cb_explain; boolean_t cb_first; @@ -3322,7 +3325,7 @@ print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, nvlist_t **child; boolean_t printed = B_FALSE; - assert(zhp != NULL || !cb->cb_verbose); + assert(zhp != NULL || cb->cb_verbosity == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) @@ -9516,7 +9519,7 @@ class_vdevs_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, if (!cb->cb_flat_vdevs) class_obj = fnvlist_alloc(); - assert(zhp != NULL || !cb->cb_verbose); + assert(zhp != NULL || cb->cb_verbosity == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) @@ -9620,57 +9623,96 @@ spares_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, } } +/* + * Take a uint64 nvpair named 'name' from 
nverrlist, nicenum-ify it, and + * put it back in 'nverrlist', possibly as a string, with the same 'name'. + */ +static void +convert_nvlist_uint64_to_nicenum(status_cbdata_t *cb, nvlist_t *parent, + const char *name, enum zfs_nicenum_format format) +{ + uint64_t val; + nvpair_t *nvp; + + if (nvlist_lookup_nvpair(parent, name, &nvp) != 0) + return; /* nothing by that name, ignore */ + + val = fnvpair_value_uint64(nvp); + nvlist_remove_nvpair(parent, nvp); + nice_num_str_nvlist(parent, name, val, + cb->cb_literal, cb->cb_json_as_int, format); +} + static void errors_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *item) { - uint64_t nerr; - nvlist_t *config = zpool_get_config(zhp, NULL); - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, - &nerr) == 0) { - nice_num_str_nvlist(item, ZPOOL_CONFIG_ERRCOUNT, nerr, - cb->cb_literal, cb->cb_json_as_int, ZFS_NICENUM_1024); - if (nerr != 0 && cb->cb_verbose) { - nvlist_t *nverrlist = NULL; - if (zpool_get_errlog(zhp, &nverrlist) == 0) { - int i = 0; - int count = 0; - size_t len = MAXPATHLEN * 2; - nvpair_t *elem = NULL; - - for (nvpair_t *pair = - nvlist_next_nvpair(nverrlist, NULL); - pair != NULL; - pair = nvlist_next_nvpair(nverrlist, pair)) - count++; - char **errl = (char **)malloc( - count * sizeof (char *)); - - while ((elem = nvlist_next_nvpair(nverrlist, - elem)) != NULL) { - nvlist_t *nv; - uint64_t dsobj, obj; - - verify(nvpair_value_nvlist(elem, - &nv) == 0); - verify(nvlist_lookup_uint64(nv, - ZPOOL_ERR_DATASET, &dsobj) == 0); - verify(nvlist_lookup_uint64(nv, - ZPOOL_ERR_OBJECT, &obj) == 0); - errl[i] = safe_malloc(len); - zpool_obj_to_path(zhp, dsobj, obj, - errl[i++], len); - } - nvlist_free(nverrlist); - fnvlist_add_string_array(item, "errlist", - (const char **)errl, count); - for (int i = 0; i < count; ++i) - free(errl[i]); - free(errl); - } else - fnvlist_add_string(item, "errlist", - strerror(errno)); + int verbosity = cb->cb_verbosity; + nvlist_t *nverrlist = NULL, *json; + nvpair_t 
*elem; + char *pathname; + size_t len = MAXPATHLEN * 2; + nvlist_t **ranges; + uint_t count; + + if (zpool_get_errlog(zhp, &nverrlist) != 0) + return; + + pathname = safe_malloc(len); + json = fnvlist_alloc(); + + elem = NULL; + while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { + nvlist_t *nv; + uint64_t dsobj, obj; + + verify(nvpair_value_nvlist(elem, &nv) == 0); + + dsobj = fnvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET); + obj = fnvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT); + + zpool_obj_to_path(zhp, dsobj, obj, pathname, len); + + /* + * Each JSON entry is a different file/zvol. If user has + * verbosity = 1, then just make a simple object containing + * the name. + */ + if (verbosity <= 1) { + nvlist_t *nameonly; + nameonly = fnvlist_alloc(); + fnvlist_add_string(nameonly, ZPOOL_ERR_NAME, pathname); + fnvlist_add_nvlist(json, pathname, nameonly); + nvlist_free(nameonly); + continue; } + + fnvlist_add_string(nv, ZPOOL_ERR_NAME, pathname); + + /* nicenum-ify our nvlist */ + convert_nvlist_uint64_to_nicenum(cb, nv, ZPOOL_ERR_OBJECT, + ZFS_NICENUM_RAW); + convert_nvlist_uint64_to_nicenum(cb, nv, ZPOOL_ERR_DATASET, + ZFS_NICENUM_RAW); + convert_nvlist_uint64_to_nicenum(cb, nv, ZPOOL_ERR_BLOCK_SIZE, + ZFS_NICENUM_1024); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_ERR_RANGES, &ranges, + &count) == 0) { + for (uint_t i = 0; i < count; i++) { + convert_nvlist_uint64_to_nicenum(cb, ranges[i], + ZPOOL_ERR_START_BYTE, ZFS_NICENUM_1024); + convert_nvlist_uint64_to_nicenum(cb, ranges[i], + ZPOOL_ERR_END_BYTE, ZFS_NICENUM_1024); + } + } + + fnvlist_add_nvlist(json, pathname, nv); } + + /* Place our error list in a top level "errors" JSON object. 
*/ + fnvlist_add_nvlist(item, ZPOOL_ERR_JSON, json); + free(pathname); + nvlist_free(nverrlist); } static void @@ -10341,13 +10383,15 @@ print_checkpoint_status(pool_checkpoint_stat_t *pcs) space_buf); } + static void -print_error_log(zpool_handle_t *zhp) +print_error_log(zpool_handle_t *zhp, int verbosity, boolean_t literal) { nvlist_t *nverrlist = NULL; nvpair_t *elem; - char *pathname; + char *pathname, *last_pathname = NULL; size_t len = MAXPATHLEN * 2; + boolean_t started = B_FALSE; if (zpool_get_errlog(zhp, &nverrlist) != 0) return; @@ -10367,9 +10411,49 @@ print_error_log(zpool_handle_t *zhp) verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); - (void) printf("%7s %s\n", "", pathname); + if (last_pathname == NULL || + 0 != strncmp(pathname, last_pathname, len)) { + last_pathname = strdup(pathname); + if (started) + (void) printf("\n"); + else + started = B_TRUE; + (void) printf("%7s %s ", "", pathname); + } else if (verbosity > 1) { + (void) printf(","); + } + if (verbosity > 1) { + nvlist_t **arr; + uint_t count; + if (nvlist_lookup_nvlist_array(nv, ZPOOL_ERR_RANGES, + &arr, &count) != 0) { + printf("(no ranges)"); + continue; + } + + for (uint_t i = 0; i < count; i++) { + uint64_t start; + uint64_t end; + start = fnvlist_lookup_uint64(arr[i], + ZPOOL_ERR_START_BYTE); + end = fnvlist_lookup_uint64(arr[i], + ZPOOL_ERR_END_BYTE); + if (literal) { + (void) printf("%lu-%lu", start, end); + } else { + char s1[32], s2[32]; + zfs_nicenum(start, s1, sizeof (s1)); + zfs_nicenum(end, s2, sizeof (s2)); + (void) printf("%s-%s", s1, s2); + } + if (i != count - 1) + printf(","); + } + } } + (void) printf("\n"); free(pathname); + free(last_pathname); nvlist_free(nverrlist); } @@ -11065,14 +11149,15 @@ status_callback(zpool_handle_t *zhp, void *data) if (nerr == 0) { (void) printf(gettext( "errors: No known data errors\n")); - } else if (!cbp->cb_verbose) { + } else if (0 == cbp->cb_verbosity) { 
color_start(ANSI_RED); (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); color_end(); } else { - print_error_log(zhp); + print_error_log(zhp, cbp->cb_verbosity, + cbp->cb_literal); } } @@ -11199,7 +11284,7 @@ zpool_do_status(int argc, char **argv) get_timestamp_arg(*optarg); break; case 'v': - cb.cb_verbose = B_TRUE; + cb.cb_verbosity++; break; case 'j': cb.cb_json = B_TRUE; diff --git a/cmd/zpool/zpool_util.c b/cmd/zpool/zpool_util.c index ff2597ef65ef..ff6fa3ac856e 100644 --- a/cmd/zpool/zpool_util.c +++ b/cmd/zpool/zpool_util.c @@ -115,19 +115,6 @@ array64_max(uint64_t array[], unsigned int len) return (max); } -/* - * Find highest one bit set. - * Returns bit number + 1 of highest bit that is set, otherwise returns 0. - */ -int -highbit64(uint64_t i) -{ - if (i == 0) - return (0); - - return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); -} - /* * Find lowest one bit set. * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. 
diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h index 3af23c52bd45..98105995eaa5 100644 --- a/cmd/zpool/zpool_util.h +++ b/cmd/zpool/zpool_util.h @@ -45,7 +45,6 @@ void *safe_realloc(void *, size_t); void zpool_no_memory(void); uint_t num_logs(nvlist_t *nv); uint64_t array64_max(uint64_t array[], unsigned int len); -int highbit64(uint64_t i); int lowbit64(uint64_t i); /* diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 662fd81c5ee1..238fcffcbf86 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1728,6 +1728,24 @@ typedef enum { #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" +#define ZPOOL_ERR_LEVEL "level" +#define ZPOOL_ERR_BLKID "blkid" /* NOT NEEDED */ + +/* Additional nvpairs from zpool_get_errlog() nvlist */ +#define ZPOOL_ERR_BLOCK_SIZE "block_size" +#define ZPOOL_ERR_OBJECT_TYPE "object_type" +#define ZPOOL_ERR_RANGES "ranges" +#define ZPOOL_ERR_START_BYTE "start_byte" +#define ZPOOL_ERR_END_BYTE "end_byte" +#define ZPOOL_ERR_NAME "name" + +/* + * For the zpool status JSON output, we collect all the error lists and put + * them in a seperate top level element so they're easier to iterate over. + * That way the error lists don't get interspersed with the zpool status + * objects. + */ +#define ZPOOL_ERR_JSON "errors" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) diff --git a/include/sys/spa.h b/include/sys/spa.h index f172f2af6f07..2f073670cc4f 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -362,6 +362,29 @@ typedef enum bp_embedded_type { #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ #define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ +/* + * Get number of data block pointers an indirect block could point to (given + * the block level and block size shift). 
+ * + * For example, an L1 block with a blocksize of 128kb could point to: + * + * BP_SPANB(17, 1) = 1024 L0 block pointers + */ +#define BP_SPANB(indblkshift, level) \ + (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) + +/* + * Helper function to lookup the byte range coverered by a block pointer of any + * level (L0, L1, L2 ... etc). + * + * For example if you have an L1 block, and your blocksize is 128kb (shift 17), + * then your block can cover this many bytes: + * + * BP_BYTE_RANGE(17, 1) = 134217728 bytes + */ +#define BP_BYTE_RANGE(indblkshift, level) \ + (BP_SPANB(indblkshift, level) * ((uint64_t)1 << indblkshift)) + /* * A block is a hole when it has either 1) never been written to, or * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads diff --git a/include/sys/zfs_stat.h b/include/sys/zfs_stat.h index 7079adaa2fa3..2bd391ba9ce6 100644 --- a/include/sys/zfs_stat.h +++ b/include/sys/zfs_stat.h @@ -48,7 +48,26 @@ typedef struct zfs_stat { } zfs_stat_t; extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, - char *buf, int len); + char *buf, int len, nvlist_t *nv); + +/* + * The legacy behavior of ZFS_IOC_OBJ_TO_STATS is return a zfs_stat_t stuct. + * However, if the user passes in a nvlist dst buffer, we also return + * "extended" object stats. Currently, these extended stats are handpicked + * fields from dmu_object_info_t, but they could be expanded to include + * anything. 
+ */ +#define ZFS_OBJ_STAT_DATA_BLOCK_SIZE "data_block_size" +#define ZFS_OBJ_STAT_METADATA_BLOCK_SIZE "metadata_block_size" +#define ZFS_OBJ_STAT_DNODE_SIZE "dnode_size" +#define ZFS_OBJ_STAT_TYPE "type" +#define ZFS_OBJ_STAT_BONUS_TYPE "bonus_type" +#define ZFS_OBJ_STAT_BONUS_SIZE "bonus_size" +#define ZFS_OBJ_STAT_CHECKSUM "checksum" +#define ZFS_OBJ_STAT_COMPRESS "compress" +#define ZFS_OBJ_STAT_PHYSICAL_BLOCKS_512 "physical_blocks_512" +#define ZFS_OBJ_STAT_MAX_OFFSET "max_offset" +#define ZFS_OBJ_STAT_FILL_COUNT "fill_count" #ifdef __cplusplus } diff --git a/include/zfs_comutil.h b/include/zfs_comutil.h index 90be86c9556b..6690314006c7 100644 --- a/include/zfs_comutil.h +++ b/include/zfs_comutil.h @@ -43,6 +43,8 @@ _ZFS_COMUTIL_H int zfs_spa_version_map(int zpl_version); _ZFS_COMUTIL_H boolean_t zfs_dataset_name_hidden(const char *); +_ZFS_COMUTIL_H int highbit64(uint64_t i); + #define ZFS_NUM_LEGACY_HISTORY_EVENTS 41 _ZFS_COMUTIL_H const char *const zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS]; diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index 5f8963dccd1a..cb25a5920a15 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -34,7 +34,9 @@ dist_libzfs_la_SOURCES += \ endif nodist_libzfs_la_SOURCES = \ + module/zcommon/btree.c \ module/zcommon/cityhash.c \ + module/zcommon/range_tree.c \ module/zcommon/zfeature_common.c \ module/zcommon/zfs_comutil.c \ module/zcommon/zfs_deleg.c \ diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index f988d27a286a..c6a0740a442d 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -7012,6 +7012,7 @@ + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index ce154ae1a4cd..96725a5642f5 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -44,11 +44,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #include #include #include @@ -60,6 +63,11 @@ #include "zfeature_common.h" static 
boolean_t zpool_vdev_is_interior(const char *name); +static nvlist_t *zpool_get_extended_objset_stat(zpool_handle_t *zhp, + uint64_t dsobj, uint64_t obj); + +static nvlist_t * +zpool_get_extended_obj_stat(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj); typedef struct prop_flags { unsigned int create:1; /* Validate property on creation */ @@ -4613,6 +4621,182 @@ zpool_add_propname(zpool_handle_t *zhp, const char *propname) zhp->zpool_n_propnames++; } +/* + * Given an properties nvlist like: + * + * refreservation: + * source: 'tank/vol' + * value: 1092616192 + * recordsize: + * value: 4096 + * source: 'tank' + * refcompressratio: + * value: 100 + * logicalreferenced: + * value: 1076408320 + * compressratio: + * value: 100 + * ... + * + * Lookup the 'value' field for a uint64_t and return it into *val. For + * example, if you pass "recordsize" for the name, it will store 4096 into *val. + */ +static int +zpool_get_from_prop_nvlist_uint64(nvlist_t *nv, const char *name, uint64_t *val) +{ + nvlist_t *tmp; + int rc; + + rc = nvlist_lookup_nvlist(nv, name, &tmp); + if (rc != 0) + return (rc); + + return (nvlist_lookup_uint64(tmp, "value", val)); +} + +static int +zpool_get_extended_obj_stat_helper(zpool_handle_t *zhp, uint64_t dsobj, + uint64_t obj, uint64_t *data_block_size, const char **type_str) +{ + nvlist_t *nv; + boolean_t is_zvol = B_FALSE; + uint64_t val; + int rc; + nv = zpool_get_extended_obj_stat(zhp, dsobj, obj); + if (nv == NULL) { + nv = zpool_get_extended_objset_stat(zhp, dsobj, obj); + if (nv == NULL) { + return (-1); + } + is_zvol = B_TRUE; + } + if (is_zvol) { + rc = zpool_get_from_prop_nvlist_uint64(nv, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &val); + } else { + uint32_t val32 = 0; + rc = nvlist_lookup_uint32(nv, ZFS_OBJ_STAT_DATA_BLOCK_SIZE, + &val32); + val = val32; + } + + if (rc != 0) { + dump_nvlist(nv, 4); + + nvlist_free(nv); + return (rc); + } + *data_block_size = val; + + if (is_zvol) { + *type_str = "zvol"; + } else { + uint64_t type; 
+ type = fnvlist_lookup_uint64(nv, ZFS_OBJ_STAT_TYPE); + if (type >= ARRAY_SIZE(dmu_ot)) + *type_str = "(unknown)"; + else + *type_str = dmu_ot[type].ot_name; + } + return (0); +} + +static void zpool_get_errlog_process_file_cb(void *arg, uint64_t start, + uint64_t size) { + nvlist_t ***tmp = arg; + nvlist_t **nva_next = *tmp; + nvlist_t *nv; + + nv = fnvlist_alloc(); + fnvlist_add_uint64(nv, ZPOOL_ERR_START_BYTE, start); + fnvlist_add_uint64(nv, ZPOOL_ERR_END_BYTE, start + size - 1); + *nva_next = nv; + + /* Advance to next array entry */ + *tmp = (void *) nva_next + sizeof (nvlist_t *); +} + +static void +zpool_get_errlog_process_file(zpool_handle_t *zhp, + nvlist_t **nverrlistp, zbookmark_phys_t *zb_start, zbookmark_phys_t *zb_end) +{ + uint64_t data_block_size = 0; + const char *type_str = NULL; + nvlist_t *nv, **nva, **nva_next; + uint64_t count = 0; + + nv = fnvlist_alloc(); + fnvlist_add_uint64(nv, ZPOOL_ERR_DATASET, zb_start->zb_objset); + fnvlist_add_uint64(nv, ZPOOL_ERR_OBJECT, zb_start->zb_object); + + if (zpool_get_extended_obj_stat_helper(zhp, zb_start->zb_objset, + zb_start->zb_object, &data_block_size, &type_str) != 0) { + /* + * If the kernel supports extended stats, then include them. + * If not, it's still OK. + */ + goto end; + } + + fnvlist_add_string(nv, ZPOOL_ERR_OBJECT_TYPE, type_str); + fnvlist_add_uint64(nv, ZPOOL_ERR_BLOCK_SIZE, data_block_size); + + zfs_range_tree_t *range_tree; + zfs_btree_init(); + range_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + if (!range_tree) { + zfs_btree_fini(); + goto end; + } + + do { + int data_block_size_shift; + uint64_t data_block_range; + + /* + * If an L1 (or higher block) is damaged, it will affect a much + * larger byte range than a simple L0 block. Calculate these + * ranges correctly. 
+ */ + data_block_size_shift = highbit64(data_block_size) - 1; + data_block_range = BP_BYTE_RANGE(data_block_size_shift, + zb_start->zb_level); + + zfs_range_tree_add(range_tree, + zb_start->zb_blkid * data_block_range, data_block_range); + + if (zb_start == zb_end) + break; + } while (zb_start++); + + /* + * Our range tree has all our ranges. Construct an array of start/end + * entries. + */ + count = zfs_range_tree_numsegs(range_tree); + + nva = zfs_alloc(zhp->zpool_hdl, sizeof (nvlist_t *) * count); + nva_next = &nva[0]; + + zfs_range_tree_walk(range_tree, zpool_get_errlog_process_file_cb, + &nva_next); + + zfs_range_tree_vacate(range_tree, NULL, NULL); + zfs_range_tree_destroy(range_tree); + zfs_btree_fini(); + + fnvlist_add_nvlist_array(nv, ZPOOL_ERR_RANGES, (const nvlist_t **) nva, + count); + + for (uint64_t i = 0; i < count; i++) + nvlist_free(nva[i]); + free(nva); + +end: + fnvlist_add_nvlist(*nverrlistp, "ejk", nv); + nvlist_free(nv); +} + /* * Retrieve the persistent error log, uniquify the members, and return to the * caller. @@ -4624,6 +4808,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) libzfs_handle_t *hdl = zhp->zpool_hdl; zbookmark_phys_t *buf; uint64_t buflen = 10000; /* approx. 1MB of RAM */ + uint64_t i; if (fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT) == 0) @@ -4671,39 +4856,35 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) /* * Fill in the nverrlistp with nvlist's of dataset and object numbers. 
*/ - for (uint64_t i = 0; i < zblen; i++) { - nvlist_t *nv; + zbookmark_phys_t *start = NULL; - /* ignoring zb_blkid and zb_level for now */ - if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset && - zb[i-1].zb_object == zb[i].zb_object) + for (i = 0; i < zblen; i++) { + if (start == NULL) { + start = &zb[i]; continue; - - if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0) - goto nomem; - if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET, - zb[i].zb_objset) != 0) { - nvlist_free(nv); - goto nomem; } - if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT, - zb[i].zb_object) != 0) { - nvlist_free(nv); - goto nomem; - } - if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) { - nvlist_free(nv); - goto nomem; + + /* filter out duplicate files and levels */ + if (zb[i-1].zb_objset == zb[i].zb_objset && + zb[i-1].zb_object == zb[i].zb_object) { + /* same file, new error block */ + continue; + } else { + /* + * Every time we see a new object, process the + * previous one. + */ + zpool_get_errlog_process_file(zhp, nverrlistp, + start, &zb[i-1]); + start = &zb[i]; } - nvlist_free(nv); } + /* Process the last entry */ + zpool_get_errlog_process_file(zhp, nverrlistp, start, &zb[i-1]); free(buf); - return (0); -nomem: - free(buf); - return (no_memory(zhp->zpool_hdl)); + return (0); } /* @@ -4981,6 +5162,69 @@ zpool_events_seek(libzfs_handle_t *hdl, uint64_t eid, int zevent_fd) return (error); } +/* + * Return extended information about an object. This calls the "extended" + * variant of ZFS_IOC_OBJ_TO_STATS to return things things like block size, + * dmu type, dnone size, etc (see dmu_object_info_t). + * + * Returned nvlist must be freed by the user when they are done with it. 
+ */ +static nvlist_t * +zpool_get_extended_obj_stat_impl(zpool_handle_t *zhp, uint64_t dsobj, + uint64_t obj, enum zfs_ioc stats_ioctl) +{ + zfs_cmd_t zc = {"\0"}; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *nv = NULL; + int error; + + /* get the dataset's name */ + zc.zc_obj = dsobj; + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + error = ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_DSOBJ_TO_DSNAME, &zc); + if (error) + return (NULL); + + (void) strlcpy(dsname, zc.zc_value, sizeof (dsname)); + (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); + + zcmd_alloc_dst_nvlist(zhp->zpool_hdl, &zc, 1024); + zc.zc_obj = obj; + while (ioctl(zhp->zpool_hdl->libzfs_fd, stats_ioctl, &zc) != 0) { + if (errno == ENOMEM) { + zcmd_expand_dst_nvlist(zhp->zpool_hdl, &zc); + } else { + return (NULL); + } + } + + zcmd_read_dst_nvlist(zhp->zpool_hdl, &zc, &nv); + + return (nv); +} + +/* + * Return extended information about an object. This calls the "extended" + * variant of ZFS_IOC_OBJ_TO_STATS to return things things like block size, + * dmu type, dnone size, etc (see dmu_object_info_t). + * + * Returned nvlist must be freed by the user when they are done with it. 
+ */ +static nvlist_t * +zpool_get_extended_obj_stat(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj) +{ + return (zpool_get_extended_obj_stat_impl(zhp, dsobj, obj, + ZFS_IOC_OBJ_TO_STATS)); +} + +static nvlist_t * +zpool_get_extended_objset_stat(zpool_handle_t *zhp, uint64_t dsobj, + uint64_t obj) +{ + return (zpool_get_extended_obj_stat_impl(zhp, dsobj, obj, + ZFS_IOC_OBJSET_STATS)); +} + static void zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len, boolean_t always_unmounted) @@ -5029,6 +5273,7 @@ zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, (longlong_t)obj); } + free(mntpnt); } diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index aeacc595b363..8fbd5442069a 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -46,7 +46,9 @@ nodist_libzpool_la_SOURCES = \ \ module/os/linux/zfs/zio_crypt.c \ \ + module/zcommon/btree.c \ module/zcommon/cityhash.c \ + module/zcommon/range_tree.c \ module/zcommon/simd_stat.c \ module/zcommon/zfeature_common.c \ module/zcommon/zfs_comutil.c \ @@ -73,7 +75,6 @@ nodist_libzpool_la_SOURCES = \ module/zfs/bpobj.c \ module/zfs/bptree.c \ module/zfs/bqueue.c \ - module/zfs/btree.c \ module/zfs/brt.c \ module/zfs/dbuf.c \ module/zfs/dbuf_stats.c \ @@ -118,7 +119,6 @@ nodist_libzpool_la_SOURCES = \ module/zfs/multilist.c \ module/zfs/objlist.c \ module/zfs/pathname.c \ - module/zfs/range_tree.c \ module/zfs/refcount.c \ module/zfs/rrwlock.c \ module/zfs/sa.c \ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 70eba5099119..210c6c17e729 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -743,20 +743,6 @@ delay(clock_t ticks) (void) poll(0, 0, ticks * (1000 / hz)); } -/* - * Find highest one bit set. - * Returns bit number + 1 of highest bit that is set, otherwise returns 0. - * The __builtin_clzll() function is supported by both GCC and Clang. 
- */ -int -highbit64(uint64_t i) -{ - if (i == 0) - return (0); - - return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); -} - /* * Find lowest one bit set. * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index 704f6a7accd8..af80bdff577e 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -174,9 +174,10 @@ Panic inside the specified function. .Op Fl l Ar level .Op Fl r Ar range .Op Fl amq -.Ar path +.Ar path|zvol .Xc -Force an error into the contents of a file. +Force an error into the contents of a file or zvol. +For zvols, pass the path to the zvol block device. . .It Xo .Nm zinject diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 108a1067b384..410604a662f1 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 20, 2025 +.Dd July 1, 2025 .Dt ZPOOL-STATUS 8 .Os . @@ -156,7 +156,9 @@ See Display vdev TRIM status. .It Fl v Displays verbose data error information, printing out a complete list of all -data errors since the last complete pool scrub. +files containing data errors since the last complete pool scrub. +Specified twice, prints out the complete list of all corrupt records within +each corrupt file. If the head_errlog feature is enabled and files containing errors have been removed then the respective filenames will not be reported in subsequent runs of this command. 
diff --git a/module/Kbuild.in b/module/Kbuild.in index 58a80dc4402c..c13e9706cad9 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -43,7 +43,7 @@ ifeq ($(CONFIG_X86),y) ifeq ($(CONFIG_CC_IS_CLANG),y) CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false -CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false +CFLAGS_zcommon/range_tree.o += -mllvm -x86-cmov-converter=false CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false endif endif @@ -223,6 +223,7 @@ zfs-objs += $(addprefix unicode/,$(UNICODE_OBJS)) ZCOMMON_OBJS := \ + btree.o \ cityhash.o \ simd_stat.o \ zfeature_common.o \ @@ -233,6 +234,7 @@ ZCOMMON_OBJS := \ zfs_fletcher_superscalar4.o \ zfs_namecheck.o \ zfs_prop.o \ + range_tree.o \ zfs_valstr.o \ zpool_prop.o \ zprop_common.o @@ -311,7 +313,6 @@ ZFS_OBJS := \ bptree.o \ bqueue.o \ brt.o \ - btree.o \ dataset_kstats.o \ dbuf.o \ dbuf_stats.o \ @@ -356,7 +357,6 @@ ZFS_OBJS := \ multilist.o \ objlist.o \ pathname.o \ - range_tree.o \ refcount.o \ rrwlock.o \ sa.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 3ba38c43f25b..586c83aefcf4 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -246,6 +246,7 @@ SRCS+= cityhash.c \ zfs_fletcher_superscalar.c \ zfs_namecheck.c \ zfs_prop.c \ + range_tree.c \ zfs_valstr.c \ zpool_prop.c \ zprop_common.c @@ -305,7 +306,6 @@ SRCS+= abd.c \ multilist.c \ objlist.c \ pathname.c \ - range_tree.c \ refcount.c \ rrwlock.c \ sa.c \ diff --git a/module/zfs/btree.c b/module/zcommon/btree.c similarity index 99% rename from module/zfs/btree.c rename to module/zcommon/btree.c index 725b96a3b2c7..59b29585a9e8 100644 --- a/module/zfs/btree.c +++ b/module/zcommon/btree.c @@ -21,6 +21,12 @@ #include #include +#ifndef _KERNEL +#include +#include +#define panic(...) 
do {printf(__VA_ARGS__); exit(EXIT_FAILURE); } while (0) +#endif + kmem_cache_t *zfs_btree_leaf_cache; /* diff --git a/module/zfs/range_tree.c b/module/zcommon/range_tree.c similarity index 99% rename from module/zfs/range_tree.c rename to module/zcommon/range_tree.c index d73195f1a21f..62e2410afb85 100644 --- a/module/zfs/range_tree.c +++ b/module/zcommon/range_tree.c @@ -35,6 +35,16 @@ #include #include +#ifndef _KERNEL +#include +#include +#define panic(...) do {printf(__VA_ARGS__); exit(EXIT_FAILURE); } while (0) +#define zfs_panic_recover(...) do {printf(__VA_ARGS__); } while (0) + +#undef zfs_dbgmsg +#define zfs_dbgmsg(...) do {} while (0) +#endif + /* * Range trees are tree-based data structures that can be used to * track free space or generally any space allocation information. diff --git a/module/zcommon/zfs_comutil.c b/module/zcommon/zfs_comutil.c index 85eee4f21c3e..3c5d320106b2 100644 --- a/module/zcommon/zfs_comutil.c +++ b/module/zcommon/zfs_comutil.c @@ -39,6 +39,7 @@ #include #include "zfs_comutil.h" #include +#include /* * Are there allocatable vdevs? @@ -243,6 +244,83 @@ zfs_dataset_name_hidden(const char *name) return (B_FALSE); } +/* + * highbit64 is defined in sysmacros.h for the kernel side. However, we need + * it on the libzfs side and zpool_main.c side, and there's no good place to + * put it but here. + */ +#ifndef highbit64 +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. 
+ */ +int +highbit64(uint64_t i) +{ + if (i == 0) + return (0); + + return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); +} +#endif + +const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { + {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, + {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, + {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, + {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, + {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, + {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"}, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, + {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, + {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, + {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, + {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, + {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, + {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, + {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, + {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history 
offsets" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, + {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, + {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, + {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"}, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"}, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"}, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, + {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, + {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, + {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } +}; + #if defined(_KERNEL) EXPORT_SYMBOL(zfs_allocatable_devs); EXPORT_SYMBOL(zfs_special_devs); @@ -251,4 +329,5 @@ EXPORT_SYMBOL(zfs_zpl_version_map); EXPORT_SYMBOL(zfs_spa_version_map); EXPORT_SYMBOL(zfs_history_event_names); EXPORT_SYMBOL(zfs_dataset_name_hidden); +EXPORT_SYMBOL(dmu_ot); #endif diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index a7a5c89bdafb..636abb29ad73 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -102,63 +102,6 @@ uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; */ uint_t dmu_ddt_copies = 0; -const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - 
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, - {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, - {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, - {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, - {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, - {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, - {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, - {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, - {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, - {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, - {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, - {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, - {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, - {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"}, - {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, - {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, - {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, - {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, - {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, - {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, - {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, - {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, - {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, - {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, - {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" }, - {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, - {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, - {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, - 
{DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, - {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, - {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, - {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"}, - {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"}, - {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"}, - {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, - {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, - {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, - {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, - {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, - {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, - {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, - {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } -}; - dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, @@ -2974,7 +2917,6 @@ EXPORT_SYMBOL(dmu_return_arcbuf); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); EXPORT_SYMBOL(dmu_buf_hold); -EXPORT_SYMBOL(dmu_ot); ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 5ca7c2320c4e..a827abb0cb0c 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -284,6 +284,14 @@ static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int 
get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); +/* + * Return true if zfs_cmd_t has a destination nvlist. + */ +static inline boolean_t zfs_cmd_has_dst_nvlist(zfs_cmd_t *zc) +{ + return (zc->zc_nvlist_dst != 0 && zc->zc_nvlist_dst_size != 0); +} + static void history_str_free(char *buf) { @@ -1929,12 +1937,22 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc) * outputs: * zc_stat stats on object * zc_value path to object + * + * ------------------------- + * Optional extended stats + * ------------------------- + * The legacy behavior of ZFS_IOC_OBJ_TO_STATS is to return a zfs_stat_t struct. + * However, if the user passes in an nvlist dst buffer, we also return + * "extended" object stats. Currently, these extended stats are handpicked + * fields from dmu_object_info_t, but they could be expanded to include + * anything. See the ZFS_OBJ_STAT_* macros for the current list. */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; + nvlist_t *nv; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, @@ -1944,10 +1962,23 @@ zfs_ioc_obj_to_stats(zfs_cmd_t *zc) dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } + + if (zfs_cmd_has_dst_nvlist(zc)) { + VERIFY0(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); + } else + nv = NULL; + error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, - sizeof (zc->zc_value)); + sizeof (zc->zc_value), nv); dmu_objset_rele_flags(os, B_TRUE, FTAG); + if (nv != NULL) { + error = put_nvlist(zc, nv); + if (error != 0) { + zfs_dbgmsg("cant put nvlist, err %d", error); + } + } + return (error); } diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 861783d79a82..02766284c434 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -152,7 +152,7 @@ zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, * Given an object number, return some zpl level statistics */ static int -zfs_obj_to_stats_impl(sa_handle_t *hdl, 
sa_attr_type_t *sa_table, +zfs_obj_to_stats_legacy(sa_handle_t *hdl, sa_attr_type_t *sa_table, zfs_stat_t *sb) { sa_bulk_attr_t bulk[4]; @@ -280,9 +280,36 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) return (error); } +static void +zfs_obj_to_stats_extended(sa_handle_t *hdl, nvlist_t *nv) +{ + dmu_object_info_t doi; + sa_object_info(hdl, &doi); + VERIFY0(nvlist_add_uint32(nv, ZFS_OBJ_STAT_DATA_BLOCK_SIZE, + doi.doi_data_block_size)); + VERIFY0(nvlist_add_uint32(nv, ZFS_OBJ_STAT_METADATA_BLOCK_SIZE, + doi.doi_metadata_block_size)); + VERIFY0(nvlist_add_uint64(nv, ZFS_OBJ_STAT_TYPE, + doi.doi_type)); + VERIFY0(nvlist_add_uint64(nv, ZFS_OBJ_STAT_BONUS_TYPE, + doi.doi_bonus_type)); + VERIFY0(nvlist_add_uint8(nv, ZFS_OBJ_STAT_CHECKSUM, + doi.doi_checksum)); + VERIFY0(nvlist_add_uint8(nv, ZFS_OBJ_STAT_COMPRESS, + doi.doi_compress)); + VERIFY0(nvlist_add_uint64(nv, ZFS_OBJ_STAT_DNODE_SIZE, + doi.doi_dnodesize)); + VERIFY0(nvlist_add_uint64(nv, ZFS_OBJ_STAT_PHYSICAL_BLOCKS_512, + doi.doi_physical_blocks_512)); + VERIFY0(nvlist_add_uint64(nv, ZFS_OBJ_STAT_MAX_OFFSET, + doi.doi_max_offset)); + VERIFY0(nvlist_add_uint64(nv, ZFS_OBJ_STAT_FILL_COUNT, + doi.doi_fill_count)); +} + int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, - char *buf, int len) + char *buf, int len, nvlist_t *nv) { char *path = buf + len - 1; sa_attr_type_t *sa_table; @@ -300,12 +327,16 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, if (error != 0) return (error); - error = zfs_obj_to_stats_impl(hdl, sa_table, sb); + error = zfs_obj_to_stats_legacy(hdl, sa_table, sb); + if (error != 0) { zfs_release_sa_handle(hdl, db, FTAG); return (error); } + if (nv != NULL) + zfs_obj_to_stats_extended(hdl, nv); + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index aeea58bedfe4..816d40301cf6 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -88,8 +88,6 @@ 
static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; /* Mark IOs as "slow" if they take longer than 30 seconds */ static uint_t zio_slow_io_ms = (30 * MILLISEC); -#define BP_SPANB(indblkshift, level) \ - (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 9f531411fbe1..9c0fc0605cd0 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -577,7 +577,7 @@ tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', 'zpool_status_007_pos', 'zpool_status_008_pos', - 'zpool_status_features_001_pos'] + 'zpool_status_features_001_pos', 'zpool_status_-v'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 678c01b58f94..464ce22b382d 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1299,6 +1299,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_-v.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-v.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-v.ksh new file mode 100755 index 000000000000..7590e6c5983a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_-v.ksh @@ -0,0 +1,181 @@ +#!/bin/ksh -p +# 
SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2025 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify error log ranges from 'zpool status -vv' after corrupting a file +# +# STRATEGY: +# 1. Create files with different record sizes +# 2. Create a zvol +# 3. zinject errors into files, zvol, and MOS. Inject at different byte ranges. +# 4. Verify error reporting in zpool status with different flags + +verify_runnable "both" + +log_assert "Verify zpool status prints error log ranges" + +# Given a list of lines in $1, look for each line somewhere in stdin (the +# zpool status output). +function check_status +{ + # Read stdin + out="$(cat -)" + + lines="$1" + while IFS= read -r line; do + if ! 
echo "$out" | grep -Fq "$line" ; then + log_fail "Didn't see '$line' string in: '$out'" + fi + done <<< "$lines" + log_note "Successfully saw '$lines'" +} + +function cleanup +{ + log_must zinject -c all + log_must zfs destroy $TESTPOOL/4k + log_must zfs destroy $TESTPOOL/1m + log_must zfs destroy $TESTPOOL/$TESTVOL +} +log_onexit cleanup + +log_must zfs set compression=off $TESTPOOL +log_must zfs create -o recordsize=4k $TESTPOOL/4k +log_must zfs create -o recordsize=1M $TESTPOOL/1m +log_must mkfile 1m /$TESTPOOL/4k/4k_file1 +log_must mkfile 1m /$TESTPOOL/4k/4k_file2 +log_must mkfile 10m /$TESTPOOL/1m/1m_file + +export TESTVOL=testvol +log_must zfs create -V 100M -o compression=off -o volblocksize=4k $TESTPOOL/$TESTVOL +log_must mkfile 10m ${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +# Corrupt the MOS. We do two scrubs here since the MOS error doesn't always +# show up after the first scrub for some reason. +log_must zinject -t mosdir $TESTPOOL +log_must zpool scrub $TESTPOOL +log_must wait_scrubbed $TESTPOOL +log_must zpool scrub $TESTPOOL +log_must wait_scrubbed $TESTPOOL + +log_must zinject -t data -e checksum -f 100 /$TESTPOOL/4k/4k_file1 +log_must zinject -t data -e checksum -f 100 /$TESTPOOL/4k/4k_file2 +log_must zinject -t data -e checksum -f 100 /$TESTPOOL/1m/1m_file +log_must zinject -t data -e checksum -f 100 ${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL + +# Try to read first 10 blocks of '4k_file1'. The read should fail. +log_mustnot dd conv=fsync if=/$TESTPOOL/4k/4k_file1 of=/dev/null bs=4k count=10 +log_must zpool sync + +# Try to read blocks 0 and blocks 4-5 and 6-7 on '4k_file2' to create multiple +# ranges. The read should fail. 
+log_mustnot dd conv=fsync if=/$TESTPOOL/4k/4k_file2 of=/dev/null bs=4k count=1 skip=0 +log_must zpool sync +log_mustnot dd conv=fsync if=/$TESTPOOL/4k/4k_file2 of=/dev/null bs=4k count=2 skip=4 +log_must zpool sync +log_mustnot dd conv=fsync if=/$TESTPOOL/4k/4k_file2 of=/dev/null bs=4k count=2 skip=6 +log_must zpool sync + +# Try to read the 2nd megabyte of '1m_file' +log_mustnot dd conv=fsync if=/$TESTPOOL/1m/1m_file of=/dev/null bs=1M skip=1 count=1 +log_must zpool sync + + +# Try to read some ranges of the zvol. +# +# NOTE: for whatever reason, reading from the 1st megabyte of the zvol, with +# any block size, will not produce an error. If you read past the 1st megabyte +# it will. +log_mustnot dd conv=fsync if=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL of=/dev/null bs=4k count=1 skip=1000 +log_must zpool sync + +log_mustnot dd conv=fsync if=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL of=/dev/null bs=4k count=3 skip=1100 +log_must zpool sync + +log_must zinject -c all + +log_must zpool status -vv + +# Look for each these lines somewhere in the zpool status output +val=$(cat << END +:<0x1> (no ranges) +/testpool/4k/4k_file1 0-4.00K +/testpool/4k/4k_file2 0-4.00K,16K-20.0K,24K-28.0K +/testpool/1m/1m_file 1M-2.00M +testpool/testvol:<0x1> 3.91M-3.91M,4.30M-4.30M +END +) +zpool status -vv | check_status "$val" + +val=$(cat << END +:<0x1> (no ranges) +/testpool/4k/4k_file1 0-4095 +/testpool/4k/4k_file2 0-4095,16384-20479,24576-28671 +/testpool/1m/1m_file 1048576-2097151 +testpool/testvol:<0x1> 4096000-4100095,4505600-4509695 +END +) +zpool status -vvp | check_status "$val" + +# Look only at the '.pools.testpool.errors' JSON object for output. We +# remove the .object and .dataset objects since they are non-deterministic +# values. 
+# +# Look at four variants of the JSON output (-vj -vvj, -vvjp, -vvjp --json-int) +val=$(cat << END +{":<0x1>":{"name":":<0x1>"},"/testpool/4k/4k_file1":{"name":"/testpool/4k/4k_file1"},"/testpool/4k/4k_file2":{"name":"/testpool/4k/4k_file2"},"/testpool/1m/1m_file":{"name":"/testpool/1m/1m_file"},"testpool/testvol:<0x1>":{"name":"testpool/testvol:<0x1>"}} +END +) +zpool status -vj | jq -c '.pools.testpool.errors | del (.[].object,.[].dataset)' | check_status "$val" + +val=$(cat << END +{":<0x1>":{"name":":<0x1>"},"/testpool/4k/4k_file1":{"object_type":"ZFS plain file","ranges":[{"start_byte":"0","end_byte":"4.00K"}],"name":"/testpool/4k/4k_file1","block_size":"4K"},"/testpool/4k/4k_file2":{"object_type":"ZFS plain file","ranges":[{"start_byte":"0","end_byte":"4.00K"},{"start_byte":"16K","end_byte":"20.0K"},{"start_byte":"24K","end_byte":"28.0K"}],"name":"/testpool/4k/4k_file2","block_size":"4K"},"/testpool/1m/1m_file":{"object_type":"ZFS plain file","ranges":[{"start_byte":"1M","end_byte":"2.00M"}],"name":"/testpool/1m/1m_file","block_size":"1M"},"testpool/testvol:<0x1>":{"object_type":"zvol","ranges":[{"start_byte":"3.91M","end_byte":"3.91M"},{"start_byte":"4.30M","end_byte":"4.30M"}],"name":"testpool/testvol:<0x1>","block_size":"4K"}} +END +) +zpool status -vvj | jq -c '.pools.testpool.errors | del (.[].object,.[].dataset)' | check_status "$val" + +val=$(cat << END +{":<0x1>":{"name":":<0x1>"},"/testpool/4k/4k_file1":{"object_type":"ZFS plain file","ranges":[{"start_byte":"0","end_byte":"4095"}],"name":"/testpool/4k/4k_file1","block_size":"4096"},"/testpool/4k/4k_file2":{"object_type":"ZFS plain file","ranges":[{"start_byte":"0","end_byte":"4095"},{"start_byte":"16384","end_byte":"20479"},{"start_byte":"24576","end_byte":"28671"}],"name":"/testpool/4k/4k_file2","block_size":"4096"},"/testpool/1m/1m_file":{"object_type":"ZFS plain 
file","ranges":[{"start_byte":"1048576","end_byte":"2097151"}],"name":"/testpool/1m/1m_file","block_size":"1048576"},"testpool/testvol:<0x1>":{"object_type":"zvol","ranges":[{"start_byte":"4096000","end_byte":"4100095"},{"start_byte":"4505600","end_byte":"4509695"}],"name":"testpool/testvol:<0x1>","block_size":"4096"}} +END +) +zpool status -vvjp | jq -c '.pools.testpool.errors | del (.[].object,.[].dataset)' | check_status "$val" + +val=$(cat << END +{":<0x1>":{"name":":<0x1>"},"/testpool/4k/4k_file1":{"object_type":"ZFS plain file","ranges":[{"start_byte":0,"end_byte":4095}],"name":"/testpool/4k/4k_file1","block_size":4096},"/testpool/4k/4k_file2":{"object_type":"ZFS plain file","ranges":[{"start_byte":0,"end_byte":4095},{"start_byte":16384,"end_byte":20479},{"start_byte":24576,"end_byte":28671}],"name":"/testpool/4k/4k_file2","block_size":4096},"/testpool/1m/1m_file":{"object_type":"ZFS plain file","ranges":[{"start_byte":1048576,"end_byte":2097151}],"name":"/testpool/1m/1m_file","block_size":1048576},"testpool/testvol:<0x1>":{"object_type":"zvol","ranges":[{"start_byte":4096000,"end_byte":4100095},{"start_byte":4505600,"end_byte":4509695}],"name":"testpool/testvol:<0x1>","block_size":4096}} +END +) +zpool status -vvjp --json-int | jq -c '.pools.testpool.errors | del (.[].object,.[].dataset)' | check_status "$val" + +# Clear the error log from our pool +log_must zpool scrub $TESTPOOL +log_must zpool clear $TESTPOOL +log_must wait_scrubbed $TESTPOOL + +log_pass "zpool status error log output is correct"