diff --git a/.mailmap b/.mailmap deleted file mode 100644 index e8e71435ca9..00000000000 --- a/.mailmap +++ /dev/null @@ -1,113 +0,0 @@ -# This file exists to help consolidate names and email addresses -# (e.g., when people accidentally commit with an incorrect or local -# email address). Two common use cases: -# -# 1. Consolidate multiple email addresses from a single person. -# Example: one commit from John Smith is from -# and another is from -# , and a third is from -# . But they're all from -# the same John Smith person. -# -# 2. Consolidate misspellings / altername names from a single person. -# Example: one commit is from "John Smith" and another is from -# "John Smith, CONTRACTOR", and third is from "RocketMan 9982". But -# these are all really the same person, who can be listed once in -# AUTHORS as "John Smith". -# -# The format of this file is documented in git-shortlog(1). Specifically, -# a line like this: -# -# Proper Name -# -# means that when git sees "commit@email.xx" it will display -# "Proper Name " instead in certain circumstances. Those -# circumstances include: -# -# - git shortlog -# - git blame -# - git log --format=tformat:"%aN <%aE>" (and similar) -# - -Jeff Squyres -Jeff Squyres --quiet <--quiet> -Jeff Squyres - -George Bosilca - -Howard Pritchard -Howard Pritchard - -Andrew Friedley - -Devendar Bureddy - -Edgar Gabriel -Edgar Gabriel - -Gilles Gouaillardet - -Matias A Cabral -Matias A Cabral - -Pavel Shamis -Pavel Shamis -Pavel Shamis - -Todd Kordenbrock - -Yohann Burette -Yohann Burette - -MPI Team (bot) -MPI Team (bot) -MPI Team (bot) - -Yossi Itigin - -Josh Hursey -Josh Hursey - -Adrian Reber - -Elena Elkina -Elena Elkina - -Igor Ivanov -Igor Ivanov - -Mangala Jyothi Bhaskar -Mangala Jyothi Bhaskar - -Ralph Castain -Ralph Castain - -Rolf vandeVaart - -Karol Mroz - -Nadezhda Kogteva - -Thananon Patinyasakdikul - -Nysal Jan K A -Nysal Jan K A - -Zhi Ming Wang - -Annapurna Dasari - -L. R. Rajeshnarayanan - -Aurelien Bouteiller -Aurelien Bouteiller - -Alex Mikheev - -Thomas Naughton - -Geoffrey Paulsen - -Anandhi S Jayakumar - -Mohan Gandhi diff --git a/ompi/mca/io/romio321/romio/adio/Makefile.mk b/ompi/mca/io/romio321/romio/adio/Makefile.mk index ffc05cb4151..8255680282a 100644 --- a/ompi/mca/io/romio321/romio/adio/Makefile.mk +++ b/ompi/mca/io/romio321/romio/adio/Makefile.mk @@ -22,7 +22,8 @@ noinst_HEADERS += \ adio/include/nopackage.h \ adio/include/romioconf-undefs.h \ adio/include/mpiu_external32.h \ - adio/include/hint_fns.h + adio/include/hint_fns.h \ + adio/include/ad_env.h include $(top_srcdir)/adio/ad_gpfs/Makefile.mk include $(top_srcdir)/adio/ad_gpfs/bg/Makefile.mk @@ -43,5 +44,6 @@ include $(top_srcdir)/adio/ad_testfs/Makefile.mk include $(top_srcdir)/adio/ad_ufs/Makefile.mk include $(top_srcdir)/adio/ad_xfs/Makefile.mk include $(top_srcdir)/adio/ad_zoidfs/Makefile.mk +include $(top_srcdir)/adio/ad_oceanfs/Makefile.mk include $(top_srcdir)/adio/common/Makefile.mk diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/Makefile.mk b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/Makefile.mk new file mode 100644 index 00000000000..75eebf198ac --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/Makefile.mk @@ -0,0 +1,39 @@ +## -*- Mode: Makefile; -*- +## vim: set ft=automake : +## +## (C) 2017 by DataDirect Networks +## See COPYRIGHT in top-level directory. 
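+## This fragment is only built when the BUILD_AD_OCEANFS Automake conditional
+## is true.  That conditional is expected to be defined by ROMIO's configure
+## machinery; a minimal sketch (the shell variable name is an assumption, not
+## taken from this patch) would be:
+##
+##     AM_CONDITIONAL([BUILD_AD_OCEANFS], [test x"$file_system_oceanfs" = x"yes"])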
+## + +if BUILD_AD_OCEANFS + +noinst_HEADERS += adio/ad_oceanfs/ad_oceanfs.h \ + adio/ad_oceanfs/ad_oceanfs_common.h \ + adio/ad_oceanfs/ad_oceanfs_file.h \ + adio/ad_oceanfs/ad_oceanfs_pub.h \ + adio/ad_oceanfs/ad_oceanfs_tuning.h \ + adio/ad_oceanfs/ad_oceanfs_group_tuning.h \ + adio/ad_oceanfs/ad_oceanfs_aggrs.h \ + adio/ad_oceanfs/mpi_fs_intf.h +romio_other_sources += \ + adio/ad_oceanfs/ad_oceanfs.c \ + adio/ad_oceanfs/ad_oceanfs_close.c \ + adio/ad_oceanfs/ad_oceanfs_common.c \ + adio/ad_oceanfs/ad_oceanfs_fcntl.c \ + adio/ad_oceanfs/ad_oceanfs_io.c \ + adio/ad_oceanfs/ad_oceanfs_open.c \ + adio/ad_oceanfs/ad_oceanfs_file.c \ + adio/ad_oceanfs/ad_oceanfs_pub.c \ + adio/ad_oceanfs/ad_oceanfs_tuning.c \ + adio/ad_oceanfs/ad_oceanfs_group_tuning.c \ + adio/ad_oceanfs/ad_oceanfs_aggrs.c \ + adio/ad_oceanfs/ad_oceanfs_rdstr.c \ + adio/ad_oceanfs/ad_oceanfs_rdcoll.c \ + adio/ad_oceanfs/ad_oceanfs_wrcoll.c \ + adio/ad_oceanfs/ad_oceanfs_resize.c \ + adio/ad_oceanfs/ad_oceanfs_hints.c \ + adio/ad_oceanfs/ad_oceanfs_view.c \ + adio/ad_oceanfs/ad_oceanfs_viewio.c \ + adio/ad_oceanfs/mpi_fs_intf.c + +endif BUILD_AD_OCEANFS diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs.c new file mode 100644 index 00000000000..8b01f4d44bc --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs.c @@ -0,0 +1,32 @@ +#include "ad_oceanfs.h" +#include "adioi.h" + +struct ADIOI_Fns_struct ADIO_OCEANFS_operations = { + ADIOI_OCEANFS_Open, /* Open */ + ADIOI_GEN_OpenColl, /* OpenColl */ + ADIOI_OCEANFS_ReadContig, /* ReadContig */ + ADIOI_OCEANFS_WriteContig, /* WriteContig */ + ADIOI_OCEANFS_ReadStridedColl, /* ReadStridedColl */ + ADIOI_OCEANFS_WriteStridedColl, /* WriteStridedColl */ + ADIOI_GEN_SeekIndividual, /* SeekIndividual */ + ADIOI_OCEANFS_Fcntl, /* Fcntl */ + ADIOI_OCEANFS_SetInfo, /* SetInfo */ + ADIOI_OCEANFS_ReadStrided, /* ReadStrided */ + ADIOI_GEN_WriteStrided, /* WriteStrided */ + ADIOI_OCEANFS_Close, /* Close */ + ADIOI_FAKE_IreadContig, /* IreadContig */ + ADIOI_FAKE_IwriteContig, /* IwriteContig */ + ADIOI_FAKE_IODone, /* ReadDone */ + ADIOI_FAKE_IODone, /* WriteDone */ + ADIOI_FAKE_IOComplete, /* ReadComplete */ + ADIOI_FAKE_IOComplete, /* WriteComplete */ + ADIOI_FAKE_IreadStrided, /* IreadStrided */ + ADIOI_FAKE_IwriteStrided, /* IwriteStrided */ + ADIOI_GEN_Flush, /* Flush */ + ADIOI_OCEANFS_Resize, /* Resize */ + ADIOI_GEN_Delete, /* Delete */ + ADIOI_GEN_Feature, /* Feature */ + ADIOI_OCEANFS_PREFIX, + ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */ + ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */ +}; diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs.h b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs.h new file mode 100644 index 00000000000..2149bc7588c --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs.h @@ -0,0 +1,48 @@ +#ifndef AD_OCEANFS_H_INCLUDED +#define AD_OCEANFS_H_INCLUDED + +#include "adio.h" +#include "ad_oceanfs_tuning.h" + +#define OCEANFS_READ 0 +#define OCEANFS_WRITE 1 +#define OCEANFS_READ_STRIDED 2 +#define OCEANFS_READ_COLL 3 +#define OCEANFS_WRITE_STRIDED 4 +#define OCEANFS_WRITE_COLL 5 + +#define ADIOI_OCEANFS_PREFIX "oceanfs:" +#define ADIOI_OCEANFS_PREFIX_LEN (sizeof(ADIOI_OCEANFS_PREFIX) - 1) + +void ADIOI_OCEANFS_Open(ADIO_File fd, int *error_code); + +void ADIOI_OCEANFS_Close(ADIO_File fd, int *error_code); + +void ADIOI_OCEANFS_ReadContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + 
ADIO_Offset offset, ADIO_Status *status, int *error_code); + +void ADIOI_OCEANFS_WriteContig(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code); + +void ADIOI_OCEANFS_ReadStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code); + +void ADIOI_OCEANFS_ReadStridedColl(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code); + +void ADIOI_OCEANFS_WriteStridedColl(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code); + +void ADIOI_OCEANFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code); + +void ADIOI_OCEANFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code); + +void ADIOI_OCEANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); + +int ADIOI_OCEANFS_set_view(ADIO_File fd, int *error_code); + +int ADIOI_OCEANFS_StridedViewIO(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int io_flag, int *error_code); +int ADIOI_OCEANFS_Set_lock(FDTYPE fd, int cmd, int type, ADIO_Offset offset, int whence, ADIO_Offset len); + +#endif /* AD_OCEANFS_H_INCLUDED */ diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_aggrs.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_aggrs.c new file mode 100644 index 00000000000..646162b6cfd --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_aggrs.c @@ -0,0 +1,554 @@ +#include "ad_oceanfs_aggrs.h" +#include "ad_oceanfs_pub.h" +#include "adio.h" +#include "adio_cb_config_list.h" + +#ifdef AGGREGATION_PROFILE +#include "mpe.h" +#endif + +#ifdef USE_DBG_LOGGING +#define AGG_DEBUG 1 +#endif + +#ifndef TRACE_ERR +#define TRACE_ERR(format...) +#endif + +/* Comments copied from common: + * ADIOI_Calc_aggregator + * ADIOI_Calc_file_domains + * ADIOI_Calc_my_req + * ADIOI_Calc_others_req + * The last three of these were originally in ad_read_coll.c, but they are + * also shared with ad_write_coll.c. I felt that they were better kept with + * the rest of the shared aggregation code. + * ADIO_Offset st_offsets[0..nprocs-1] + * ADIO_Offset end_offsets[0..nprocs-1] + * These contain a list of start and end offsets for each process in + * the communicator. For example, an access at loc 10, size 10 would + * have a start offset of 10 and end offset of 19. + * int nprocs + * number of processors in the collective I/O communicator + * ADIO_Offset min_st_offset + * ADIO_Offset fd_start[0..nprocs_for_coll-1] + * starting location of "file domain"; region that a given process will + * perform aggregation for (i.e. actually do I/O) + * ADIO_Offset fd_end[0..nprocs_for_coll-1] + * start + size - 1 roughly, but it can be less, or 0, in the case of + * uneven distributions + * + * Description from common/ad_aggregate.c. + * ADIOI_Calc_aggregator() + * + * The intention here is to implement a function which provides basically + * the same functionality as in Rajeev's original version of + * ADIOI_Calc_my_req(). He used a ceiling division approach to assign the + * file domains, and we use the same approach here when calculating the + * location of an offset/len in a specific file domain. 
Further we assume + * this same distribution when calculating the rank_index, which is later + * used to map to a specific process rank in charge of the file domain. + * + * A better (i.e. more general) approach would be to use the list of file + * domains only. This would be slower in the case where the + * original ceiling division was used, but it would allow for arbitrary + * distributions of regions to aggregators. We'd need to know the + * nprocs_for_coll in that case though, which we don't have now. + * + * this code doesn't necessarily return a rank in the range + * 0..nprocs_for_coll; instead you get something in 0..nprocs. This is a + * result of the rank mapping; any set of ranks in the communicator could be + * used now. + * Returns an integer representing a rank in the collective I/O communicator. + * The "len" parameter is also modified to indicate the amount of data + * actually available in this file domain. + * + * This is more general aggregator search function which does not base on the assumption + * that each aggregator hosts the file domain with the same size + */ +int ADIOI_OCEANFS_Calc_aggregator(ADIO_File fd, ADIO_Offset off, ADIO_Offset min_off, ADIO_Offset *len, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end) +{ + int rank_index, rank; + ADIO_Offset avail_bytes; + + ADIOI_Assert((off <= fd_end[fd->hints->cb_nodes - 1] && off >= min_off && fd_start[0] >= min_off)); + + if (fd_size != 0) { + + /* get an index into our array of aggregators */ + rank_index = (int)((off - min_off + fd_size) / fd_size - 1); + + if (fd->hints->striping_unit > 0) { + /* wkliao: implementation for file domain alignment + fd_start[] and fd_end[] have been aligned with file lock + boundaries when returned from ADIOI_Calc_file_domains() so cannot + just use simple arithmatic as above */ + rank_index = 0; + while (off > fd_end[rank_index]) { + rank_index++; + } + } + } else { + rank_index = -1; + } + /* we index into fd_end with rank_index, and fd_end was allocated to be no + * bigger than fd->hins->cb_nodes. If we ever violate that, we're + * overrunning arrays. Obviously, we should never ever hit this abort */ + if (rank_index >= fd->hints->cb_nodes || rank_index < 0) { + MPI_Abort(MPI_COMM_WORLD, 1); + } + + /* + * remember here that even in Rajeev's original code it was the case that + * different aggregators could end up with different amounts of data to + * aggregate. here we use fd_end[] to make sure that we know how much + * data this aggregator is working with. + * + * the +1 is to take into account the end vs. length issue. + */ + avail_bytes = fd_end[rank_index] + 1 - off; + if (avail_bytes < *len && avail_bytes > 0) { + /* this file domain only has part of the requested contig. region */ + *len = avail_bytes; + } + + /* map our index to a rank */ + /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */ + rank = fd->hints->ranklist[rank_index]; + + return rank; +} + +blksize_t ad_aggrs_PreprocBlksize(ADIO_File fd) +{ + static const int blksize = 1048576; + if (fd->blksize <= 0) { + /* default to 1M if blksize unset */ + fd->blksize = blksize; + } + + return fd->blksize; +} + +/* + * Compute a dynamic access range based file domain partition among I/O aggregators, + * which align to the OCEANFS block size + * Divide the I/O workload among "nprocs_for_coll" processes. This is + * done by (logically) dividing the file into file domains (FDs); each + * process may directly access only its own file domain. 
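+ * A worked example (illustrative, assuming the default 1 MB block size): an
+ * access range spanning roughly 1.5 MB .. 5.2 MB is first rounded out to the
+ * block-aligned range [1 MB, 6 MB); those whole blocks are divided among the
+ * aggregators, and the first and last domains are then trimmed back to the
+ * true start/end offsets (see oceanfs_lb/oceanfs_ub and the *_rdoff
+ * adjustments below).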
+ * Additional effort is to make sure that each I/O aggregator get + * a file domain that aligns to the OCEANFS block size. So, there will + * not be any false sharing of OCEANFS file blocks among multiple I/O nodes. + * + * The common version of this now accepts a min_fd_size and striping_unit. + * It doesn't seem necessary here (using OCEANFS block sizes) but keep it in mind + * (e.g. we could pass striping unit instead of using fs_ptr->blksize). + */ +void ADIOI_OCEANFS_Calc_file_domains(ADIO_File fd, ADIO_Offset *st_offsets, ADIO_Offset *end_offsets, int nprocs, + int nprocs_for_coll, ADIO_Offset *min_st_offset_ptr, ADIO_Offset **fd_start_ptr, ADIO_Offset **fd_end_ptr, + ADIO_Offset *fd_size_ptr, void *fs_ptr) +{ + ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size; + int i, aggr; + + OCEANFS_Log_event(MPE_Log_ID_5004); + + blksize_t blksize = ad_aggrs_PreprocBlksize(fd); + + /* find min of start offsets and max of end offsets of all processes */ + min_st_offset = st_offsets[0]; + max_end_offset = end_offsets[0]; + for (i = 1; i < nprocs; i++) { + min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]); + max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]); + } + + /* determine the "file domain (FD)" of each process, i.e., the portion of + the file that will be "owned" by each process */ + ADIO_Offset oceanfs_ub = (max_end_offset + blksize - 1) / blksize * blksize - 1; + ADIO_Offset oceanfs_lb = min_st_offset / blksize * blksize; + ADIO_Offset oceanfs_ub_rdoff = (max_end_offset + blksize - 1) / blksize * blksize - 1 - max_end_offset; + ADIO_Offset oceanfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize; + ADIO_Offset fd_oceanfs_range = oceanfs_ub - oceanfs_lb + 1; + + int naggs = nprocs_for_coll; + + /* Tweak the file domains so that no fd is smaller than a threshold. We + * have to strike a balance between efficency and parallelism: somewhere + * between 10k processes sending 32-byte requests and one process sending a + * 320k request is a (system-dependent) sweet spot + + This is from the common code - the new min_fd_size parm that we didn't implement. + (And common code uses a different declaration of fd_size so beware) + + if (fd_size < min_fd_size) + fd_size = min_fd_size; + */ + fd_size = (ADIO_Offset *)ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset)); + *fd_start_ptr = (ADIO_Offset *)ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset)); + *fd_end_ptr = (ADIO_Offset *)ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset)); + fd_start = *fd_start_ptr; + fd_end = *fd_end_ptr; + + /* each process will have a file domain of some number of OCEANFS blocks, but + * the division of blocks is not likely to be even. Some file domains will + * be "large" and others "small" + * + * Example: consider 17 blocks distributed over 3 aggregators. + * nb_cn_small = 17/3 = 5 + * naggs_large = 17 - 3*(17/3) = 17 - 15 = 2 + * naggs_small = 3 - 2 = 1 + * + * and you end up with file domains of {5-blocks, 6-blocks, 6-blocks} + * + * what about (relatively) small files? say, a file of 1000 blocks + * distributed over 2064 aggregators: + * nb_cn_small = 1000/2064 = 0 + * naggs_large = 1000 - 2064*(1000/2064) = 1000 + * naggs_small = 2064 - 1000 = 1064 + * and you end up with domains of {0, 0, 0, ... 1, 1, 1 ...} + * + * it might be a good idea instead of having all the zeros up front, to + * "mix" those zeros into the fd_size array. that way, no pset/bridge-set + * is left with zero work. 
In fact, even if the small file domains aren't + * zero, it's probably still a good idea to mix the "small" file domains + * across the fd_size array to keep the io nodes in balance */ + ADIO_Offset n_oceanfs_blk = fd_oceanfs_range / blksize; + ADIO_Offset nb_cn_small = n_oceanfs_blk / naggs; + ADIO_Offset naggs_large = n_oceanfs_blk - naggs * (n_oceanfs_blk / naggs); + ADIO_Offset fd_size_small = (nb_cn_small + 1) * blksize; + ADIO_Offset fd_size_large = nb_cn_small * blksize; + + for (i = 0; i < naggs; i++) { + if (i < naggs_large) { + fd_size[i] = fd_size_small; + } else { + fd_size[i] = fd_size_large; + } + } + + fd_size[0] -= oceanfs_lb_rdoff; + fd_size[naggs - 1] -= oceanfs_ub_rdoff; + + /* compute the file domain for each aggr */ + ADIO_Offset offset = min_st_offset; + for (aggr = 0; aggr < naggs; aggr++) { + fd_start[aggr] = offset; + fd_end[aggr] = offset + fd_size[aggr] - 1; + offset += fd_size[aggr]; + } + + *fd_size_ptr = fd_size[0]; + *min_st_offset_ptr = min_st_offset; + + OCEANFS_Log_event(MPE_Log_ID_5005); + + ADIOI_Free(fd_size); +} + +/* + * ADIOI_OCEANFS_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation + * is specific for static file domain partitioning. + * + * ADIOI_Calc_my_req() - calculate what portions of the access requests + * of this process are located in the file domains of various processes + * (including this one) + */ +void ADIOI_OCEANFS_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset *fd_start, ADIO_Offset *fd_end, ADIO_Offset fd_size, int nprocs, + int *count_procs_ptr, int **count_per_proc_ptr, ADIOI_Access **my_req_ptr, OCEANFS_Int **buf_idx_ptr) +/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets? + They are used as memory buffer indices so it seems like the 2G limit is in effect */ +{ + int *count_my_req_per_proc = NULL; + OCEANFS_Int *buf_idx = NULL; + int i, l, proc; + ADIO_Offset fd_len, rem_len, curr_idx, off; + ADIOI_Access *my_req = NULL; + TRACE_ERR("Entering ADIOI_OCEANFS_Calc_my_req\n"); + +#ifdef AGGREGATION_PROFILE + MPE_Log_event(MPE_Log_ID_5024, 0, NULL); +#endif + /* count_my_req_per_proc[i] gives the no. of contig. requests of this + process in process i's file domain. calloc initializes to zero. + I'm allocating memory of size nprocs, so that I can do an + MPI_Alltoall later on. + */ + *count_per_proc_ptr = (int *)ADIOI_Calloc(nprocs, sizeof(int)); + count_my_req_per_proc = *count_per_proc_ptr; + + /* buf_idx is relevant only if buftype_is_contig. + buf_idx[i] gives the index into user_buf where data received + from proc. i should be placed. This allows receives to be done + without extra buffer. This can't be done if buftype is not contig. + */ + buf_idx = (OCEANFS_Int *)ADIOI_Malloc(nprocs * sizeof(OCEANFS_Int)); + + /* initialize buf_idx to -1 */ + for (i = 0; i < nprocs; i++) { + buf_idx[i] = -1; + } + + /* one pass just to calculate how much space to allocate for my_req; + * contig_access_count was calculated way back in ADIOI_Calc_my_off_len() + */ + for (i = 0; i < contig_access_count; i++) { + /* short circuit offset/len processing if len == 0 + * (zero-byte read/write */ + if (len_list[i] == 0) { + continue; + } + off = offset_list[i]; + fd_len = len_list[i]; + /* note: we set fd_len to be the total size of the access. 
then + * ADIOI_Calc_aggregator() will modify the value to return the + * amount that was available from the file domain that holds the + * first part of the access. + */ + /* BES */ + proc = ADIOI_OCEANFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_start, fd_end); + count_my_req_per_proc[proc]++; + + /* figure out how much data is remaining in the access (i.e. wasn't + * part of the file domain that had the starting byte); we'll take + * care of this data (if there is any) in the while loop below. + */ + rem_len = len_list[i] - fd_len; + + while (rem_len > 0) { + off += fd_len; /* point to first remaining byte */ + fd_len = rem_len; /* save remaining size, pass to calc */ + proc = ADIOI_OCEANFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_start, fd_end); + + count_my_req_per_proc[proc]++; + rem_len -= fd_len; /* reduce remaining length by amount from fd */ + } + } + + /* now allocate space for my_req, offset, and len */ + *my_req_ptr = (ADIOI_Access *)ADIOI_Malloc(nprocs * sizeof(ADIOI_Access)); + my_req = *my_req_ptr; + + AllocAccess(my_req, nprocs, count_my_req_per_proc, count_procs_ptr); + + /* now fill in my_req */ + curr_idx = 0; + for (i = 0; i < contig_access_count; i++) { + /* short circuit offset/len processing if len == 0 + * (zero-byte read/write */ + if (len_list[i] == 0) { + continue; + } + off = offset_list[i]; + fd_len = len_list[i]; + proc = ADIOI_OCEANFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_start, fd_end); + + /* for each separate contiguous access from this process */ + if (buf_idx[proc] == -1) { + ADIOI_Assert(curr_idx == (int)curr_idx); + buf_idx[proc] = (OCEANFS_Int)curr_idx; + } + + l = my_req[proc].count; + curr_idx += fd_len; + + rem_len = len_list[i] - fd_len; + + /* store the proc, offset, and len information in an array + * of structures, my_req. Each structure contains the + * offsets and lengths located in that process's FD, + * and the associated count. + */ + my_req[proc].offsets[l] = off; + my_req[proc].lens[l] = fd_len; + my_req[proc].count++; + + while (rem_len > 0) { + off += fd_len; + fd_len = rem_len; + proc = ADIOI_OCEANFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_start, fd_end); + + if (buf_idx[proc] == -1) { + ADIOI_Assert(curr_idx == (int)curr_idx); + buf_idx[proc] = (OCEANFS_Int)curr_idx; + } + + l = my_req[proc].count; + curr_idx += fd_len; + rem_len -= fd_len; + + my_req[proc].offsets[l] = off; + my_req[proc].lens[l] = fd_len; + my_req[proc].count++; + } + } + + *buf_idx_ptr = buf_idx; +#ifdef AGGREGATION_PROFILE + MPE_Log_event(MPE_Log_ID_5025, 0, NULL); +#endif + TRACE_ERR("Leaving ADIOI_OCEANFS_Calc_my_req\n"); +} + +void ad_aggrs_CalcOffset(int nprocs, int* scounts, int* sdispls, int* rcounts, int* rdispls, + int* count_my_req_per_proc, int* count_others_req_per_proc, ADIOI_Access *my_req, + ADIOI_Access *others_req, void* sendBufForOffsets, void* recvBufForOffsets) +{ + int i; + for (i = 0; i < nprocs; i++) { + /* Send these offsets to process i. */ + scounts[i] = count_my_req_per_proc[i]; + if (scounts[i] == 0) { + sdispls[i] = 0; + } else { + sdispls[i] = (int)(((MPIU_Upint)my_req[i].offsets - (MPIU_Upint)sendBufForOffsets) / + (MPIU_Upint)sizeof(ADIO_Offset)); + } + /* Receive these offsets from process i. 
*/ + rcounts[i] = count_others_req_per_proc[i]; + if (rcounts[i] == 0) { + rdispls[i] = 0; + } else { + rdispls[i] = (int)(((MPIU_Upint)others_req[i].offsets - (MPIU_Upint)recvBufForOffsets) / + (MPIU_Upint)sizeof(ADIO_Offset)); + } + } +} + +void ad_aggrs_CalcLen(int nprocs, int* scounts, int* sdispls, int* rcounts, int* rdispls, + int* count_my_req_per_proc, int* count_others_req_per_proc, ADIOI_Access *my_req, + ADIOI_Access *others_req, void* sendBufForLens, void* recvBufForLens) +{ + int i; + for (i = 0; i < nprocs; i++) { + /* Send these lengths to process i. */ + scounts[i] = count_my_req_per_proc[i]; + if (scounts[i] == 0) { + sdispls[i] = 0; + } else { + sdispls[i] = (int)(((MPIU_Upint)my_req[i].lens - (MPIU_Upint)sendBufForLens) / (MPIU_Upint)sizeof(ADIO_Offset)); + } + /* Receive these offsets from process i. */ + rcounts[i] = count_others_req_per_proc[i]; + if (rcounts[i] == 0) { + rdispls[i] = 0; + } else { + rdispls[i] = (int)(((MPIU_Upint)others_req[i].lens - (MPIU_Upint)recvBufForLens) / (MPIU_Upint)sizeof(ADIO_Offset)); + } + } +} + +/* + * ADIOI_Calc_others_req (switched to all to all for performance) + * + * param[in] count_my_req_procs Number of processes whose file domain my + * request touches. + * param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of + * contig. requests of this process in + * process i's file domain. + * param[in] my_req A structure defining my request + * param[in] nprocs Number of nodes in the block + * param[in] myrank Rank of this node + * param[out] count_others_req_proc_ptr Number of processes whose requests lie in + * my process's file domain (including my + * process itself) + * param[out] others_req_ptr Array of other process' requests that lie + * in my process's file domain + */ +void ADIOI_OCEANFS_Calc_others_req(ADIO_File fd, int count_my_req_procs, int *count_my_req_per_proc, ADIOI_Access *my_req, + int nprocs, int myrank, int *count_others_req_procs_ptr, ADIOI_Access **others_req_ptr) +{ + /* count_others_req_procs = number of processes whose requests lie in + this process's file domain (including this process itself) + count_others_req_per_proc[i] indicates how many separate contiguous + requests of proc. i lie in this process's file domain. */ + int *count_others_req_per_proc = NULL; + int *scounts = ADIOI_Malloc(nprocs * sizeof(int)); + int *sdispls = ADIOI_Malloc(nprocs * sizeof(int)); + int *rcounts = ADIOI_Malloc(nprocs * sizeof(int)); + int *rdispls = ADIOI_Malloc(nprocs * sizeof(int)); + int i; + ADIOI_Access *others_req = NULL; + + /* Parameters for MPI_Alltoallv. These are the buffers, which + * are later computed to be the lowest address of all buffers + * to be sent/received for offsets and lengths. Initialize to + * the highest possible address which is the current minimum. + */ + void *sendBufForOffsets = (void *)0xFFFFFFFFFFFFFFFF; + void *sendBufForLens = (void *)0xFFFFFFFFFFFFFFFF; + void *recvBufForOffsets = (void *)0xFFFFFFFFFFFFFFFF; + void *recvBufForLens = (void *)0xFFFFFFFFFFFFFFFF; + + /* first find out how much to send/recv and from/to whom */ +#ifdef AGGREGATION_PROFILE + MPE_Log_event(MPE_Log_ID_5026, 0, NULL); +#endif + /* Send 1 int to each process. count_my_req_per_proc[i] is the number of + * requests that my process will do to the file domain owned by process[i]. + * Receive 1 int from each process. count_others_req_per_proc[i] is the number of + * requests that process[i] will do to the file domain owned by my process. 
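+     * Illustrative example: with 3 ranks, if rank 0 sends {0, 2, 1},
+     * rank 1 sends {1, 0, 0} and rank 2 sends {3, 0, 0}, then rank 0
+     * receives {0, 1, 3}: one contiguous request from rank 1 and three
+     * from rank 2 fall inside rank 0's file domain.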
+ */ + count_others_req_per_proc = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT, count_others_req_per_proc, 1, MPI_INT, fd->comm); + + /* total_cora2a+=timebase()-cora2a1; + * Allocate storage for an array of other nodes' accesses of our + * node's file domain. Also allocate storage for the alltoallv + * parameters. + */ + *others_req_ptr = (ADIOI_Access *)ADIOI_Malloc(nprocs * sizeof(ADIOI_Access)); + others_req = *others_req_ptr; + + /* If process[i] has any requests in my file domain, + * initialize an ADIOI_Access structure that will describe each request + * from process[i]. The offsets, lengths, and buffer pointers still need + * to be obtained to complete the setting of this structure. + */ + AllocOtherReq(nprocs, count_others_req_per_proc, &recvBufForOffsets, &recvBufForLens, others_req, + count_others_req_procs_ptr); + + /* Now send the calculated offsets and lengths to respective processes */ + /* ********************** */ + /* Exchange the offsets */ + /* ********************** */ + /* Determine the lowest sendBufForOffsets/Lens */ + for (i = 0; i < nprocs; i++) { + if ((my_req[i].count) && ((MPIU_Upint)my_req[i].offsets <= (MPIU_Upint)sendBufForOffsets)) { + sendBufForOffsets = my_req[i].offsets; + } + + if ((my_req[i].count) && ((MPIU_Upint)my_req[i].lens <= (MPIU_Upint)sendBufForLens)) { + sendBufForLens = my_req[i].lens; + } + } + + /* If no send buffer was found in the loop above, make it NULL */ + CheckOffsetAndLen(&recvBufForOffsets, &recvBufForLens); + + /* Calculate the displacements from the sendBufForOffsets/Lens */ + ad_aggrs_CalcOffset(nprocs, scounts, sdispls, rcounts, rdispls, count_my_req_per_proc, + count_others_req_per_proc, my_req, others_req, sendBufForOffsets, recvBufForOffsets); + + /* Exchange the offsets */ + MPI_Alltoallv(sendBufForOffsets, scounts, sdispls, ADIO_OFFSET, recvBufForOffsets, rcounts, rdispls, ADIO_OFFSET, fd->comm); + + /* ********************** */ + /* Exchange the lengths */ + /* ********************** */ + ad_aggrs_CalcLen(nprocs, scounts, sdispls, rcounts, rdispls, count_my_req_per_proc, + count_others_req_per_proc, my_req, others_req, sendBufForLens, recvBufForLens); + + /* Exchange the lengths */ + MPI_Alltoallv(sendBufForLens, scounts, sdispls, ADIO_OFFSET, recvBufForLens, rcounts, rdispls, ADIO_OFFSET, fd->comm); + + /* Clean up */ + FreeAdioiFive(count_others_req_per_proc, scounts, sdispls, rcounts, rdispls); + +#ifdef AGGREGATION_PROFILE + MPE_Log_event(MPE_Log_ID_5027, 0, NULL); +#endif + TRACE_ERR("Leaving ADIOI_OCEANFS_Calc_others_req\n"); +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_aggrs.h b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_aggrs.h new file mode 100644 index 00000000000..5b829483736 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_aggrs.h @@ -0,0 +1,47 @@ +#ifndef AD_OCEANFS_AGGRS_H_ +#define AD_OCEANFS_AGGRS_H_ + +#include +#include "adio.h" +#include "ad_oceanfs_pub.h" + +#ifdef HAVE_OCEANFS_H +#include +#endif + +/* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. 
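+   Unlike the generic ADIOI_Calc_file_domains(), the domain boundaries
+   computed here fall on multiples of the OCEANFS block size (fd->blksize),
+   so no two aggregators end up sharing the same file system block.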
*/ +void ADIOI_OCEANFS_Calc_file_domains(ADIO_File fd, ADIO_Offset *st_offsets, ADIO_Offset *end_offsets, int nprocs, + int nprocs_for_coll, ADIO_Offset *min_st_offset_ptr, ADIO_Offset **fd_start_ptr, ADIO_Offset **fd_end_ptr, + ADIO_Offset *fd_size_ptr, void *fs_ptr); + +/* overriding ADIOI_Calc_aggregator() for the default implementation is specific for + static file domain partitioning */ +int ADIOI_OCEANFS_Calc_aggregator(ADIO_File fd, ADIO_Offset off, ADIO_Offset min_off, ADIO_Offset *len, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end); + +/* overriding ADIOI_Calc_my_req for the default implementation is specific for + static file domain partitioning */ +void ADIOI_OCEANFS_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset *fd_start, ADIO_Offset *fd_end, ADIO_Offset fd_size, int nprocs, + int *count_procs_ptr, int **count_per_proc_ptr, ADIOI_Access **my_req_ptr, OCEANFS_Int **buf_idx_ptr); +/* + * ADIOI_Calc_others_req + * + * param[in] count_my_req_procs Number of processes whose file domain my + * request touches. + * param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of + * contig. requests of this process in + * process i's file domain. + * param[in] my_req A structure defining my request + * param[in] nprocs Number of nodes in the block + * param[in] myrank Rank of this node + * param[out] count_others_req_proc_ptr Number of processes whose requests lie in + * my process's file domain (including my + * process itself) + * param[out] others_req_ptr Array of other process' requests that lie + * in my process's file domain + */ +void ADIOI_OCEANFS_Calc_others_req(ADIO_File fd, int count_my_req_procs, int *count_my_req_per_proc, ADIOI_Access *my_req, + int nprocs, int myrank, int *count_others_req_procs_ptr, ADIOI_Access **others_req_ptr); + +#endif /* AD_OCEANFS_AGGRS_H_ */ diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_close.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_close.c new file mode 100644 index 00000000000..58966b2b23d --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_close.c @@ -0,0 +1,44 @@ +#include "ad_oceanfs.h" +#include "ad_oceanfs_common.h" +#include "ad_oceanfs_pub.h" +#include "ad_oceanfs_group_tuning.h" +#include "mpi_fs_intf.h" + +void ADIOI_OCEANFS_Close(ADIO_File fd, int *error_code) +{ + static char myname[] = "ADIOI_OCEANFS_CLOSE"; + int ret; + int tmp_error_code; + ADIOI_OCEANFS_fs *ocean_fs = NULL; + + ret = mpi_fs_close(fd->fd_sys); + if (ret != 0) { + tmp_error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_UNKNOWN, + "Error in mpi_fs_close", 0); + } else { + tmp_error_code = MPI_SUCCESS; + } + + if (error_code) { + *error_code = tmp_error_code; + } + + ocean_fs = (ADIOI_OCEANFS_fs *)fd->fs_ptr; + if (ocean_fs) { + if (ocean_fs->oceanfs_filename) { + ADIOI_Free(ocean_fs->oceanfs_filename); + ocean_fs->oceanfs_filename = NULL; + } + if (ocean_fs->context) { + ad_oceanfs_group_report(fd, ocean_fs->context->group_id); + ADIOI_Free(ocean_fs->context); + ocean_fs->context = NULL; + } + ADIOI_Free(ocean_fs); + fd->fs_ptr = NULL; + } + + /* reset fds */ + fd->fd_direct = -1; + fd->fd_sys = -1; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_common.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_common.c new file mode 100644 index 00000000000..1822f36ac14 --- /dev/null +++ 
b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_common.c @@ -0,0 +1,63 @@ +#include +#include +#include "ad_oceanfs.h" +#include "ad_oceanfs_common.h" + +/* keyval hack to both tell us if we've already initialized im and also + * close it down when mpi exits */ +static int g_Initialized = MPI_KEYVAL_INVALID; + +void ADIOI_OCEANFS_Init(int rank, int *error_code) +{ + /* do nothing if we've already fired up the OCEANFS interface */ + if (g_Initialized != MPI_KEYVAL_INVALID) { + *error_code = MPI_SUCCESS; + return; + } + + *error_code = MPI_SUCCESS; + + /* just like romio does, we make a dummy attribute so we + * get cleaned up */ +} + +int ADIOI_OCEANFS_Set_lock(FDTYPE fd, int cmd, int type, ADIO_Offset offset, int whence, ADIO_Offset len) +{ + int err, sav_errno; + int err_count = 0; + struct flock lock; + static const int ten_thousand = 10000; + + if (len == 0) { + return MPI_SUCCESS; + } + +#ifdef NEEDS_INT_CAST_WITH_FLOCK + lock.l_type = type; + lock.l_start = (int)offset; + lock.l_whence = whence; + lock.l_len = (int)len; +#else + lock.l_type = type; + lock.l_whence = whence; + lock.l_start = offset; + lock.l_len = len; +#endif + + sav_errno = errno; /* save previous errno in case we recover from retryable errors */ + errno = 0; + do { + err = fcntl(fd, cmd, &lock); + } while (err && ((errno == EINTR) || ((errno == EINPROGRESS) && (++err_count < ten_thousand)))); + + if (err && (errno != EBADF)) { + /* FIXME: This should use the error message system, especially for MPICH */ + MPI_Abort(MPI_COMM_WORLD, 1); + } + + if (!err) { /* report fcntl failure errno's (EBADF), otherwise */ + errno = sav_errno; /* restore previous errno in case we recovered from retryable errors */ + } + + return (err == 0) ? MPI_SUCCESS : MPI_ERR_UNKNOWN; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_common.h b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_common.h new file mode 100644 index 00000000000..a533182dde9 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_common.h @@ -0,0 +1,17 @@ +#ifndef AD_OCEANFS_COMMON_H_INCLUDED +#define AD_OCEANFS_COMMON_H_INCLUDED +#include +#include "ad_oceanfs.h" + +typedef struct { + uint64_t group_id; +} MPI_CONTEXT_T; + +typedef struct { + char *oceanfs_filename; + MPI_CONTEXT_T *context; +} ADIOI_OCEANFS_fs; + +void ADIOI_OCEANFS_Init(int rank, int *error_code); + +#endif /* AD_OCEANFS_COMMON_H_INCLUDED */ diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_fcntl.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_fcntl.c new file mode 100644 index 00000000000..6d3e8c029be --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_fcntl.c @@ -0,0 +1,49 @@ +#include "ad_oceanfs.h" +#include "adio_extern.h" +#include "ad_oceanfs_common.h" +#include "ad_oceanfs_pub.h" +#include "mpi_fs_intf.h" + +void ADIOI_OCEANFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code) +{ + int ret; + static char myname[] = "ADIOI_OCEANFS_FCNTL"; + + switch (flag) { + case ADIO_FCNTL_GET_FSIZE: { + struct stat stbuf; + + stbuf.st_size = 0; + ret = mpi_fs_stat(fd->filename, &stbuf); + if (ret) { + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_FILE, + "Error in mpi_fs_stat", 0); + return; + } + + fcntl_struct->fsize = stbuf.st_size; + *error_code = MPI_SUCCESS; + break; + } + case ADIO_FCNTL_SET_DISKSPACE: + MPIO_CHECK_NOT_SEQUENTIAL_MODE(fd, myname, *error_code); + if (fd->split_coll_count || fd->async_count) { + *error_code = 
MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, + "**io", "**io %s", strerror(errno)); + return; + } + ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code); + break; + + case ADIO_FCNTL_SET_ATOMICITY: + fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1; + *error_code = MPI_SUCCESS; + break; + default: + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ARG, + "**flag", "**flag %d", flag); + break; + } +fn_exit: + return; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_file.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_file.c new file mode 100644 index 00000000000..58c3095e9d9 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_file.c @@ -0,0 +1,224 @@ +#include "ad_oceanfs_file.h" +#include + +#define FILE_VALUE_MAX_LEN 128 +#define ONE_DIR_LEN 256 + +static int file_create(char *file_name, int create, int row, int col, int col_len) +{ + if (create != FILE_CREATE_FORCE) { + FILE *fpRead = fopen(file_name, "rb"); + if (fpRead) { + fclose(fpRead); + return 0; + } else if (create == FILE_CREATE_NOT) { + return 0; + } + } + + FILE *fpCreate = fopen(file_name, "w+t"); + + if (fpCreate == NULL) { + return 0; + } + + int i, j, k; + for (i = 0; i < row; i++) { + for (j = 0; j < col; j++) { + fprintf(fpCreate, "0"); + for (k = 1; k < col_len; k++) { + fprintf(fpCreate, " "); + } + } + fprintf(fpCreate, "\n"); + } + + fclose(fpCreate); + return 1; +} + +static int is_start(char c) +{ + return c != '.' && c != '/'; +} + +static int save_str(char* dst, int dst_size, char* head, char* end) +{ + int src_len = end - head; + if (src_len >= dst_size) { + return -1; + } + + int i; + for (i = 0; i < src_len; i++) { + dst[i] = head[i]; + } + dst[src_len] = 0; + return 0; +} + +static int create_dir(char *file_name) +{ + static const int rwx = 0777; + char* pc_end = file_name; + int i; + for (i = strlen(file_name) - 1; i >= 0; i--) { + if (file_name[i] == '/') { + pc_end = file_name + i; + break; + } + } + + char pc_dir[ONE_DIR_LEN]; + if (save_str(pc_dir, sizeof(pc_dir), file_name, pc_end) < 0) { + return -1; + } + + if (access(pc_dir, F_OK) >= 0) { + return 0; + } + + char* pc_head = NULL; + for (pc_head = file_name; pc_head < pc_end; pc_head++) { + if (is_start(*pc_head)) { + break; + } + } + + char* pc = NULL; + for (pc = pc_head; pc <= pc_end; pc++) { + if (*pc == '/') { + if (save_str(pc_dir, sizeof(pc_dir), file_name, pc) < 0) { + return -1; + } + + if (access(pc_dir, F_OK) >= 0) { + continue; + } + + if (mkdir(pc_dir, rwx) < 0) { + return -1; + } + } + } + + return 0; +} + +TAdOceanfsFile *ad_oceanfs_file_init(char *file_name, int create, int row, int col, int col_len, char **row_head, + int row_head_size, char **col_head, int col_head_size) +{ + create_dir(file_name); + + int new = file_create(file_name, create, row + 1, col + 1, col_len); + + FILE *fp = fopen(file_name, "r+"); + if (fp == NULL) { + return NULL; + } + + TAdOceanfsFile *ret = (TAdOceanfsFile *)malloc(sizeof(TAdOceanfsFile)); + if (ret == NULL) { + return NULL; + } + + ret->fp = fp; + ret->row = row; + ret->col = col; + ret->col_len = col_len; + ret->row_len = ret->col_len * (ret->col + 1) + 1; + ret->new = new; + + int index; + int i; + for (i = 0; i < col; i++) { + if (i >= col_head_size) { + index = i % col_head_size; + } else { + index = i; + } + ad_oceanfs_file_set(ret, -1, i, col_head[index]); + } + for (i = 0; i < row; i++) { + if (i >= col_head_size && row_head_size != 0) { + 
index = i % row_head_size; + } else { + index = i; + } + ad_oceanfs_file_set(ret, i, -1, row_head[index]); + } + + return ret; +} + +int ad_oceanfs_file_set(TAdOceanfsFile *oceanfs_file, int row, int col, char *val) +{ + int local_r = row + 1; + int local_c = col + 1; + + if (oceanfs_file == NULL || local_r < 0 || local_r > oceanfs_file->row || local_c < 0 || local_c > oceanfs_file->col) { + return -1; + } + + int ret = fseek(oceanfs_file->fp, oceanfs_file->row_len * local_r + oceanfs_file->col_len * local_c, SEEK_SET); + if (ret < 0) { + return ret; + } + + fprintf(oceanfs_file->fp, "%s", val); + + return 0; +} + +int ad_oceanfs_file_set_double(TAdOceanfsFile *oceanfs_file, int row, int col, double val) +{ + if (oceanfs_file == NULL) { + return -1; + } + + char ps_format[FILE_VALUE_MAX_LEN]; + snprintf(ps_format, sizeof(ps_format), "%c-%d.6lf", '%', oceanfs_file->col_len); + char ps_val[FILE_VALUE_MAX_LEN]; + snprintf(ps_val, sizeof(ps_val), ps_format, val); + ad_oceanfs_file_set(oceanfs_file, row, col, ps_val); + return 0; +} + +int ad_oceanfs_file_set_llu(TAdOceanfsFile *oceanfs_file, int row, int col, unsigned long long val) +{ + if (oceanfs_file == NULL) { + return -1; + } + + char ps[FILE_VALUE_MAX_LEN]; + snprintf(ps, sizeof(ps), "%llu", val); + ad_oceanfs_file_set(oceanfs_file, row, col, ps); + return 0; +} + +int ad_oceanfs_file_set_int(TAdOceanfsFile *oceanfs_file, int row, int col, int val) +{ + if (oceanfs_file == NULL) { + return -1; + } + + char ps[FILE_VALUE_MAX_LEN]; + snprintf(ps, sizeof(ps), "%d", val); + ad_oceanfs_file_set(oceanfs_file, row, col, ps); + return 0; +} + +void ad_oceanfs_file_destroy(TAdOceanfsFile *oceanfs_file) +{ + if (oceanfs_file == NULL) { + return; + } + + if (oceanfs_file->fp) { + fclose(oceanfs_file->fp); + oceanfs_file->fp = NULL; + } + + free(oceanfs_file); + oceanfs_file = NULL; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_file.h b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_file.h new file mode 100644 index 00000000000..d9a3a6a9e27 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_file.h @@ -0,0 +1,29 @@ +#ifndef AD_OCEANFS_FILE_H_ +#define AD_OCEANFS_FILE_H_ + +#include "adio.h" + +typedef struct { + FILE *fp; + int row; + int col; + int col_len; + int row_len; + int new; +} TAdOceanfsFile; + +enum { + FILE_CREATE_NOT = 0, + FILE_CREATE_INTIME, + FILE_CREATE_FORCE +}; + +TAdOceanfsFile *ad_oceanfs_file_init(char *file_name, int create, int row, int col, int col_len, char **row_head, + int row_head_size, char **col_head, int col_head_size); +int ad_oceanfs_file_set(TAdOceanfsFile *oceanfs_file, int row, int col, char *val); +int ad_oceanfs_file_set_double(TAdOceanfsFile *oceanfs_file, int row, int col, double val); +int ad_oceanfs_file_set_llu(TAdOceanfsFile *oceanfs_file, int row, int col, unsigned long long val); +int ad_oceanfs_file_set_int(TAdOceanfsFile *oceanfs_file, int row, int col, int val); +void ad_oceanfs_file_destroy(TAdOceanfsFile *oceanfs_file); + +#endif /* AD_OCEANFS_FILE_H_ */ diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_group_tuning.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_group_tuning.c new file mode 100644 index 00000000000..27e1e31b94c --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_group_tuning.c @@ -0,0 +1,62 @@ +#include "ad_oceanfs_group_tuning.h" +#include +#include "ad_oceanfs_file.h" +#include "ad_env.h" +#include "mpi.h" + + +#define GROUP_TUNING_SIZE 4 + +static int g_row_cnt = 
GROUP_TUNING_SIZE; +static uint64_t g_group[GROUP_TUNING_SIZE] = {0, 0, 0, 0}; +static uint64_t g_cnt[GROUP_TUNING_SIZE] = {0, 0, 0, 0}; +static int g_index = 0; +static int g_col_cnt = 2; +static int g_col_len = 25; +static char* g_head_row[] = {"1", "2", "3", "4"}; +static int g_head_row_size = 4; +static char* g_head_col[] = {"GroupId", "Count"}; +static int g_head_col_size = 2; + +void ad_oceanfs_group_report(ADIO_File fd, uint64_t group_id) +{ + if (get_oceanfsmpio_timing() == 0 || group_id <= 0) { + return; + } + + static int g_dir_len = 128; + char pname[g_dir_len]; + snprintf(pname, sizeof(pname), "/mpi_state/group%d", getpid()); + + TAdOceanfsFile *oceanfs_file = ad_oceanfs_file_init(pname, FILE_CREATE_INTIME, g_row_cnt, g_col_cnt, g_col_len, + g_head_row, g_head_row_size, g_head_col, g_head_col_size); + + if (oceanfs_file == NULL) { + return; + } + + if (oceanfs_file->new) { + g_index = 0; + } + + int find = 0; + int j; + for (j = 0; j < g_index; j++) { + if (g_group[j] == group_id) { + g_cnt[j]++; + ad_oceanfs_file_set_llu(oceanfs_file, j, 1, g_cnt[j]); + find = 1; + break; + } + } + + if (find == 0 && g_index < g_row_cnt) { + g_group[g_index] = group_id; + g_cnt[g_index] = 1; + ad_oceanfs_file_set_llu(oceanfs_file, g_index, 0, g_group[g_index]); + ad_oceanfs_file_set_llu(oceanfs_file, g_index, 1, 1); + g_index++; + } + + ad_oceanfs_file_destroy(oceanfs_file); +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_group_tuning.h b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_group_tuning.h new file mode 100644 index 00000000000..34b631257a0 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_group_tuning.h @@ -0,0 +1,9 @@ +#ifndef AD_OCEANFS_GROUP_TUNING_H_ +#define AD_OCEANFS_GROUP_TUNING_H_ + +#include +#include "adio.h" + +void ad_oceanfs_group_report(ADIO_File fd, uint64_t group_id); + +#endif /* AD_OCEANFS_GROUP_TUNING_H_ */ diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_hints.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_hints.c new file mode 100644 index 00000000000..19a6b91ad29 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_hints.c @@ -0,0 +1,62 @@ +#include "ad_oceanfs.h" +#include "adio_extern.h" +#include "ad_oceanfs_pub.h" +#include "hint_fns.h" + +#ifdef HAVE_LIMITS_H +#include +#endif + +void ADIOI_OCEANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) +{ + char *value = NULL; + int flag; + static char myname[] = "ADIOI_OCEANFS_SETINFO"; + + value = (char *)ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + if (fd->info == MPI_INFO_NULL) { + /* This must be part of the open call. can set striping parameters + if necessary. 
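+           For example, an application can request the OceanFS view-I/O path
+           by passing the "view_io" hint at open time (illustrative user
+           code; info, fname and fh are placeholders):
+
+               MPI_Info info;
+               MPI_Info_create(&info);
+               MPI_Info_set(info, "view_io", "true");
+               MPI_File_open(MPI_COMM_WORLD, fname,
+                             MPI_MODE_CREATE | MPI_MODE_WRONLY, info, &fh);
+
+           which is then picked up from users_info below.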
*/ + MPI_Info_create(&(fd->info)); + + ADIOI_Info_set(fd->info, "direct_read", "false"); + ADIOI_Info_set(fd->info, "direct_write", "false"); + fd->direct_read = fd->direct_write = 0; + /* TODO initialize oceanfs hints */ + ADIOI_Info_set(fd->info, "view_io", "false"); + fd->hints->fs_hints.oceanfs.view_io = ADIOI_HINT_DISABLE; + + if (users_info != MPI_INFO_NULL) { + /* TODO striping information */ + + /* view io */ + ADIOI_Info_get(users_info, "view_io", MPI_MAX_INFO_VAL, value, &flag); + if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) { + ADIOI_Info_set(fd->info, "view_io", "true"); + fd->hints->fs_hints.oceanfs.view_io = ADIOI_HINT_ENABLE; + } + } + } + + /* get other hint */ + if (users_info != MPI_INFO_NULL) { + /* TO DO */ + } + + /* set internal variables for tuning environment variables */ + if (!fd->hints->initialized) { + ad_oceanfs_get_env_vars(); + } + + /* set the values for collective I/O and data sieving parameters */ + ADIOI_GEN_SetInfo(fd, users_info, error_code); + + /* generic hints might step on striping_unit */ + if (users_info != MPI_INFO_NULL) { + ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit", NULL, myname, error_code); + } + + ADIOI_Free(value); + + *error_code = MPI_SUCCESS; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_io.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_io.c new file mode 100644 index 00000000000..bf8392428e3 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_io.c @@ -0,0 +1,111 @@ +#include +#include "adio.h" +#include "adio_extern.h" +#include "ad_oceanfs.h" +#include "ad_oceanfs_pub.h" +#include "mpi_fs_intf.h" + +static int g_io_size_max = 1048576; // 1024 * 1024 + +static int OCEANFS_IO_Pread(ADIO_File fd, char *p_buf, ADIO_Offset wr_len, off_t offset) +{ + double t = ad_oceanfs_timing_get_time(); + int ret = mpi_fs_pread(fd->fd_sys, p_buf, wr_len, offset); + ad_oceanfs_timing_report(fd, OCEANFSMPIO_CIO_R_OCEANFS, t); + return ret; +} + +static int OCEANFS_IO_Pwrite(ADIO_File fd, char *p_buf, ADIO_Offset wr_len, off_t offset) +{ + double t = ad_oceanfs_timing_get_time(); + int ret = mpi_fs_pwrite(fd->fd_sys, p_buf, wr_len, offset); + ad_oceanfs_timing_report(fd, OCEANFSMPIO_CIO_W_OCEANFS, t); + return ret; +} + +static int OCEANFS_IO(int io_flag, ADIO_File fd, char *p_buf, ADIO_Offset wr_len, off_t offset) +{ + int ret = -1; + switch (io_flag) { + case OCEANFS_READ: { + ret = OCEANFS_IO_Pread(fd, p_buf, wr_len, offset); + break; + } + case OCEANFS_WRITE: { + ret = OCEANFS_IO_Pwrite(fd, p_buf, wr_len, offset); + break; + } + default: { + break; + } + } + return ret; +} + +static void OCEANFS_IOContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int io_flag, int *error_code) +{ + int ret = 0; + MPI_Count datatype_size; + ADIO_Offset mem_len, wr_len; + uint64_t file_offset = offset; + static char myname[] = "ADIOI_OCEANFS_IOCONTIG"; + char *p_buf = NULL; + ADIO_Offset bytes_xfered = 0; + + MPI_Type_size_x(datatype, &datatype_size); + mem_len = (ADIO_Offset)datatype_size * (ADIO_Offset)count; + + if (file_ptr_type == ADIO_INDIVIDUAL) { + file_offset = fd->fp_ind; + } + + p_buf = (char *)buf; + while (bytes_xfered < mem_len) { + wr_len = ADIOI_MIN(g_io_size_max, mem_len - bytes_xfered); + ret = OCEANFS_IO(io_flag, fd, p_buf, wr_len, file_offset + bytes_xfered); + if (!ret) { + break; + } + if (ret == -1) { + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, 
myname, __LINE__, MPI_ERR_IO, "**io", + "**io %s", strerror(errno)); + fd->fp_sys_posn = -1; + return; + } + bytes_xfered += ret; + p_buf += ret; + } + /* Let the application decide how to fail */ + if (ret < 0) { + *error_code = MPI_SUCCESS; + return; + } + + if (file_ptr_type == ADIO_INDIVIDUAL) { + fd->fp_ind += bytes_xfered; + } + fd->fp_sys_posn = file_offset + bytes_xfered; + +#ifdef HAVE_STATUS_SET_BYTES + MPIR_Status_set_bytes(status, datatype, bytes_xfered); +#endif + + *error_code = MPI_SUCCESS; +} + +void ADIOI_OCEANFS_ReadContig(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code) +{ + double t = ad_oceanfs_timing_get_time(); + OCEANFS_IOContig(fd, buf, count, datatype, file_ptr_type, offset, status, OCEANFS_READ, error_code); + ad_oceanfs_timing_report(fd, OCEANFSMPIO_CIO_R_CONTIG, t); +} + +void ADIOI_OCEANFS_WriteContig(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code) +{ + double t = ad_oceanfs_timing_get_time(); + OCEANFS_IOContig(fd, (void *)buf, count, datatype, file_ptr_type, offset, status, OCEANFS_WRITE, error_code); + ad_oceanfs_timing_report(fd, OCEANFSMPIO_CIO_W_CONTIG, t); +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_open.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_open.c new file mode 100644 index 00000000000..8d3d68c1e6f --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_open.c @@ -0,0 +1,164 @@ +#include +#include "ad_env.h" +#include "ad_oceanfs.h" +#include "ad_oceanfs_common.h" +#include "ad_oceanfs_pub.h" +#include "ad_oceanfs_tuning.h" +#include "ad_oceanfs_group_tuning.h" + +int ad_open_GetMode(ADIO_File fd) +{ + int amode = 0; + /* setup the file access mode */ + if (fd->access_mode & ADIO_CREATE) { + amode = amode | O_CREAT; + } + if (fd->access_mode & ADIO_RDONLY) { + amode = amode | O_RDONLY; + } + if (fd->access_mode & ADIO_WRONLY) { + amode = amode | O_WRONLY; + } + if (fd->access_mode & ADIO_RDWR) { + amode = amode | O_RDWR; + } + if (fd->access_mode & ADIO_EXCL) { + amode = amode | O_EXCL; + } + if (fd->access_mode & ADIO_APPEND) { + amode = amode | O_APPEND; + } + /* TO DO */ + return amode; +} + +static uint64_t SyncGroupId(ADIO_File fd, int rank) +{ + int ret; + uint64_t group_id = 0; + if (get_group_lock_enable()) { + if (rank == 0) { + ret = mpi_fs_get_group_id(fd->fd_sys, &group_id); + if (ret < 0) { + group_id = 0; + } + + MPI_Bcast(&group_id, 1, MPI_UNSIGNED_LONG_LONG, 0, fd->comm); + } else { + MPI_Bcast(&group_id, 1, MPI_UNSIGNED_LONG_LONG, 0, fd->comm); + ret = mpi_fs_set_group_id(fd->fd_sys, group_id); + } + } + return group_id; +} + +static int SetupFilePerm(ADIO_File fd) +{ + static const int umask_param = 022; + static const int mask_param = 0666; + mode_t old_mask; + int perm; + if (fd->perm == ADIO_PERM_NULL) { + old_mask = umask(umask_param); + umask(old_mask); + perm = old_mask ^ mask_param; + } else { + perm = fd->perm; + } + return perm; +} + +void ad_open_OpenCheck(ADIO_File fd, int *error_code) +{ + static char myname[] = "ADIOI_OCEANFS_OPEN"; + if ((fd->fd_sys != -1) && ((uint32_t)fd->access_mode & ADIO_APPEND)) { + int ret = mpi_fs_lseek(fd->fd_sys, 0, SEEK_END); + if (ret == -1) { + *error_code = ADIOI_Err_create_code(myname, fd->filename, errno); + ADIOI_OCEANFS_fs *oceanfs_fs = (ADIOI_OCEANFS_fs *)fd->fs_ptr; + ADIOI_Free(oceanfs_fs->context); + ADIOI_Free(oceanfs_fs); + + fd->fs_ptr 
= NULL; + return; + } + fd->fp_ind = ret; + fd->fp_sys_posn = ret; + } + + *error_code = MPI_SUCCESS; +} + +static void AllocFS(ADIO_File fd, int *error_code) +{ + static char myname[] = "ADIOI_OCEANFS_OPEN"; + ADIOI_OCEANFS_fs *oceanfs_fs = (ADIOI_OCEANFS_fs *)ADIOI_Malloc(sizeof(ADIOI_OCEANFS_fs)); + if (oceanfs_fs == NULL) { + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_UNKNOWN, + "Error allocating memory", 0); + return; + } + oceanfs_fs->oceanfs_filename = NULL; + + oceanfs_fs->context = (MPI_CONTEXT_T *)ADIOI_Malloc(sizeof(MPI_CONTEXT_T)); + if (oceanfs_fs->context == NULL) { + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_UNKNOWN, + "Error allocating memory", 0); + ADIOI_Free(oceanfs_fs); + return; + } + memset(oceanfs_fs->context, 0, sizeof(MPI_CONTEXT_T)); + + fd->fs_ptr = oceanfs_fs; + *error_code = MPI_SUCCESS; +} + +void ADIOI_OCEANFS_Open(ADIO_File fd, int *error_code) +{ + static char myname[] = "ADIOI_OCEANFS_OPEN"; + int perm, amode, ret, rank; + uint64_t group_id; + + /* validate input args */ + if (!fd) { + *error_code = MPI_ERR_FILE; + return; + } + + /* set internal variables for tuning environment variables */ + ad_oceanfs_get_env_vars(); + + /* setup file permissions */ + perm = SetupFilePerm(fd); + + amode = ad_open_GetMode(fd); + /* init OCEANFS */ + fd->fs_ptr = NULL; + MPI_Comm_rank(fd->comm, &rank); + ADIOI_OCEANFS_Init(rank, error_code); + if (*error_code != MPI_SUCCESS) { + return; + } + + AllocFS(fd, error_code); + if (*error_code != MPI_SUCCESS) { + return; + } + + /* all processes open the file */ + ret = mpi_fs_open(fd->filename, amode, perm); + if (ret < 0) { + *error_code = ADIOI_Err_create_code(myname, fd->filename, errno); + return; + } + + fd->fd_sys = ret; + fd->fd_direct = -1; + + group_id = SyncGroupId(fd, rank); + ((ADIOI_OCEANFS_fs *)(fd->fs_ptr))->context->group_id = group_id; + ad_oceanfs_group_report(fd, group_id); + + ad_open_OpenCheck(fd, error_code); + return; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_pub.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_pub.c new file mode 100644 index 00000000000..49869aacce1 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_pub.c @@ -0,0 +1,202 @@ +#include "ad_oceanfs_pub.h" + +static int g_gOffsetAggmethod1Size = 3; +static int g_gOffsetAggmethod0Size = 2; + +static int g_gIndexEndOffset = 1; +static int g_gIndexCountSize = 2; + +void AdPubOffsetAggmethod1(MPI_Comm comm, int nprocs, int myrank, ADIO_Offset startOffset, ADIO_Offset endOffset, + ADIO_Offset myCountSize, ADIO_Offset *dstStartOffset, ADIO_Offset *dstEndOffset, ADIO_Offset *dstCountSizes) +{ + ADIO_Offset *oceanfs_offsets0 = (ADIO_Offset *)ADIOI_Malloc(g_gOffsetAggmethod1Size * nprocs * sizeof(ADIO_Offset)); + ADIO_Offset *oceanfs_offsets = (ADIO_Offset *)ADIOI_Malloc(g_gOffsetAggmethod1Size * nprocs * sizeof(ADIO_Offset)); + int i; + for (i = 0; i < g_gOffsetAggmethod1Size * nprocs; i++) { + oceanfs_offsets0[i] = 0; + } + oceanfs_offsets0[myrank * g_gOffsetAggmethod1Size] = startOffset; + oceanfs_offsets0[myrank * g_gOffsetAggmethod1Size + g_gIndexEndOffset] = endOffset; + oceanfs_offsets0[myrank * g_gOffsetAggmethod1Size + g_gIndexCountSize] = myCountSize; + MPI_Allreduce(oceanfs_offsets0, oceanfs_offsets, nprocs * g_gOffsetAggmethod1Size, ADIO_OFFSET, MPI_MAX, comm); + for (i = 0; i < nprocs; i++) { + dstStartOffset[i] = oceanfs_offsets[i * g_gOffsetAggmethod1Size]; + 
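+        /* slots 1 and 2 of rank i's triple hold its end offset and its
+         * element count (g_gIndexEndOffset / g_gIndexCountSize) */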
dstEndOffset[i] = oceanfs_offsets[i * g_gOffsetAggmethod1Size + g_gIndexEndOffset]; + dstCountSizes[i] = oceanfs_offsets[i * g_gOffsetAggmethod1Size + g_gIndexCountSize]; + } + ADIOI_Free(oceanfs_offsets0); + ADIOI_Free(oceanfs_offsets); +} + +void AdPubOffsetAggmethod0(MPI_Comm comm, int nprocs, int myrank, ADIO_Offset startOffset, ADIO_Offset endOffset, + ADIO_Offset *dstStartOffset, ADIO_Offset *dstEndOffset) +{ + ADIO_Offset *oceanfs_offsets0 = (ADIO_Offset *)ADIOI_Malloc(g_gOffsetAggmethod0Size * nprocs * sizeof(ADIO_Offset)); + ADIO_Offset *oceanfs_offsets = (ADIO_Offset *)ADIOI_Malloc(g_gOffsetAggmethod0Size * nprocs * sizeof(ADIO_Offset)); + int i; + for (i = 0; i < g_gOffsetAggmethod0Size * nprocs; i++) { + oceanfs_offsets0[i] = 0; + } + oceanfs_offsets0[myrank * g_gOffsetAggmethod0Size] = startOffset; + oceanfs_offsets0[myrank * g_gOffsetAggmethod0Size + g_gIndexEndOffset] = endOffset; + MPI_Allreduce(oceanfs_offsets0, oceanfs_offsets, nprocs * g_gOffsetAggmethod0Size, ADIO_OFFSET, MPI_MAX, comm); + for (i = 0; i < nprocs; i++) { + dstStartOffset[i] = oceanfs_offsets[i * g_gOffsetAggmethod0Size]; + dstEndOffset[i] = oceanfs_offsets[i * g_gOffsetAggmethod0Size + g_gIndexEndOffset]; + } + ADIOI_Free(oceanfs_offsets0); + ADIOI_Free(oceanfs_offsets); +} + +void CheckOffsetAndLen(void** recvBufForOffsets, void** recvBufForLens) +{ + /* If no recv buffer was allocated in the loop above, make it NULL */ + if (*recvBufForOffsets == (void *)0xFFFFFFFFFFFFFFFF) { + *recvBufForOffsets = NULL; + } + if (*recvBufForLens == (void *)0xFFFFFFFFFFFFFFFF) { + *recvBufForLens = NULL; + } +} + +void AllocAccess(ADIOI_Access *my_req, int nprocs, int *my_req_per_proc, int* count_procs_ptr) +{ + int i; + *count_procs_ptr = 0; + for (i = 0; i < nprocs; i++) { + if (my_req_per_proc[i]) { + my_req[i].offsets = (ADIO_Offset *)ADIOI_Malloc(my_req_per_proc[i] * sizeof(ADIO_Offset)); + my_req[i].lens = (ADIO_Offset *)ADIOI_Malloc(my_req_per_proc[i] * sizeof(ADIO_Offset)); + (*count_procs_ptr)++; + } + my_req[i].count = 0; /* will be incremented where needed later */ + } +} + +void AllocOtherReq(int nprocs, int *others_req_per_proc, void** recvBufForOffsets, + void** recvBufForLens, ADIOI_Access *others_req, int* others_req_procs_ptr) +{ + int i; + *others_req_procs_ptr = 0; + + for (i = 0; i < nprocs; i++) { + if (others_req_per_proc[i]) { + others_req[i].count = others_req_per_proc[i]; + + others_req[i].offsets = (ADIO_Offset *)ADIOI_Malloc(others_req_per_proc[i] * sizeof(ADIO_Offset)); + others_req[i].lens = (ADIO_Offset *)ADIOI_Malloc(others_req_per_proc[i] * sizeof(ADIO_Offset)); + + if ((MPIU_Upint)others_req[i].offsets < (MPIU_Upint)*recvBufForOffsets) { + *recvBufForOffsets = others_req[i].offsets; + } + if ((MPIU_Upint)others_req[i].lens < (MPIU_Upint)*recvBufForLens) { + *recvBufForLens = others_req[i].lens; + } + + others_req[i].mem_ptrs = (MPI_Aint *)ADIOI_Malloc(others_req_per_proc[i] * sizeof(MPI_Aint)); + + (*others_req_procs_ptr)++; + } else { + others_req[i].count = 0; + others_req[i].offsets = NULL; + others_req[i].lens = NULL; + } + } + + CheckOffsetAndLen(recvBufForOffsets, recvBufForLens); +} + +void FreeAccess(ADIOI_Access *acc, int nprocs) +{ + int i; + for (i = 0; i < nprocs; i++) { + if (acc[i].count) { + ADIOI_Free(acc[i].offsets); + ADIOI_Free(acc[i].lens); + } + } + ADIOI_Free(acc); +} + +void FreeAccessAll(ADIOI_Access *acc, int nprocs) +{ + int i; + for (i = 0; i < nprocs; i++) { + if (acc[i].count) { + ADIOI_Free(acc[i].offsets); + ADIOI_Free(acc[i].lens); + 
ADIOI_Free(acc[i].mem_ptrs); + } + } + ADIOI_Free(acc); +} + +int CalcCount(int* array, int nprocs) +{ + int i; + int cnt = 0; + for (i = 0; i < nprocs; i++) { + if (array[i]) { + cnt++; + } + } + + return cnt; +} + +void CalcLoc(ADIOI_Access *others_req, int nprocs, ADIO_Offset* st_loc, ADIO_Offset* end_loc) +{ + int i, j; + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + *st_loc = others_req[i].offsets[0]; + *end_loc = others_req[i].offsets[0]; + break; + } + } + + for (i = 0; i < nprocs; i++) { + for (j = 0; j < others_req[i].count; j++) { + *st_loc = ADIOI_MIN(*st_loc, others_req[i].offsets[j]); + *end_loc = ADIOI_MAX(*end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1)); + } + } +} + +void SetNtimes(ADIOI_Iread_and_exch_vars *vars, ADIO_Offset st_loc, ADIO_Offset end_loc, int coll_bufsize) +{ + /* calculate ntimes, the number of times this process must perform I/O + * operations in order to complete all the requests it has received. + * the need for multiple I/O operations comes from the restriction that + * we only use coll_bufsize bytes of memory for internal buffering. + */ + if ((st_loc == -1 && end_loc == -1) || coll_bufsize == 0) { + /* this process does no I/O. */ + vars->ntimes = 0; + } else { + /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize) */ + vars->ntimes = (int)((end_loc - st_loc + coll_bufsize) / coll_bufsize); + } +} + +void SetNtimesLocal(int *ntimes, ADIO_Offset st_loc, ADIO_Offset end_loc, int coll_bufsize) +{ + if ((st_loc == -1 && end_loc == -1) || coll_bufsize == 0) { + /* this process does no I/O. */ + *ntimes = 0; + } else { + /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize) */ + *ntimes = (int)((end_loc - st_loc + coll_bufsize) / coll_bufsize); + } +} + +ADIOI_Flatlist_node* OCEANFS_Flatten_and_find(MPI_Datatype type) +{ + ADIOI_Flatlist_node* flat_file = ADIOI_Flatlist; + while (flat_file && flat_file->type != type) { + flat_file = flat_file->next; + } + + return flat_file; +} + diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_pub.h b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_pub.h new file mode 100644 index 00000000000..52eb167285b --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_pub.h @@ -0,0 +1,249 @@ +#ifndef AD_OCEANFS_PUB_H +#define AD_OCEANFS_PUB_H + +#include "adio.h" +#include "adio_extern.h" + +#define OCEANFS_Int int +#define OCEANFS_Delete_flattened(type) ADIOI_Delete_flattened(type) +ADIOI_Flatlist_node* OCEANFS_Flatten_and_find(MPI_Datatype type); +#define OCEANFS_UNREFERENCED_ARG(arg) ADIOI_UNREFERENCED_ARG(arg) + +#ifdef AGGREGATION_PROFILE +#define OCEANFS_Log_event(id) MPE_Log_event((id), 0, NULL) +#else +#define OCEANFS_Log_event(id) +#endif + +#define MUL_2 (2) +#define MUL_100 (100) + +#define FreeAdioiTwo(a, b) \ + do { \ + ADIOI_Free(a); \ + ADIOI_Free(b); \ + } while (0) + +#define FreeAdioiThree(a, b, c) \ + do { \ + ADIOI_Free(a); \ + ADIOI_Free(b); \ + ADIOI_Free(c); \ + } while (0) + +#define FreeAdioiFour(a, b, c, d) \ + do { \ + ADIOI_Free(a); \ + ADIOI_Free(b); \ + ADIOI_Free(c); \ + ADIOI_Free(d); \ + } while (0) + +#define FreeAdioiFive(a, b, c, d, e) \ + do { \ + ADIOI_Free(a); \ + ADIOI_Free(b); \ + ADIOI_Free(c); \ + ADIOI_Free(d); \ + ADIOI_Free(e); \ + } while (0) + +#define AD_COLL_BUF_INCR \ +{ \ + while (buf_incr) { \ + size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + if (!flat_buf_sz) { \ + if (flat_buf_idx < (flat_buf->count - 1)) \ + flat_buf_idx++; \ 
+ else { \ + flat_buf_idx = 0; \ + n_buftypes++; \ + } \ + user_buf_idx = \ + flat_buf->indices[flat_buf_idx] + (ADIO_Offset)n_buftypes * (ADIO_Offset)buftype_extent; \ + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ + } \ + buf_incr -= size_in_buf; \ + } \ +} + +enum MPE_Log_event_ID { + MPE_Log_ID_7 = 7, + MPE_Log_ID_8 = 8, + MPE_Log_ID_13 = 13, + MPE_Log_ID_14 = 14, + MPE_Log_ID_5004 = 5004, + MPE_Log_ID_5005 = 5005, + MPE_Log_ID_5012 = 5012, + MPE_Log_ID_5013 = 5013, + MPE_Log_ID_5024 = 5024, + MPE_Log_ID_5025 = 5025, + MPE_Log_ID_5026 = 5026, + MPE_Log_ID_5027 = 5027, + MPE_Log_ID_5032 = 5032, + MPE_Log_ID_5033 = 5033 +}; + +enum GET_Oceanfsmpio_Read_Aggmethod_VALUE { + OCEANFS_1 = 1, + OCEANFS_2 = 2 +}; + +/* ADIOI_GEN_IreadStridedColl */ +struct ADIOI_GEN_IreadStridedColl_vars { + /* requests */ + MPI_Request req_offset[2]; /* ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL */ + MPI_Request req_ind_io; /* ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL_INDIO */ + + /* parameters */ + ADIO_File fd; + void *buf; + int count; + MPI_Datatype datatype; + int file_ptr_type; + ADIO_Offset offset; + + /* stack variables */ + ADIOI_Access *my_req; + /* array of nprocs structures, one for each other process in + whose file domain this process's request lies */ + + ADIOI_Access *others_req; + /* array of nprocs structures, one for each other process + whose request lies in this process's file domain. */ + + int nprocs; + int nprocs_for_coll; + int myrank; + int contig_access_count; + int interleave_count; + int buftype_is_contig; + int *count_my_req_per_proc; + int count_my_req_procs; + int count_others_req_procs; + ADIO_Offset start_offset; + ADIO_Offset end_offset; + ADIO_Offset orig_fp; + ADIO_Offset fd_size; + ADIO_Offset min_st_offset; + ADIO_Offset *offset_list; + ADIO_Offset *st_offsets; + ADIO_Offset *fd_start; + ADIO_Offset *fd_end; + ADIO_Offset *end_offsets; + ADIO_Offset *len_list; + int *buf_idx; +}; + +/* ADIOI_Iread_and_exch */ +struct ADIOI_Iread_and_exch_vars { + /* requests */ + MPI_Request req1; /* ADIOI_IRC_STATE_IREAD_AND_EXCH */ + MPI_Request req2; /* ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN */ + + /* parameters */ + ADIO_File fd; + void *buf; + MPI_Datatype datatype; + int nprocs; + int myrank; + ADIOI_Access *others_req; + ADIO_Offset *offset_list; + ADIO_Offset *len_list; + int contig_access_count; + ADIO_Offset min_st_offset; + ADIO_Offset fd_size; + ADIO_Offset *fd_start; + ADIO_Offset *fd_end; + int *buf_idx; + + /* stack variables */ + int m; + int ntimes; + int max_ntimes; + int buftype_is_contig; + ADIO_Offset st_loc; + ADIO_Offset end_loc; + ADIO_Offset off; + ADIO_Offset done; + char *read_buf; + int *curr_offlen_ptr; + int *count; + int *send_size; + int *recv_size; + int *partial_send; + int *recd_from_proc; + int *start_pos; + /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */ + ADIO_Offset size; + ADIO_Offset real_size; + ADIO_Offset for_curr_iter; + ADIO_Offset for_next_iter; + ADIOI_Flatlist_node *flat_buf; + MPI_Aint buftype_extent; + int coll_bufsize; + + /* next function to be called */ + void (*next_fn)(ADIOI_NBC_Request *, int *); +}; + +/* ADIOI_R_Iexchange_data */ +struct ADIOI_R_Iexchange_data_vars { + /* requests */ + MPI_Request req1; /* ADIOI_IRC_STATE_R_IEXCHANGE_DATA */ + MPI_Request *req2; /* ADIOI_IRC_STATE_R_IEXCHANGE_DATA_RECV & FILL */ + + /* parameters */ + ADIO_File fd; + void *buf; + ADIOI_Flatlist_node *flat_buf; + ADIO_Offset *offset_list; + ADIO_Offset *len_list; + int *send_size; + int *recv_size; + int *count; + int 
*start_pos; + int *partial_send; + int *recd_from_proc; + int nprocs; + int myrank; + int buftype_is_contig; + int contig_access_count; + ADIO_Offset min_st_offset; + ADIO_Offset fd_size; + ADIO_Offset *fd_start; + ADIO_Offset *fd_end; + ADIOI_Access *others_req; + int iter; + MPI_Aint buftype_extent; + int *buf_idx; + + /* stack variables */ + int nprocs_recv; + int nprocs_send; + char **recv_buf; + + /* next function to be called */ + void (*next_fn)(ADIOI_NBC_Request *, int *); +}; + +void AdPubOffsetAggmethod1(MPI_Comm comm, int nprocs, int myrank, ADIO_Offset startOffset, ADIO_Offset endOffset, + ADIO_Offset myCountSize, ADIO_Offset *dstStartOffset, ADIO_Offset *dstEndOffset, ADIO_Offset *dstCountSizes); +void AdPubOffsetAggmethod0(MPI_Comm comm, int nprocs, int myrank, ADIO_Offset startOffset, ADIO_Offset endOffset, + ADIO_Offset *dstStartOffset, ADIO_Offset *dstEndOffset); + +void CheckOffsetAndLen(void** recvBufForOffsets, void** recvBufForLens); +void AllocAccess(ADIOI_Access *my_req, int nprocs, int *my_req_per_proc, int* count_procs_ptr); +void AllocOtherReq(int nprocs, int *others_req_per_proc, void** recvBufForOffsets, + void** recvBufForLens, ADIOI_Access *others_req, int* others_req_procs_ptr); +void FreeAccess(ADIOI_Access *acc, int nprocs); +void FreeAccessAll(ADIOI_Access *acc, int nprocs); + +int CalcCount(int* array, int nprocs); +void CalcLoc(ADIOI_Access *others_req, int nprocs, ADIO_Offset* st_loc, ADIO_Offset* end_loc); +void SetNtimes(ADIOI_Iread_and_exch_vars *vars, ADIO_Offset st_loc, ADIO_Offset end_loc, int coll_bufsize); +void SetNtimesLocal(int *ntimes, ADIO_Offset st_loc, ADIO_Offset end_loc, int coll_bufsize); + +#endif /* AD_OCEANFS_PUB_H */ diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_rdcoll.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_rdcoll.c new file mode 100644 index 00000000000..afff7ce455c --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_rdcoll.c @@ -0,0 +1,1015 @@ +#include +#include +#include +#include "adio.h" +#include "adio_extern.h" +#include "ad_oceanfs.h" +#include "ad_oceanfs_aggrs.h" +#include "ad_oceanfs_common.h" +#include "ad_oceanfs_pub.h" +#include "ad_oceanfs_group_tuning.h" + +#ifdef PROFILE +#include "mpe.h" +#endif + +#ifdef AGGREGATION_PROFILE +#include "mpe.h" +#endif + +#ifdef USE_DBG_LOGGING +#define RDCOLL_DEBUG 1 +#endif + +/* prototypes of functions used for collective reads only. 
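+   ADIOI_Read_and_exch drives the two-phase read in passes of at most coll_bufsize
+   bytes; ADIOI_R_Exchange_data and ADIOI_R_Exchange_data_alltoallv move the same
+   data either with point-to-point isend/irecv or with a single MPI_Alltoallv, and
+   ADIOI_Fill_user_buffer unpacks received bytes into a noncontiguous user buffer.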
*/ +static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype datatype, int nprocs, int myrank, + ADIOI_Access *others_req, ADIO_Offset *offset_list, ADIO_Offset *len_list, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, OCEANFS_Int *buf_idx, + int *error_code); +static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node *flat_buf, ADIO_Offset *offset_list, + ADIO_Offset *len_list, int *send_size, int *recv_size, int *count, int *start_pos, int *partial_send, + int *recd_from_proc, int nprocs, int myrank, int buftype_is_contig, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, + ADIOI_Access *others_req, int iter, MPI_Aint buftype_extent, OCEANFS_Int *buf_idx); +static void ADIOI_R_Exchange_data_alltoallv(ADIO_File fd, void *buf, ADIOI_Flatlist_node *flat_buf, + ADIO_Offset *offset_list, ADIO_Offset *len_list, int *send_size, int *recv_size, int *count, int *start_pos, + int *partial_send, int *recd_from_proc, int nprocs, int myrank, int buftype_is_contig, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, + ADIOI_Access *others_req, int iter, MPI_Aint buftype_extent, OCEANFS_Int *buf_idx); +static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node *flat_buf, char **recv_buf, + ADIO_Offset *offset_list, ADIO_Offset *len_list, unsigned *recv_size, MPI_Request *requests, MPI_Status *statuses, + int *recd_from_proc, int nprocs, int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, MPI_Aint buftype_extent); + +static void ADIOI_OCEANFS_oldReadStridedColl(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code) +{ + /* Uses a generalized version of the extended two-phase method described + in "An Extended Two-Phase Method for Accessing Sections of + Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, + Scientific Programming, (5)4:301--317, Winter 1996. */ + + ADIOI_Access *my_req = NULL; + /* array of nprocs structures, one for each other process in + whose file domain this process's request lies */ + + ADIOI_Access *others_req = NULL; + /* array of nprocs structures, one for each other process + whose request lies in this process's file domain. */ + + int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank; + int contig_access_count = 0; + int interleave_count = 0; + int buftype_is_contig; + int *count_my_req_per_proc = NULL; + OCEANFS_Int *buf_idx = NULL; + int count_my_req_procs, count_others_req_procs; + ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off; + ADIO_Offset *offset_list = NULL; + ADIO_Offset *st_offsets = NULL; + ADIO_Offset *fd_start = NULL; + ADIO_Offset *fd_end = NULL; + ADIO_Offset *end_offsets = NULL; + ADIO_Offset *count_sizes = NULL; + ADIO_Offset *len_list = NULL; + +#ifdef HAVE_STATUS_SET_BYTES + MPI_Count bufsize, size; +#endif + + /* From common code - implemented for oceanfs? 
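+       When the persistent-file-realm hint (cb_pfr) is not disabled, the request
+       is simply handed to the generic ADIOI_IOStridedColl path, followed by a
+       barrier and ADIO_Flush to match the OceanFS group-lock behaviour used
+       elsewhere in this file.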
*/ + if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) { + ADIOI_IOStridedColl(fd, buf, count, ADIOI_READ, datatype, file_ptr_type, offset, status, error_code); + + /* group lock, flush data */ + MPI_Barrier(fd->comm); + ADIO_Flush(fd, error_code); + return; + } + +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_13, 0, "start computation"); +#endif + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + /* number of aggregators, cb_nodes, is stored in the hints */ + nprocs_for_coll = fd->hints->cb_nodes; + orig_fp = fd->fp_ind; + + /* only check for interleaving if cb_read isn't disabled */ + if (fd->hints->cb_read != ADIOI_HINT_DISABLE) { + /* For this process's request, calculate the list of offsets and + lengths in the file and determine the start and end offsets. */ + + /* Note: end_offset points to the last byte-offset that will be accessed. + e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ + + ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, &offset_list, &len_list, &start_offset, + &end_offset, &contig_access_count); + + /* each process communicates its start and end offsets to other + processes. The result is an array each of start and end offsets + stored in order of process rank. */ + st_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); + end_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); + + ADIO_Offset my_count_size = 0; + /* One-sided aggregation needs the amount of data per rank as well because the difference in + * starting and ending offsets for 1 byte is 0 the same as 0 bytes so it cannot be distiguished. + */ + if ((get_oceanfsmpio_read_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_read_aggmethod() == OCEANFS_2)) { + count_sizes = (ADIO_Offset *)ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); + MPI_Count buftype_size; + MPI_Type_size_x(datatype, &buftype_size); + my_count_size = (ADIO_Offset)count * (ADIO_Offset)buftype_size; + } + if (get_oceanfsmpio_tunegather()) { + if ((get_oceanfsmpio_read_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_read_aggmethod() == OCEANFS_2)) { + AdPubOffsetAggmethod1(fd->comm, nprocs, myrank, start_offset, end_offset, my_count_size, st_offsets, end_offsets, count_sizes); + } else { + AdPubOffsetAggmethod0(fd->comm, nprocs, myrank, start_offset, end_offset, st_offsets, end_offsets); + } + } else { + MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ADIO_OFFSET, fd->comm); + MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ADIO_OFFSET, fd->comm); + if ((get_oceanfsmpio_read_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_read_aggmethod() == OCEANFS_2)) { + MPI_Allgather(&count_sizes, 1, ADIO_OFFSET, count_sizes, 1, ADIO_OFFSET, fd->comm); + } + } + + /* are the accesses of different processes interleaved? */ + /* This is a rudimentary check for interleaving, but should suffice + for the moment. 
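+      For example, if rank 0 covers bytes [0, 99] and rank 1 covers [50, 149],
+      then st_offsets[1] (50) is less than end_offsets[0] (99), so the two
+      requests overlap and interleave_count is incremented.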
*/ + for (i = 1; i < nprocs; i++) { + if ((st_offsets[i] < end_offsets[i - 1]) && (st_offsets[i] <= end_offsets[i])) { + interleave_count++; + } + } + } + + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); + + if (fd->hints->cb_read == ADIOI_HINT_DISABLE || (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO))) { + /* don't do aggregation */ + if (fd->hints->cb_read != ADIOI_HINT_DISABLE) { + FreeAdioiFour(offset_list, len_list, st_offsets, end_offsets); + } + + fd->fp_ind = orig_fp; + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + + if (buftype_is_contig && filetype_is_contig) { + if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { + off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset; + ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, off, status, error_code); + } else { + ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code); + } + } else { + ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); + } + + return; + } + + /* We're going to perform aggregation of I/O. Here we call + * ADIOI_Calc_file_domains() to determine what processes will handle I/O + * to what regions. We pass nprocs_for_coll into this function; it is + * used to determine how many processes will perform I/O, which is also + * the number of regions into which the range of bytes must be divided. + * These regions are called "file domains", or FDs. + * + * When this function returns, fd_start, fd_end, fd_size, and + * min_st_offset will be filled in. fd_start holds the starting byte + * location for each file domain. fd_end holds the ending byte location. + * min_st_offset holds the minimum byte location that will be accessed. + * + * Both fd_start[] and fd_end[] are indexed by an aggregator number; this + * needs to be mapped to an actual rank in the communicator later. + * + */ + int currentNonZeroDataIndex = 0; + if ((get_oceanfsmpio_read_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_read_aggmethod() == OCEANFS_2)) { + /* Take out the 0-data offsets by shifting the indexes with data to the + * front and keeping track of the non-zero data index for use as the + * length. By doing this we will optimally use all available aggs + * and spread the actual data across them instead of having offsets + * with empty data potentially dilute the file domains and create + * problems for the one-sided aggregation. 
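+     * For example, with four ranks where ranks 1 and 3 contribute no data,
+     * entries 0 and 2 are shifted into slots 0 and 1 and
+     * currentNonZeroDataIndex ends up as 2, which is what gets passed to the
+     * file-domain calculation below in place of nprocs.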
+ */ + for (i = 0; i < nprocs; i++) { + if (count_sizes[i] > 0) { + st_offsets[currentNonZeroDataIndex] = st_offsets[i]; + end_offsets[currentNonZeroDataIndex] = end_offsets[i]; + currentNonZeroDataIndex++; + } + } + } + if (get_oceanfsmpio_tuneblocking()) { + if ((get_oceanfsmpio_read_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_read_aggmethod() == OCEANFS_2)) { + ADIOI_OCEANFS_Calc_file_domains(fd, st_offsets, end_offsets, currentNonZeroDataIndex, nprocs_for_coll, + &min_st_offset, &fd_start, &fd_end, &fd_size, fd->fs_ptr); + } else { + ADIOI_OCEANFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs, nprocs_for_coll, &min_st_offset, &fd_start, + &fd_end, &fd_size, fd->fs_ptr); + } + } else { + if ((get_oceanfsmpio_read_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_read_aggmethod() == OCEANFS_2)) { + ADIOI_Calc_file_domains(st_offsets, end_offsets, currentNonZeroDataIndex, nprocs_for_coll, &min_st_offset, + &fd_start, &fd_end, fd->hints->min_fdomain_size, &fd_size, fd->hints->striping_unit); + } else { + ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, &min_st_offset, &fd_start, + &fd_end, fd->hints->min_fdomain_size, &fd_size, fd->hints->striping_unit); + } + } + if ((get_oceanfsmpio_read_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_read_aggmethod() == OCEANFS_2)) { + /* If the user has specified to use a one-sided aggregation method then do that at + * this point instead of the two-phase I/O. + */ + ADIOI_OneSidedReadAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, + st_offsets, end_offsets, currentNonZeroDataIndex, fd_start, fd_end); + FreeAdioiFour(offset_list, len_list, st_offsets, end_offsets); + FreeAdioiThree(fd_start, fd_end, count_sizes); + goto fn_exit; + } + if (get_oceanfsmpio_p2pcontig() == 1) { + /* For some simple yet common(?) workloads, full-on two-phase I/O is + * overkill. We can establish sub-groups of processes and their + * aggregator, and then these sub-groups will carry out a simplified + * two-phase over that sub-group. + * + * First verify that the filetype is contig and the offsets are + * increasing in rank order */ + int x; + int inOrderAndNoGaps = 1; + for (x = 0; x < (nprocs - 1); x++) { + if (end_offsets[x] != (st_offsets[x + 1] - 1)) { + inOrderAndNoGaps = 0; + } + } + if (inOrderAndNoGaps && buftype_is_contig) { + /* if these conditions exist then execute the P2PContig code else + * execute the original code */ + ADIOI_P2PContigReadAggregation(fd, buf, error_code, st_offsets, end_offsets, fd_start, fd_end); + + /* NOTE: we are skipping the rest of two-phase in this path */ + FreeAdioiFour(offset_list, len_list, st_offsets, end_offsets); + FreeAdioiTwo(fd_start, fd_end); + goto fn_exit; + } + } + + /* calculate where the portions of the access requests of this process + * are located in terms of the file domains. this could be on the same + * process or on other processes. this function fills in: + * count_my_req_procs - number of processes (including this one) for which + * this process has requests in their file domain + * count_my_req_per_proc - count of requests for each process, indexed + * by rank of the process + * my_req[] - array of data structures describing the requests to be + * performed by each process (including self). indexed by rank. 
+ * buf_idx[] - array of locations into which data can be directly moved; + * this is only valid for contiguous buffer case + */ + if (get_oceanfsmpio_tuneblocking()) { + ADIOI_OCEANFS_Calc_my_req(fd, offset_list, len_list, contig_access_count, min_st_offset, fd_start, fd_end, fd_size, + nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx); + } else { + ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count, min_st_offset, fd_start, fd_end, fd_size, + nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx); + } + + /* perform a collective communication in order to distribute the + * data calculated above. fills in the following: + * count_others_req_procs - number of processes (including this + * one) which have requests in this process's file domain. + * count_others_req_per_proc[] - number of separate contiguous + * requests from proc i lie in this process's file domain. + */ + if (get_oceanfsmpio_tuneblocking()) { + ADIOI_OCEANFS_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, my_req, nprocs, myrank, + &count_others_req_procs, &others_req); + } else { + ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, my_req, nprocs, myrank, + &count_others_req_procs, &others_req); + } + + /* my_req[] and count_my_req_per_proc aren't needed at this point, so + * let's free the memory + */ + ADIOI_Free(count_my_req_per_proc); + FreeAccess(my_req, nprocs); + + /* read data in sizes of no more than ADIOI_Coll_bufsize, + * communicate, and fill user buf. + */ + ADIOI_Read_and_exch(fd, buf, datatype, nprocs, myrank, others_req, offset_list, len_list, contig_access_count, + min_st_offset, fd_size, fd_start, fd_end, buf_idx, error_code); + + + if (!buftype_is_contig) { + OCEANFS_Delete_flattened(datatype); + } + + /* free all memory allocated for collective I/O */ + FreeAccessAll(others_req, nprocs); + + FreeAdioiThree(buf_idx, fd_start, fd_end); + FreeAdioiFour(offset_list, len_list, st_offsets, end_offsets); + +fn_exit: +#ifdef HAVE_STATUS_SET_BYTES + MPI_Type_size_x(datatype, &size); + bufsize = size * count; + MPIR_Status_set_bytes(status, datatype, bufsize); + /* This is a temporary way of filling in status. The right way is to + keep track of how much data was actually read and placed in buf + during collective I/O. */ +#endif + + fd->fp_sys_posn = -1; /* set it to null. */ + /* group lock, flush data */ + MPI_Barrier(fd->comm); + ADIO_Flush(fd, error_code); +} + +static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype datatype, int nprocs, int myrank, + ADIOI_Access *others_req, ADIO_Offset *offset_list, ADIO_Offset *len_list, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, OCEANFS_Int *buf_idx, + int *error_code) +{ + /* Read in sizes of no more than coll_bufsize, an info parameter. + Send data to appropriate processes. + Place recd. data in user buf. + The idea is to reduce the amount of extra memory required for + collective I/O. If all data were read all at once, which is much + easier, it would require temp space more than the size of user_buf, + which is often unacceptable. For example, to read a distributed + array from a file, where each local array is 8Mbytes, requiring + at least another 8Mbytes of temp space is unacceptable. 
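+       As a concrete illustration, with a cb_buffer_size (coll_bufsize) of
+       4 Mbytes and an aggregator whose file domain spans 20 Mbytes of requests,
+       SetNtimesLocal below gives ntimes = 5, and the MPI_Allreduce of the
+       maximum keeps every process in step for the same number of exchange
+       rounds.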
*/ + + int i, j, m, ntimes, max_ntimes, buftype_is_contig; + ADIO_Offset st_loc = -1; + ADIO_Offset end_loc = -1; + ADIO_Offset off, done, real_off, req_off; + char *read_buf = NULL; + char *tmp_buf = NULL; + int *curr_offlen_ptr = NULL; + int *count = NULL; + int *send_size = NULL; + int *recv_size = NULL; + int *partial_send = NULL; + int *recd_from_proc = NULL; + int *start_pos = NULL; + /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */ + ADIO_Offset real_size, size, for_curr_iter, for_next_iter; + int req_len, flag, rank; + MPI_Status status; + ADIOI_Flatlist_node *flat_buf = NULL; + MPI_Aint buftype_extent; + int coll_bufsize; +#ifdef RDCOLL_DEBUG + int iii; +#endif + *error_code = MPI_SUCCESS; /* changed below if error */ + /* only I/O errors are currently reported */ + + /* calculate the number of reads of size coll_bufsize + to be done by each process and the max among all processes. + That gives the no. of communication phases as well. + coll_bufsize is obtained from the hints object. */ + + coll_bufsize = fd->hints->cb_buffer_size; + + /* grab some initial values for st_loc and end_loc */ + CalcLoc(others_req, nprocs, &st_loc, &end_loc); + + /* calculate ntimes, the number of times this process must perform I/O + * operations in order to complete all the requests it has received. + * the need for multiple I/O operations comes from the restriction that + * we only use coll_bufsize bytes of memory for internal buffering. + */ + SetNtimesLocal(&ntimes, st_loc, end_loc, coll_bufsize); + + MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); + + read_buf = fd->io_buf; + + curr_offlen_ptr = (int *)ADIOI_Calloc(nprocs, sizeof(int)); + /* its use is explained below. calloc initializes to 0. */ + + count = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + /* to store count of how many off-len pairs per proc are satisfied + in an iteration. */ + + partial_send = (int *)ADIOI_Calloc(nprocs, sizeof(int)); + /* if only a portion of the last off-len pair is sent to a process + in a particular iteration, the length sent is stored here. + calloc initializes to 0. */ + + send_size = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + /* total size of data to be sent to each proc. in an iteration */ + + recv_size = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + /* total size of data to be recd. from each proc. in an iteration. + Of size nprocs so that I can use MPI_Alltoall later. */ + + recd_from_proc = (int *)ADIOI_Calloc(nprocs, sizeof(int)); + /* amount of data recd. so far from each proc. Used in + ADIOI_Fill_user_buffer. initialized to 0 here. */ + + start_pos = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + /* used to store the starting value of curr_offlen_ptr[i] in + this iteration */ + + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); + if (!buftype_is_contig) { + flat_buf = ADIOI_Flatten_and_find(datatype); + } + MPI_Type_extent(datatype, &buftype_extent); + + done = 0; + off = st_loc; + for_curr_iter = for_next_iter = 0; + + MPI_Comm_rank(fd->comm, &rank); + +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_14, 0, "end computation"); +#endif + + for (m = 0; m < ntimes; m++) { + /* read buf of size coll_bufsize (or less) */ + /* go through all others_req and check if any are satisfied + by the current read */ + + /* since MPI guarantees that displacements in filetypes are in + monotonically nondecreasing order, I can maintain a pointer + (curr_offlen_ptr) to + current off-len pair for each process in others_req and scan + further only from there. 
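+       (start_pos[i] below records where this cursor stood at the beginning of
+       each iteration, so the send datatypes can be built from exactly the
+       off-len pairs satisfied in that iteration.)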
There is still a problem of filetypes + such as: (1, 2, 3 are not process nos. They are just numbers for + three chunks of data, specified by a filetype.) + + 1 -------!-- + 2 -----!---- + 3 --!----- + + where ! indicates where the current read_size limitation cuts + through the filetype. I resolve this by reading up to !, but + filling the communication buffer only for 1. I copy the portion + left over for 2 into a tmp_buf for use in the next + iteration. i.e., 2 and 3 will be satisfied in the next + iteration. This simplifies filling in the user's buf at the + other end, as only one off-len pair with incomplete data + will be sent. I also don't need to send the individual + offsets and lens along with the data, as the data is being + sent in a particular order. */ + + /* off = start offset in the file for the data actually read in + this iteration + size = size of data read corresponding to off + real_off = off minus whatever data was retained in memory from + previous iteration for cases like 2, 3 illustrated above + real_size = size plus the extra corresponding to real_off + req_off = off in file for a particular contiguous request + minus what was satisfied in previous iteration + req_size = size corresponding to req_off */ + +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_13, 0, "start computation"); +#endif + size = ADIOI_MIN((unsigned)coll_bufsize, end_loc - st_loc + 1 - done); + real_off = off - for_curr_iter; + real_size = size + for_curr_iter; + + for (i = 0; i < nprocs; i++) { + count[i] = send_size[i] = 0; + } + for_next_iter = 0; + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + start_pos[i] = curr_offlen_ptr[i]; + for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + if (partial_send[i]) { + /* this request may have been partially + satisfied in the previous iteration. */ + req_off = others_req[i].offsets[j] + partial_send[i]; + req_len = others_req[i].lens[j] - partial_send[i]; + partial_send[i] = 0; + /* modify the off-len pair to reflect this change */ + others_req[i].offsets[j] = req_off; + others_req[i].lens[j] = req_len; + } else { + req_off = others_req[i].offsets[j]; + req_len = others_req[i].lens[j]; + } + if (req_off < real_off + real_size) { + count[i]++; + ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)read_buf) + req_off - real_off) == + (ADIO_Offset)(MPIU_Upint)(read_buf + req_off - real_off)); + MPI_Address(read_buf + req_off - real_off, &(others_req[i].mem_ptrs[j])); + ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off)); + send_size[i] += + (int)(ADIOI_MIN(real_off + real_size - req_off, (ADIO_Offset)(unsigned)req_len)); + + if (real_off + real_size - req_off < (ADIO_Offset)(unsigned)req_len) { + partial_send[i] = (int)(real_off + real_size - req_off); + if ((j + 1 < others_req[i].count) && + (others_req[i].offsets[j + 1] < real_off + real_size)) { + /* this is the case illustrated in the + figure above. 
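+          the part of chunk 2 that was already read is not sent in this round;
+          for_next_iter, computed just below, measures how many bytes at the
+          tail of the read buffer must be kept and re-presented in the next
+          iteration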
*/ + for_next_iter = + ADIOI_MAX(for_next_iter, real_off + real_size - others_req[i].offsets[j + 1]); + /* max because it must cover requests + from different processes */ + } + break; + } + } else + break; + } + curr_offlen_ptr[i] = j; + } + } + + flag = 0; + for (i = 0; i < nprocs; i++) { + if (count[i]) { + flag = 1; + break; + } + } + +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_14, 0, "end computation"); +#endif + if (flag) { + char round[50]; + sprintf(round, "two-phase-round=%d", m); + setenv("LIBIOLOG_EXTRA_INFO", round, 1); + ADIOI_Assert(size == (int)size); + ADIO_ReadContig(fd, read_buf + for_curr_iter, (int)size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, + error_code); + + if (*error_code != MPI_SUCCESS) { + return; + } + } + + for_curr_iter = for_next_iter; + +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_7, 0, "start communication"); +#endif + if (get_oceanfsmpio_comm() == 1) { + ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list, send_size, recv_size, count, start_pos, + partial_send, recd_from_proc, nprocs, myrank, buftype_is_contig, contig_access_count, min_st_offset, + fd_size, fd_start, fd_end, others_req, m, buftype_extent, buf_idx); + } else if (get_oceanfsmpio_comm() == 0) { + ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list, send_size, recv_size, count, + start_pos, partial_send, recd_from_proc, nprocs, myrank, buftype_is_contig, contig_access_count, + min_st_offset, fd_size, fd_start, fd_end, others_req, m, buftype_extent, buf_idx); + } + + +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_8, 0, "end communication"); +#endif + + if (for_next_iter) { + tmp_buf = (char *)ADIOI_Malloc(for_next_iter); + ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)read_buf) + real_size - for_next_iter) == + (ADIO_Offset)(MPIU_Upint)(read_buf + real_size - for_next_iter)); + ADIOI_Assert((for_next_iter + coll_bufsize) == (size_t)(for_next_iter + coll_bufsize)); + memcpy(tmp_buf, read_buf + real_size - for_next_iter, for_next_iter); + ADIOI_Free(fd->io_buf); + fd->io_buf = (char *)ADIOI_Malloc(for_next_iter + coll_bufsize); + memcpy(fd->io_buf, tmp_buf, for_next_iter); + read_buf = fd->io_buf; + ADIOI_Free(tmp_buf); + } + + off += size; + done += size; + } + + for (i = 0; i < nprocs; i++) { + count[i] = send_size[i] = 0; + } +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_7, 0, "start communication"); +#endif + for (m = ntimes; m < max_ntimes; m++) { + /* nothing to send, but check for recv. 
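+           a rank whose own ntimes is smaller than max_ntimes keeps entering the
+           exchange with zeroed count[] and send_size[] so the collective
+           communication stays matched on every process in the communicator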
*/ + if (get_oceanfsmpio_comm() == 1) { + ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list, send_size, recv_size, count, start_pos, + partial_send, recd_from_proc, nprocs, myrank, buftype_is_contig, contig_access_count, min_st_offset, + fd_size, fd_start, fd_end, others_req, m, buftype_extent, buf_idx); + } else if (get_oeanfsmpio_comm() == 0) { + ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list, send_size, recv_size, count, + start_pos, partial_send, recd_from_proc, nprocs, myrank, buftype_is_contig, contig_access_count, + min_st_offset, fd_size, fd_start, fd_end, others_req, m, buftype_extent, buf_idx); + } + } +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_8, 0, "end communication"); +#endif + + FreeAdioiThree(curr_offlen_ptr, count, partial_send); + FreeAdioiFour(send_size, recv_size, recd_from_proc, start_pos); + + unsetenv("LIBIOLOG_EXTRA_INFO"); +} + +static void BufRecv(ADIO_File fd, int nprocs, int buftype_is_contig, MPI_Request *requests, + char*** recv_buf, int* recv_size, void* buf, OCEANFS_Int* buf_idx, int myrank, int iter) +{ + int i, j; + if (buftype_is_contig) { + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { + MPI_Irecv(((char *)buf) + buf_idx[i], recv_size[i], MPI_BYTE, i, myrank + i + MUL_100 * iter, fd->comm, + requests + j); + j++; + buf_idx[i] += recv_size[i]; + } + } + } else { + /* allocate memory for recv_buf and post receives */ + *recv_buf = (char **)ADIOI_Malloc(nprocs * sizeof(char *)); + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { + (*recv_buf)[i] = (char *)ADIOI_Malloc(recv_size[i]); + } + } + + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { + MPI_Irecv((*recv_buf)[i], recv_size[i], MPI_BYTE, i, + myrank + i + MUL_100 * iter, fd->comm, requests + j); + j++; + } + } + } +} + +static void RecvBufFree(int nprocs, int buftype_is_contig, char** recv_buf, int* recv_size) +{ + if (buftype_is_contig) { + return; + } + + int i; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { + ADIOI_Free(recv_buf[i]); + } + } + ADIOI_Free(recv_buf); +} + +static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node *flat_buf, ADIO_Offset *offset_list, + ADIO_Offset *len_list, int *send_size, int *recv_size, int *count, int *start_pos, int *partial_send, + int *recd_from_proc, int nprocs, int myrank, int buftype_is_contig, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, + ADIOI_Access *others_req, int iter, MPI_Aint buftype_extent, OCEANFS_Int *buf_idx) +{ + int i, j, nprocs_recv, nprocs_send; + int k = 0; + int tmp = 0; + char **recv_buf = NULL; + MPI_Request *requests = NULL; + MPI_Datatype send_type; + MPI_Status *statuses = NULL; + + /* exchange send_size info so that each process knows how much to + receive from whom and how much memory to allocate. */ + + MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fd->comm); + + nprocs_recv = CalcCount(recv_size, nprocs); + nprocs_send = CalcCount(send_size, nprocs); + + requests = (MPI_Request *)ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request)); + /* +1 to avoid a 0-size malloc */ + + /* post recvs. if buftype_is_contig, data can be directly recd. into + user buf at location given by buf_idx. else use recv_buf. 
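+       this is exactly what the BufRecv helper defined above does when it is
+       called just below; the tag myrank + i + MUL_100 * iter is symmetric in
+       the two ranks, so the matching MPI_Isend farther down computes the same
+       value, and the iter term separates successive exchange rounds.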
*/ + +#ifdef AGGREGATION_PROFILE + MPE_Log_event(MPE_Log_ID_5032, 0, NULL); +#endif + + BufRecv(fd, nprocs, buftype_is_contig, requests, &recv_buf, recv_size, buf, buf_idx, myrank, iter); + + /* create derived datatypes and send data */ + j = 0; + for (i = 0; i < nprocs; i++) { + if (send_size[i]) { + /* take care if the last off-len pair is a partial send */ + if (partial_send[i]) { + k = start_pos[i] + count[i] - 1; + tmp = others_req[i].lens[k]; + others_req[i].lens[k] = partial_send[i]; + } + ADIOI_Type_create_hindexed_x(count[i], &(others_req[i].lens[start_pos[i]]), + &(others_req[i].mem_ptrs[start_pos[i]]), MPI_BYTE, &send_type); + /* absolute displacement; use MPI_BOTTOM in send */ + MPI_Type_commit(&send_type); + MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank + i + MUL_100 * iter, fd->comm, requests + nprocs_recv + j); + MPI_Type_free(&send_type); + if (partial_send[i]) + others_req[i].lens[k] = tmp; + j++; + } + } + + statuses = (MPI_Status *)ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Status)); + /* +1 to avoid a 0-size malloc */ + + /* wait on the receives */ + if (nprocs_recv) { +#ifdef NEEDS_MPI_TEST + j = 0; + while (!j) { + MPI_Testall(nprocs_recv, requests, &j, statuses); + } +#else + MPI_Waitall(nprocs_recv, requests, statuses); +#endif + + /* if noncontiguous, to the copies from the recv buffers */ + if (!buftype_is_contig) { + ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf, offset_list, len_list, (unsigned *)recv_size, requests, + statuses, recd_from_proc, nprocs, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, + buftype_extent); + } + } + + /* wait on the sends */ + MPI_Waitall(nprocs_send, requests + nprocs_recv, statuses + nprocs_recv); + + FreeAdioiTwo(statuses, requests); + RecvBufFree(nprocs, buftype_is_contig, recv_buf, recv_size); +#ifdef AGGREGATION_PROFILE + MPE_Log_event(MPE_Log_ID_5033, 0, NULL); +#endif +} + +#define ADIOI_RD_BUF_COPY \ + { \ + while (size) { \ + size_in_buf = ADIOI_MIN(size, flat_buf_sz); \ + ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)buf) + user_buf_idx) == \ + (ADIO_Offset)(MPIU_Upint)(buf + user_buf_idx)); \ + ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \ + int ret = 0; \ + memcpy(((char *)buf) + user_buf_idx, &(recv_buf[p][recv_buf_idx[p]]), size_in_buf); \ + recv_buf_idx[p] += size_in_buf; \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + if (!flat_buf_sz) { \ + if (flat_buf_idx < (flat_buf->count - 1)) \ + flat_buf_idx++; \ + else { \ + flat_buf_idx = 0; \ + n_buftypes++; \ + } \ + user_buf_idx = \ + flat_buf->indices[flat_buf_idx] + (ADIO_Offset)n_buftypes * (ADIO_Offset)buftype_extent; \ + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ + } \ + size -= size_in_buf; \ + buf_incr -= size_in_buf; \ + } \ + AD_COLL_BUF_INCR \ + } + +#define ADIOI_RD_BUF \ + { \ + if (recv_buf_idx[p] < recv_size[p]) { \ + if (curr_from_proc[p] + len > done_from_proc[p]) { \ + if (done_from_proc[p] > curr_from_proc[p]) { \ + size = ADIOI_MIN(curr_from_proc[p] + len - done_from_proc[p], recv_size[p] - recv_buf_idx[p]); \ + buf_incr = done_from_proc[p] - curr_from_proc[p]; \ + AD_COLL_BUF_INCR \ + buf_incr = curr_from_proc[p] + len - done_from_proc[p]; \ + ADIOI_Assert((done_from_proc[p] + size) == (unsigned)((ADIO_Offset)done_from_proc[p] + size)); \ + curr_from_proc[p] = done_from_proc[p] + size; \ + ADIOI_RD_BUF_COPY \ + } else { \ + size = ADIOI_MIN(len, recv_size[p] - recv_buf_idx[p]); \ + buf_incr = len; \ + ADIOI_Assert((curr_from_proc[p] + size) == (unsigned)((ADIO_Offset)curr_from_proc[p] + 
size)); \ + curr_from_proc[p] += (unsigned)size; \ + ADIOI_RD_BUF_COPY \ + } \ + } else { \ + ADIOI_Assert((curr_from_proc[p] + len) == (unsigned)((ADIO_Offset)curr_from_proc[p] + len)); \ + curr_from_proc[p] += (unsigned)len; \ + buf_incr = len; \ + AD_COLL_BUF_INCR \ + } \ + } else { \ + buf_incr = len; \ + AD_COLL_BUF_INCR \ + } \ + } + +static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node *flat_buf, char **recv_buf, + ADIO_Offset *offset_list, ADIO_Offset *len_list, unsigned *recv_size, MPI_Request *requests, MPI_Status *statuses, + int *recd_from_proc, int nprocs, int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, MPI_Aint buftype_extent) +{ + /* this function is only called if buftype is not contig */ + int i, p; + ADIO_Offset off, len, rem_len; + /* Not sure unsigned is necessary, but it makes the math safer */ + unsigned *curr_from_proc = NULL; + unsigned *done_from_proc = NULL; + unsigned *recv_buf_idx = NULL; + + ADIO_Offset flat_buf_sz, size_in_buf; + ADIO_Offset size, buf_incr; + ADIO_Offset user_buf_idx; + int flat_buf_idx; + int n_buftypes; + user_buf_idx = flat_buf->indices[0]; + flat_buf_idx = 0; + n_buftypes = 0; + flat_buf_sz = flat_buf->blocklens[0]; + + OCEANFS_UNREFERENCED_ARG(requests); + OCEANFS_UNREFERENCED_ARG(statuses); + + /* curr_from_proc[p] = amount of data recd from proc. p that has already + been accounted for so far + done_from_proc[p] = amount of data already recd from proc. p and + filled into user buffer in previous iterations + user_buf_idx = current location in user buffer + recv_buf_idx[p] = current location in recv_buf of proc. p */ + curr_from_proc = (unsigned *)ADIOI_Malloc(nprocs * sizeof(unsigned)); + done_from_proc = (unsigned *)ADIOI_Malloc(nprocs * sizeof(unsigned)); + recv_buf_idx = (unsigned *)ADIOI_Malloc(nprocs * sizeof(unsigned)); + + for (i = 0; i < nprocs; i++) { + recv_buf_idx[i] = curr_from_proc[i] = 0; + done_from_proc[i] = recd_from_proc[i]; + } + + /* flat_buf_idx = current index into flattened buftype + flat_buf_sz = size of current contiguous component in + flattened buf */ + + for (i = 0; i < contig_access_count; i++) { + off = offset_list[i]; + rem_len = len_list[i]; + + /* this request may span the file domains of more than one process */ + while (rem_len > 0) { + len = rem_len; + /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no + * longer than the single region that processor "p" is responsible + * for. 
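+             * For example, with 2 Mbyte file domains, a 3 Mbyte piece that
+             * starts 1 Mbyte into aggregator p's domain is clipped here to the
+             * 1 Mbyte that p owns; the remaining 2 Mbytes are attributed to the
+             * following aggregators on later passes of the enclosing
+             * while (rem_len > 0) loop.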
+ */ + p = ADIOI_OCEANFS_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_start, fd_end); + + ADIOI_RD_BUF + + off += len; + rem_len -= len; + } + } + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { + recd_from_proc[i] = curr_from_proc[i]; + } + } + + FreeAdioiThree(curr_from_proc, done_from_proc, recv_buf_idx); +} + +static void ADIOI_R_Exchange_data_alltoallv(ADIO_File fd, void *buf, ADIOI_Flatlist_node *flat_buf, + ADIO_Offset *offset_list, ADIO_Offset *len_list, int *send_size, int *recv_size, int *count, int *start_pos, + int *partial_send, int *recd_from_proc, int nprocs, int myrank, int buftype_is_contig, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, + ADIOI_Access *others_req, int iter, MPI_Aint buftype_extent, OCEANFS_Int *buf_idx) +{ + int i, j, nprocs_recv; + int k = 0; + int tmp = 0; + char **recv_buf = NULL; + MPI_Request *requests = NULL; + MPI_Status *statuses = NULL; + int rtail, stail; + char *sbuf_ptr = NULL; + char *from_ptr = NULL; + int len; + int *sdispls = NULL; + int *rdispls = NULL; + char *all_recv_buf = NULL; + char *all_send_buf = NULL; + + + /* exchange send_size info so that each process knows how much to + receive from whom and how much memory to allocate. */ + MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fd->comm); + + nprocs_recv = CalcCount(recv_size, nprocs); + + /* receiver side data structures */ + rdispls = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + rtail = 0; + for (i = 0; i < nprocs; i++) { + rdispls[i] = rtail; + rtail += recv_size[i]; + } + + /* data buffer */ + all_recv_buf = (char *)ADIOI_Malloc(rtail); + recv_buf = (char **)ADIOI_Malloc(nprocs * sizeof(char *)); + for (i = 0; i < nprocs; i++) { + recv_buf[i] = all_recv_buf + rdispls[i]; + } + + /* sender side data structures */ + sdispls = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + stail = 0; + for (i = 0; i < nprocs; i++) { + sdispls[i] = stail; + stail += send_size[i]; + } + + /* data buffer */ + all_send_buf = (char *)ADIOI_Malloc(stail); + for (i = 0; i < nprocs; i++) { + if (send_size[i]) { + if (partial_send[i]) { + k = start_pos[i] + count[i] - 1; + tmp = others_req[i].lens[k]; + others_req[i].lens[k] = partial_send[i]; + } + sbuf_ptr = all_send_buf + sdispls[i]; + for (j = 0; j < count[i]; j++) { + ADIOI_ENSURE_AINT_FITS_IN_PTR(others_req[i].mem_ptrs[start_pos[i] + j]); + from_ptr = (char *)ADIOI_AINT_CAST_TO_VOID_PTR(others_req[i].mem_ptrs[start_pos[i] + j]); + len = others_req[i].lens[start_pos[i] + j]; + memcpy(sbuf_ptr, from_ptr, len); + sbuf_ptr += len; + } + if (partial_send[i]) { + others_req[i].lens[k] = tmp; + } + } + } + + /* alltoallv */ + MPI_Alltoallv(all_send_buf, send_size, sdispls, MPI_BYTE, all_recv_buf, recv_size, rdispls, MPI_BYTE, fd->comm); + + /* unpack at the receiver side */ + if (nprocs_recv) { + if (!buftype_is_contig) { + ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf, offset_list, len_list, (unsigned *)recv_size, requests, + statuses, /* never used inside */ + recd_from_proc, nprocs, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, buftype_extent); + } else { + rtail = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { + memcpy((char *)buf + buf_idx[i], all_recv_buf + rtail, recv_size[i]); + buf_idx[i] += recv_size[i]; + rtail += recv_size[i]; + } + } + } + } + + FreeAdioiFive(all_send_buf, all_recv_buf, recv_buf, sdispls, rdispls); + return; +} +static int ADIOI_OCEANFS_ReadStridedCollView(ADIO_File fd, void *buf, int count, 
MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code) +{ + return ADIOI_OCEANFS_StridedViewIO(fd, buf, count, datatype, file_ptr_type, offset, status, OCEANFS_READ_COLL, error_code); +} +void ADIOI_OCEANFS_ReadStridedColl(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code) +{ + ADIOI_OCEANFS_fs *ocean_fs = (ADIOI_OCEANFS_fs *)fd->fs_ptr; + ad_oceanfs_group_report(fd, ocean_fs->context->group_id); + + double t = ad_oceanfs_timing_get_time(); + if (fd->hints->fs_hints.oceanfs.view_io == ADIOI_HINT_ENABLE) { + ADIOI_OCEANFS_ReadStridedCollView(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); + } else { + ADIOI_OCEANFS_oldReadStridedColl(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); + } + ad_oceanfs_timing_report(fd, OCEANFSMPIO_CIO_R_STRIDED_COLL, t); + return; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_rdstr.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_rdstr.c new file mode 100644 index 00000000000..9c5a4fd2c43 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_rdstr.c @@ -0,0 +1,25 @@ +#include +#include "adio.h" +#include "adio_extern.h" +#include "ad_oceanfs.h" +#include "ad_oceanfs_pub.h" + +static int ADIOI_OCEANFS_ReadStridedView(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code) +{ + return ADIOI_OCEANFS_StridedViewIO(fd, buf, count, datatype, file_ptr_type, offset, status, OCEANFS_READ_STRIDED, + error_code); +} + +void ADIOI_OCEANFS_ReadStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code) +{ + double t = ad_oceanfs_timing_get_time(); + if (fd->hints->fs_hints.oceanfs.view_io == ADIOI_HINT_ENABLE) { + ADIOI_OCEANFS_ReadStridedView(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); + } else { + ADIOI_GEN_ReadStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); + } + ad_oceanfs_timing_report(fd, OCEANFSMPIO_CIO_R_STRIDED, t); + return; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_resize.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_resize.c new file mode 100644 index 00000000000..2c512b565ec --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_resize.c @@ -0,0 +1,42 @@ +#include "ad_oceanfs.h" +#include "ad_oceanfs_common.h" +#include "ad_oceanfs_pub.h" +#include "mpi_fs_intf.h" + +void ADIOI_OCEANFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code) +{ + int ret, rank; + static char myname[] = "ADIOI_OCEANFS_RESIZE"; + + if (!error_code) { + return; + } + if (!fd) { + *error_code = MPI_ERR_FILE; + return; + } + + if (fd->split_coll_count || fd->async_count) { + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", + "**io %s", strerror(errno)); + return; + } + MPI_Comm_rank(fd->comm, &rank); + + /* rank 0 process performs ftruncate(), then bcast torest process */ + + if (rank == fd->hints->ranklist[0]) { + ret = mpi_fs_ftruncate(fd->fd_sys, size); + } + /* bcast return value */ + MPI_Bcast(&ret, 1, MPI_INT, fd->hints->ranklist[0], fd->comm); + + if (ret != 0) { + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_FILE, + "Error in mpi_fs_ftruncate", 0); + } else { + 
*error_code = MPI_SUCCESS; + } + + return; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_tuning.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_tuning.c new file mode 100644 index 00000000000..7b48dc69b9d --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_tuning.c @@ -0,0 +1,84 @@ +#include "ad_oceanfs_tuning.h" +#include "ad_oceanfs_file.h" +#include "ad_env.h" +#include "mpi.h" + +static double g_prof_sum[OCEANFSMPIO_CIO_T_FUN_MAX] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +static int g_prof_cnt[OCEANFSMPIO_CIO_T_FUN_MAX] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +static double g_prof_max[OCEANFSMPIO_CIO_T_FUN_MAX] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +double ad_oceanfs_timing_get_time() +{ + return MPI_Wtime(); +} + +static void update_time(int fun, double val) +{ + static const int fun_divisor = 2; + if (fun >= OCEANFSMPIO_CIO_R_CONTIG && fun < OCEANFSMPIO_CIO_T_FUN_MAX) { + g_prof_sum[fun] += val; + g_prof_cnt[fun] += 1; + + if (fun % fun_divisor == 0) { + g_prof_sum[OCEANFSMPIO_CIO_R_TOTAL_CNT]++; + g_prof_max[OCEANFSMPIO_CIO_R_TOTAL_CNT] = g_prof_sum[OCEANFSMPIO_CIO_R_TOTAL_CNT]; + } else { + g_prof_sum[OCEANFSMPIO_CIO_W_TOTAL_CNT]++; + g_prof_max[OCEANFSMPIO_CIO_W_TOTAL_CNT] = g_prof_sum[OCEANFSMPIO_CIO_W_TOTAL_CNT]; + } + + if (val > g_prof_max[fun] + 1e-10) { + g_prof_max[fun] = val; + } + } +} + +void ad_oceanfs_timing_report(ADIO_File fd, int fun, double start_val) +{ + if (!get_oceanfsmpio_timing() || fd->comm == MPI_COMM_NULL) { + return; + } + + double val = ad_oceanfs_timing_get_time() - start_val; + update_time(fun, val); + + double local_avg[OCEANFSMPIO_CIO_T_FUN_MAX]; + int i; + for (i = OCEANFSMPIO_CIO_R_CONTIG; i < OCEANFSMPIO_CIO_R_TOTAL_CNT; i++) { + local_avg[i] = ((g_prof_cnt[i] == 0) ? 
0 : (g_prof_sum[i] / g_prof_cnt[i])); + } + local_avg[OCEANFSMPIO_CIO_R_TOTAL_CNT] = g_prof_sum[OCEANFSMPIO_CIO_R_TOTAL_CNT]; + local_avg[OCEANFSMPIO_CIO_W_TOTAL_CNT] = g_prof_sum[OCEANFSMPIO_CIO_W_TOTAL_CNT]; + + static char* g_head_row[] = { + "contig-r", "contig-w", "strided-r", "strided-w", + "strided-coll-r", "strided-coll-w", "oceanfs-r", + "oceanfs-w", "oceanfs-call-r", "oceanfs-call-w" + }; + static int g_row_cnt = 10; + static int g_head_row_size = OCEANFSMPIO_CIO_T_FUN_MAX; + static char* g_head_col[] = { "avg", "max" }; + static int g_col_cnt = 2; + static int g_col_len = 18; + static int g_col_avg_no = 0; + static int g_col_max_no = 1; + static int g_head_col_size = 2; + + static int g_dir_len = 128; + char pname[g_dir_len]; + snprintf(pname, sizeof(pname), "/mpi_state/%d", getpid()); + + TAdOceanfsFile *oceanfs_file = ad_oceanfs_file_init(pname, FILE_CREATE_INTIME, g_row_cnt, g_col_cnt, g_col_len, + g_head_row, g_head_row_size, g_head_col, g_head_col_size); + + if (oceanfs_file == NULL) { + return; + } + + for (i = OCEANFSMPIO_CIO_R_CONTIG; i < OCEANFSMPIO_CIO_T_FUN_MAX; i++) { + ad_oceanfs_file_set_double(oceanfs_file, i, g_col_avg_no, local_avg[i]); + ad_oceanfs_file_set_double(oceanfs_file, i, g_col_max_no, g_prof_max[i]); + } + + ad_oceanfs_file_destroy(oceanfs_file); +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_tuning.h b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_tuning.h new file mode 100644 index 00000000000..2a1b7f9e61e --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_tuning.h @@ -0,0 +1,24 @@ +#ifndef AD_OCEANFS_TUNING_H_ +#define AD_OCEANFS_TUNING_H_ + +#include "adio.h" + +/* timing fields */ +enum { + OCEANFSMPIO_CIO_R_CONTIG = 0, + OCEANFSMPIO_CIO_W_CONTIG, + OCEANFSMPIO_CIO_R_STRIDED, + OCEANFSMPIO_CIO_W_STRIDED, + OCEANFSMPIO_CIO_R_STRIDED_COLL, + OCEANFSMPIO_CIO_W_STRIDED_COLL, + OCEANFSMPIO_CIO_R_OCEANFS, + OCEANFSMPIO_CIO_W_OCEANFS, + OCEANFSMPIO_CIO_R_TOTAL_CNT, + OCEANFSMPIO_CIO_W_TOTAL_CNT, + OCEANFSMPIO_CIO_T_FUN_MAX +}; + +double ad_oceanfs_timing_get_time(); +void ad_oceanfs_timing_report(ADIO_File fd, int fun, double start_val); + +#endif /* AD_OCEANFS_TUNING_H_ */ diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_view.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_view.c new file mode 100644 index 00000000000..296eb06b767 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_view.c @@ -0,0 +1,81 @@ +#include +#include "adio.h" +#include "adio_extern.h" +#include "ad_oceanfs.h" +#include "ad_oceanfs_pub.h" +#include "mpi_fs_intf.h" + +static int ADIOI_OCEANFS_datatype_cost(ADIO_File fd, MPI_Datatype etype, MPI_Datatype filetype) +{ + int etype_is_contig; + + ADIOI_Datatype_iscontig(etype, &etype_is_contig); + return etype_is_contig ? 
1 : 0; +} + +void ad_view_Init(u32* block_count, u32** blocklens, off_t** blockoffs, off_t* ub_off, int filetype_is_contig, MPI_Count filetype_size, + ADIO_File fd) +{ + ADIOI_Flatlist_node *flat_file = NULL; + if (filetype_is_contig == 1) { + *block_count = 1; + *blocklens = (u32 *)ADIOI_Malloc(sizeof(u32)); + *blockoffs = (off_t *)ADIOI_Malloc(sizeof(off_t)); + (*blockoffs)[0] = 0; + (*blocklens)[0] = (u32)filetype_size; + *ub_off = (off_t)filetype_size; + } else { + flat_file = OCEANFS_Flatten_and_find(fd->filetype); + if (flat_file == NULL) { + ADIOI_Info_set(fd->info, "view_io", "false"); + fd->hints->fs_hints.oceanfs.view_io = ADIOI_HINT_DISABLE; + return; + } + *block_count = (u32)flat_file->count; + + *blocklens = (u32 *)ADIOI_Malloc(sizeof(u32) * (*block_count)); + *blockoffs = (off_t *)ADIOI_Malloc(sizeof(off_t) * (*block_count)); + int i; + for (i = 0; i < *block_count; i++) { + (*blockoffs)[i] = (off_t)flat_file->indices[i]; + (*blocklens)[i] = (u32)flat_file->blocklens[i]; + } + *ub_off = (*blockoffs)[*block_count - 1] + (off_t)((*blocklens)[*block_count - 1]); + } +} + +int ADIOI_OCEANFS_set_view(ADIO_File fd, int *error_code) +{ + int ret, filetype_is_contig; + u32 block_count; + u32 *blocklens = NULL; + off_t *blockoffs = NULL; + off_t ub_off; + MPI_Count filetype_size; + + MPI_Type_size_x(fd->filetype, &filetype_size); + + if (ADIOI_OCEANFS_datatype_cost(fd, fd->etype, fd->filetype) == 0) { + ADIOI_Info_set(fd->info, "view_io", "false"); + fd->hints->fs_hints.oceanfs.view_io = ADIOI_HINT_DISABLE; + *error_code = MPI_SUCCESS; + return 0; + } + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + if (fd->disp < 0) { + ADIOI_Info_set(fd->info, "view_io", "false"); + fd->hints->fs_hints.oceanfs.view_io = ADIOI_HINT_DISABLE; + return 0; + } + + ad_view_Init(&block_count, &blocklens, &blockoffs, &ub_off, filetype_is_contig, filetype_size, fd); + + ret = mpi_fs_set_fileview(fd->fd_sys, fd->disp, block_count, blocklens, blockoffs, ub_off); + if (ret < 0) { + ADIOI_Info_set(fd->info, "view_io", "false"); + fd->hints->fs_hints.oceanfs.view_io = ADIOI_HINT_DISABLE; + } + + FreeAdioiTwo(blocklens, blockoffs); + return ((ret) < 0) ? 
0 : 1; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_viewio.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_viewio.c new file mode 100644 index 00000000000..114d733f955 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_viewio.c @@ -0,0 +1,281 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "adio.h" +#include "adio_extern.h" +#include "ad_oceanfs.h" +#include "ad_oceanfs_common.h" +#include "ad_oceanfs_pub.h" +#include "mpi_fs_intf.h" + +static int g_max_size = 1048576; // 1024 * 1024 + +void ad_viewio_SetIovec(struct iovec *iov, void *iov_base, size_t iov_len) +{ + iov->iov_base = iov_base; + iov->iov_len = iov_len; +} + +void ad_viewio_ClacFlatBuf(struct iovec *iov, void *buf, ADIO_Offset size_in_buftype, ADIOI_Flatlist_node *flat_buf, int* pi, int* pk, ADIO_Offset j_bufextent) +{ + ADIO_Offset sum = 0; + int i = *pi; + int k = *pk; + for (; i < flat_buf->count; i++) { + if (flat_buf->blocklens[i]) { + sum += flat_buf->blocklens[i]; + iov[k].iov_base = buf + j_bufextent + flat_buf->indices[i]; + if (sum > size_in_buftype) { + iov[k].iov_len = size_in_buftype - sum + flat_buf->blocklens[i]; + break; + } + iov[k].iov_len = flat_buf->blocklens[i]; + k++; + } + i++; + } + *pi = i; + *pk = k; +} + +int ADIOI_OCEANFS_fill_iovec(ADIO_File fd, struct iovec *iov, ADIO_Offset *offset_inbuftypes, void *buf, + MPI_Datatype buftype, ADIOI_Flatlist_node *flat_buf, int *buftype_idx, int *block_idx, int len, int xfered_len) +{ + ADIO_Offset part_len, j_bufextent; + ADIO_Offset n_buftypes, size_in_buftype; + MPI_Aint buftype_extent; + MPI_Count buftype_size; + int buftype_is_contig; + + int i = *block_idx; + int j = *buftype_idx; + int k = 0; + MPI_Type_size_x(buftype, &buftype_size); + MPI_Type_extent(buftype, &buftype_extent); + ADIOI_Datatype_iscontig(buftype, &buftype_is_contig); + + if (buftype_is_contig) { + ad_viewio_SetIovec(iov, buf + *offset_inbuftypes, len); + *offset_inbuftypes += len; + return 1; + } + part_len = flat_buf->blocklens[i] + flat_buf->indices[i] + j * buftype_extent - *offset_inbuftypes; + if (part_len) { + ad_viewio_SetIovec(iov, buf + *offset_inbuftypes, (len < part_len) ? 
len : part_len); + *offset_inbuftypes += iov[0].iov_len; + part_len -= iov[0].iov_len; + if (part_len) + return 1; + i++; + k = 1; + } + + if (buftype_size == 0) { + return 0; + } + n_buftypes = (xfered_len + len) / buftype_size; + size_in_buftype = (xfered_len + len) % buftype_size; + + for (; j < n_buftypes; j++) { + j_bufextent = j * buftype_extent; + while (i < flat_buf->count) { + if (flat_buf->blocklens[i]) { + ad_viewio_SetIovec(iov + k, buf + j_bufextent + flat_buf->indices[i], flat_buf->blocklens[i]); + k++; + } + i++; + } + i = 0; + } + j_bufextent = j * buftype_extent; + + ad_viewio_ClacFlatBuf(iov, buf, size_in_buftype, flat_buf, &i, &k, j_bufextent); + + *block_idx = i; + *buftype_idx = j; + *offset_inbuftypes = iov[k].iov_base + iov[k].iov_len - buf; + return k; +} + +ADIO_Offset ad_viewio_update_offset_in_file(ADIO_File fd, ADIO_Offset update_bytes, ADIO_Offset curr_offset) +{ + ADIO_Offset abs_off; + ADIO_Offset off; + ADIOI_Flatlist_node *flat_file = NULL; + int i; + ADIO_Offset n_filetypes; + ADIO_Offset abs_off_in_filetype = 0; + ADIO_Offset size_in_filetype, sum; + MPI_Count filetype_size; + int filetype_is_contig; + MPI_Aint filetype_extent; + + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + + abs_off = curr_offset + update_bytes; + if (filetype_is_contig) { + off = abs_off; + } else { + flat_file = ADIOI_Flatten_and_find(fd->filetype); + + MPI_Type_extent(fd->filetype, &filetype_extent); + MPI_Type_size_x(fd->filetype, &filetype_size); + if (!filetype_size) { + return 0; + } + + n_filetypes = abs_off / filetype_size; + size_in_filetype = abs_off - n_filetypes * filetype_size; + + sum = 0; + for (i = 0; i < flat_file->count; i++) { + sum += flat_file->blocklens[i]; + if (sum > size_in_filetype) { + abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); + break; + } + } + + /* abs. 
offset in bytes in the file */ + off = n_filetypes * filetype_extent + abs_off_in_filetype; + } + + return off; +} + + +int ADIOI_OCEANFS_StridedViewIO(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int io_flag, int *error_code) +{ + ADIOI_Flatlist_node *flat_buf = NULL; + ADIO_Offset off, file_offset, begin_off, start_offset; + MPI_Count wr_len, sentlen, cur_sendlen; + MPI_Count filetype_size, etype_size, buftype_size; + int buftype_is_contig, filetype_is_contig; + int ret, iovcnt, n_buftype; + struct iovec *iov = NULL; + ADIO_Offset offset_inbuftypes; + int block_idx, buftype_idx; + + *error_code = MPI_SUCCESS; /* changed below if error */ + MPI_Aint buftype_extent; + static char myname[] = "ADIOI_OCEANFS_StridedViewIO"; + + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + MPI_Type_size_x(fd->filetype, &filetype_size); + if (!filetype_size) { +#ifdef HAVE_STATUS_SET_BYTES + MPIR_Status_set_bytes(status, datatype, 0); +#endif + *error_code = MPI_SUCCESS; + return -1; + } + + MPI_Type_size_x(datatype, &buftype_size); + MPI_Type_extent(datatype, &buftype_extent); + etype_size = fd->etype_size; + ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(MPI_Count)buftype_size * (ADIO_Offset)count)); + wr_len = buftype_size * count; + start_offset = offset * etype_size; + file_offset = 0; + sentlen = 0; + off = 0; + block_idx = 0; + offset_inbuftypes = 0; + buftype_idx = 0; + + if (buftype_is_contig) { + iov = (struct iovec *)ADIOI_Malloc(sizeof(struct iovec)); + } else { + flat_buf = ADIOI_Flatten_and_find(datatype); + n_buftype = ((wr_len < g_max_size) ? wr_len : g_max_size) / buftype_size + 1; + iov = (struct iovec *)ADIOI_Malloc(sizeof(struct iovec) * flat_buf->count * n_buftype); + offset_inbuftypes = flat_buf->indices[0]; + } + + if (filetype_is_contig) { + off = ((file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : (fd->disp + (ADIO_Offset)etype_size * offset)); + file_offset = off - fd->disp; + } else { + file_offset = 0; + file_offset = ad_viewio_update_offset_in_file(fd, 0, start_offset); + } + + ret = 0; + begin_off = file_offset; + + // begin calculate the iovec (poll) + if (io_flag == OCEANFS_READ_STRIDED) { + if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) { + ADIOI_WRITE_LOCK(fd, begin_off + fd->disp, SEEK_SET, wr_len + 1); + } + } else if (io_flag == OCEANFS_WRITE_STRIDED) { + if ((fd->atomicity)) { + ADIOI_WRITE_LOCK(fd, begin_off + fd->disp, SEEK_SET, wr_len + 1); + } + } + while (sentlen < wr_len) { + // OCEANFS_CALCULATE_IOVEC + cur_sendlen = ((g_max_size) < (wr_len - sentlen)) ? 
g_max_size : (wr_len - sentlen); + iovcnt = ADIOI_OCEANFS_fill_iovec(fd, iov, &offset_inbuftypes, buf, datatype, + flat_buf, &buftype_idx, &block_idx, cur_sendlen, sentlen); + switch (io_flag) { + case OCEANFS_READ_COLL: + case OCEANFS_READ_STRIDED: + ret = mpi_fs_view_read(fd->fd_sys, iovcnt, iov, fd->disp + file_offset); + break; + default: + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, + "Unknown flag", 0); + goto exit; + } + + if (ret < 0) { + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", + "**io %s", strerror(errno)); + fd->fp_sys_posn = -1; + return -1; + } + file_offset = ad_viewio_update_offset_in_file(fd, ret, sentlen + start_offset); + if (ret == 0) { + break; + } + sentlen += ret; + } + if (io_flag == OCEANFS_READ_STRIDED) { + if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS)) { + ADIOI_UNLOCK(fd, begin_off + fd->disp, SEEK_SET, wr_len + 1); + } + } else if (io_flag == OCEANFS_WRITE_STRIDED) { + if ((fd->atomicity)) { + ADIOI_UNLOCK(fd, begin_off + fd->disp, SEEK_SET, wr_len + 1); + } + } +#ifdef HAVE_STATUS_SET_BYTES + /* what if we only read half a datatype? */ + /* bytes_xfered could be larger than int */ + if (ret != -1) { + MPIR_Status_set_bytes(status, datatype, sentlen); + } +#endif + if (file_ptr_type == ADIO_INDIVIDUAL) { + fd->fp_ind = fd->disp + file_offset; + } + fd->fp_sys_posn = fd->disp + file_offset; + + *error_code = MPI_SUCCESS; +exit: + /* group lock, flush data */ + MPI_Barrier(fd->comm); + ADIO_Flush(fd, error_code); + + OCEANFS_Delete_flattened(datatype); + ADIOI_Free(iov); + return sentlen; +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_wrcoll.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_wrcoll.c new file mode 100644 index 00000000000..0962e012e1d --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/ad_oceanfs_wrcoll.c @@ -0,0 +1,1294 @@ +#include +#include +#include +#include +#include +#include "adio.h" +#include "adio_extern.h" +#include "ad_oceanfs.h" +#include "ad_oceanfs_aggrs.h" +#include "ad_oceanfs_common.h" +#include "ad_oceanfs_pub.h" +#include "ad_oceanfs_group_tuning.h" + +#ifdef AGGREGATION_PROFILE +#include "mpe.h" +#endif +#ifdef PROFILE +#include "mpe.h" +#endif + +/* prototypes of functions used for collective writes only. 
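The loop above in ad_oceanfs_viewio.c keeps issuing mpi_fs_view_read calls of at most g_max_size bytes, rebuilding the iovec for each slice and advancing the file offset until the whole request has been transferred or a short transfer ends it. A minimal sketch of that chunking pattern over a plain POSIX descriptor (pread stands in for mpi_fs_view_read here, purely for illustration):

    #include <sys/types.h>
    #include <unistd.h>

    /* Read `total` bytes starting at `offset`, issuing at most `max_chunk` bytes per call
     * and stopping early on a short read.  Returns bytes read, or -1 on error. */
    static ssize_t read_in_chunks(int fd, char *buf, size_t total, off_t offset, size_t max_chunk)
    {
        size_t done = 0;
        while (done < total) {
            size_t want = (total - done < max_chunk) ? (total - done) : max_chunk;
            ssize_t got = pread(fd, buf + done, want, offset + (off_t)done);
            if (got < 0)
                return -1;      /* I/O error */
            if (got == 0)
                break;          /* nothing left to read */
            done += (size_t)got;
        }
        return (ssize_t)done;
    }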
*/ +static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype datatype, int nprocs, int myrank, + ADIOI_Access *others_req, ADIO_Offset *offset_list, ADIO_Offset *len_list, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, OCEANFS_Int *buf_idx, + int *error_code); +static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf, ADIOI_Flatlist_node *flat_buf, + ADIO_Offset *offset_list, ADIO_Offset *len_list, int *send_size, int *recv_size, ADIO_Offset off, int size, + int *count, int *start_pos, int *partial_recv, int *sent_to_proc, int nprocs, int myrank, int buftype_is_contig, + int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, + ADIOI_Access *others_req, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int *hole, int iter, + MPI_Aint buftype_extent, OCEANFS_Int *buf_idx, int *error_code); +static void ADIOI_W_Exchange_data_alltoallv(ADIO_File fd, const void *buf, char *write_buf, /* 1 */ + ADIOI_Flatlist_node *flat_buf, ADIO_Offset *offset_list, ADIO_Offset *len_list, int *send_size, int *recv_size, + ADIO_Offset off, int size, /* 2 */ + int *count, int *start_pos, int *partial_recv, int *sent_to_proc, int nprocs, int myrank, int buftype_is_contig, + int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, + ADIOI_Access *others_req, int *send_buf_idx, int *curr_to_proc, /* 3 */ + int *done_to_proc, int *hole, /* 4 */ + int iter, MPI_Aint buftype_extent, OCEANFS_Int *buf_idx, int *error_code); +static void ADIOI_Fill_send_buffer(ADIO_File fd, const void *buf, ADIOI_Flatlist_node *flat_buf, char **send_buf, + ADIO_Offset *offset_list, ADIO_Offset *len_list, int *send_size, MPI_Request *requests, int *sent_to_proc, + int nprocs, int myrank, int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int iter, + MPI_Aint buftype_extent, int bool_send); +static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, ADIO_Offset *srt_off, int *srt_len, int *start_pos, + int nprocs, int nprocs_recv, int total_elements); + +void ADIOI_OCEANFS_WriteStridedColl(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int *error_code) +{ + /* Uses a generalized version of the extended two-phase method described + in "An Extended Two-Phase Method for Accessing Sections of + Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, + Scientific Programming, (5)4:301--317, Winter 1996. */ + + ADIOI_Access *my_req = NULL; + /* array of nprocs access structures, one for each other process in + whose file domain this process's request lies */ + + ADIOI_Access *others_req = NULL; + /* array of nprocs access structures, one for each other process + whose request lies in this process's file domain. 
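A hedged sketch of how the two arrays relate: my_req records how much of this rank's request falls into each peer's file domain, and others_req is effectively its transpose across ranks, so the per-peer piece counts can be learned with a single MPI_Alltoall before the offset/length lists themselves are exchanged. The hypothetical helper below shows only the count exchange, not the full Calc_others_req logic:

    #include <mpi.h>

    /* pieces_to[i]   : number of my contiguous pieces that fall in rank i's file domain
     * pieces_from[i] : number of rank i's pieces that fall in my file domain (filled in) */
    static void exchange_piece_counts(MPI_Comm comm, const int *pieces_to, int *pieces_from)
    {
        MPI_Alltoall((void *)pieces_to, 1, MPI_INT, pieces_from, 1, MPI_INT, comm);
    }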
*/ + + int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank; + int buftype_is_contig; + int contig_access_count = 0; + int interleave_count = 0; + int *count_my_req_per_proc = NULL; + int count_my_req_procs, count_others_req_procs; + ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off; + ADIO_Offset *offset_list = NULL; + ADIO_Offset *st_offsets = NULL; + ADIO_Offset *fd_start = NULL; + ADIO_Offset *fd_end = NULL; + ADIO_Offset *end_offsets = NULL; + ADIO_Offset *count_sizes = NULL; + + OCEANFS_Int *buf_idx = NULL; + ADIO_Offset *len_list = NULL; + double t = ad_oceanfs_timing_get_time(); +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_13, 0, "start computation"); +#endif + + if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) { + ADIOI_IOStridedColl(fd, (char *)buf, count, ADIOI_WRITE, datatype, file_ptr_type, offset, status, error_code); + + /* group lock, flush data */ + MPI_Barrier(fd->comm); + ADIO_Flush(fd, error_code); + return; + } + + ADIOI_OCEANFS_fs *ocean_fs = (ADIOI_OCEANFS_fs *)fd->fs_ptr; + ad_oceanfs_group_report(fd, ocean_fs->context->group_id); + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + /* the number of processes that actually perform I/O, nprocs_for_coll, + * is stored in the hints off the ADIO_File structure + */ + nprocs_for_coll = fd->hints->cb_nodes; + orig_fp = fd->fp_ind; + + /* only check for interleaving if cb_write isn't disabled */ + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { + /* For this process's request, calculate the list of offsets and + lengths in the file and determine the start and end offsets. */ + + /* Note: end_offset points to the last byte-offset that will be accessed. + e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ + + ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, &offset_list, &len_list, &start_offset, + &end_offset, &contig_access_count); + + /* each process communicates its start and end offsets to other + processes. The result is an array each of start and end offsets stored + in order of process rank. */ + + st_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); + end_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); + + ADIO_Offset my_count_size = 0; + /* One-sided aggregation needs the amount of data per rank as well because + * the difference in starting and ending offsets for 1 byte is 0 the same + * as 0 bytes so it cannot be distiguished. 
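As the comment above notes, a 1-byte access and a 0-byte access both report start_offset == end_offset, so the one-sided path must also gather each rank's byte count. A minimal stand-alone sketch of that extra gather (hypothetical helper, not the count_sizes code that follows):

    #include <mpi.h>

    /* Collect every rank's local byte count so empty and 1-byte accesses can be told apart. */
    static void gather_byte_counts(MPI_Comm comm, long long my_bytes, long long *all_bytes)
    {
        MPI_Allgather(&my_bytes, 1, MPI_LONG_LONG, all_bytes, 1, MPI_LONG_LONG, comm);
    }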
+ */ + if ((get_oceanfsmpio_write_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_write_aggmethod() == OCEANFS_2)) { + count_sizes = (ADIO_Offset *)ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); + MPI_Count buftype_size; + MPI_Type_size_x(datatype, &buftype_size); + my_count_size = (ADIO_Offset)count * (ADIO_Offset)buftype_size; + } + if (get_oceanfsmpio_tunegather()) { + if ((get_oceanfsmpio_write_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_write_aggmethod() == OCEANFS_2)) { + AdPubOffsetAggmethod1(fd->comm, nprocs, myrank, + start_offset, end_offset, my_count_size, st_offsets, end_offsets, count_sizes); + } else { + AdPubOffsetAggmethod0(fd->comm, nprocs, myrank, + start_offset, end_offset, st_offsets, end_offsets); + } + } else { + MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ADIO_OFFSET, fd->comm); + MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ADIO_OFFSET, fd->comm); + if ((get_oceanfsmpio_write_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_write_aggmethod() == OCEANFS_2)) { + MPI_Allgather(&my_count_size, 1, ADIO_OFFSET, count_sizes, 1, ADIO_OFFSET, fd->comm); + } + } + + /* are the accesses of different processes interleaved? */ + /* This is a rudimentary check for interleaving, but should suffice + for the moment. */ + for (i = 1; i < nprocs; i++) { + if ((st_offsets[i] < end_offsets[i - 1]) && (st_offsets[i] <= end_offsets[i])) { + interleave_count++; + } + } + } + + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); + + if (fd->hints->cb_write == ADIOI_HINT_DISABLE || (!interleave_count && (fd->hints->cb_write == ADIOI_HINT_AUTO))) { + /* use independent accesses */ + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { + FreeAdioiFour(offset_list, len_list, st_offsets, end_offsets); + } + + fd->fp_ind = orig_fp; + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + + if (buftype_is_contig && filetype_is_contig) { + if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { + off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset; + ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, off, status, error_code); + } else { + ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code); + } + } else { + ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); + } + + return; + } + + /* Divide the I/O workload among "nprocs_for_coll" processes. This is + done by (logically) dividing the file into file domains (FDs); each + process may directly access only its own file domain. */ + + int currentValidDataIndex = 0; + if ((get_oceanfsmpio_write_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_write_aggmethod() == OCEANFS_2)) { + /* Take out the 0-data offsets by shifting the indexes with data to the front + * and keeping track of the valid data index for use as the length. 
+ */ + for (i = 0; i < nprocs; i++) { + if (count_sizes[i] <= 0) { + continue; + } + st_offsets[currentValidDataIndex] = st_offsets[i]; + end_offsets[currentValidDataIndex] = end_offsets[i]; + currentValidDataIndex++; + } + } + + if (get_oceanfsmpio_tuneblocking()) { + if ((get_oceanfsmpio_write_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_write_aggmethod() == OCEANFS_2)) { + ADIOI_OCEANFS_Calc_file_domains(fd, st_offsets, end_offsets, currentValidDataIndex, nprocs_for_coll, + &min_st_offset, &fd_start, &fd_end, &fd_size, fd->fs_ptr); + } else { + ADIOI_OCEANFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs, nprocs_for_coll, + &min_st_offset, &fd_start, &fd_end, &fd_size, fd->fs_ptr); + } + } else { + if ((get_oceanfsmpio_write_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_write_aggmethod() == OCEANFS_2)) { + ADIOI_Calc_file_domains(st_offsets, end_offsets, currentValidDataIndex, nprocs_for_coll, &min_st_offset, + &fd_start, &fd_end, fd->hints->min_fdomain_size, &fd_size, fd->hints->striping_unit); + } else { + ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, &min_st_offset, &fd_start, + &fd_end, fd->hints->min_fdomain_size, &fd_size, fd->hints->striping_unit); + } + } + + if ((get_oceanfsmpio_write_aggmethod() == OCEANFS_1) || (get_oceanfsmpio_write_aggmethod() == OCEANFS_2)) { + /* If the user has specified to use a one-sided aggregation method then do that at + * this point instead of the two-phase I/O. + */ + int holeFound = 0; + + ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, + st_offsets, end_offsets, currentValidDataIndex, fd_start, fd_end, &holeFound); + int anyHolesFound = 0; + if (!get_oceanfsmpio_onesided_no_rmw()) { + MPI_Allreduce(&holeFound, &anyHolesFound, 1, MPI_INT, MPI_MAX, fd->comm); + } + if (anyHolesFound == 0) { + FreeAdioiThree(offset_list, len_list, st_offsets); + FreeAdioiFour(end_offsets, fd_start, fd_end, count_sizes); + goto fn_exit; + } else { + /* Holes are found in the data and the user has not set + * oceanfsmpio_onesided_no_rmw --- set oceanfsmpio_onesided_always_rmw to 1 + * and re-call ADIOI_OneSidedWriteAggregation and if the user has + * oceanfsmpio_onesided_inform_rmw set then inform him of this condition + * and behavior. + */ + + if (get_oceanfsmpio_onesided_inform_rmw() && myrank == 0) { + FPRINTF(stderr, "Information: Holes found during one-sided " + "write aggregation algorithm --- re-running one-sided " + "write aggregation with OCEANFSMPIO_ONESIDED_ALWAYS_RMW set to 1.\n"); + } + set_oceanfsmpio_onesided_always_rmw(1); + int prev_oceanfsmpio_onesided_no_rmw = get_oceanfsmpio_onesided_no_rmw(); + set_oceanfsmpio_onesided_no_rmw(1); + ADIOI_OneSidedWriteAggregation(fd, offset_list, len_list, contig_access_count, buf, datatype, error_code, + st_offsets, end_offsets, currentValidDataIndex, fd_start, fd_end, &holeFound); + set_oceanfsmpio_onesided_no_rmw(prev_oceanfsmpio_onesided_no_rmw); + FreeAdioiThree(offset_list, len_list, st_offsets); + FreeAdioiFour(end_offsets, fd_start, fd_end, count_sizes); + goto fn_exit; + } + } + if (get_oceanfsmpio_p2pcontig() == 1) { + /* For some simple yet common(?) workloads, full-on two-phase I/O is overkill. We can establish sub-groups of + * processes and their aggregator, and then these sub-groups will carry out a simplified two-phase over that + * sub-group. 
+ * + * First verify that the filetype is contig and the offsets are + * increasing in rank order */ + int inOrderAndNoGaps = 1; + for (i = 0; i < (nprocs - 1); i++) { + if (end_offsets[i] != (st_offsets[i + 1] - 1)) { + inOrderAndNoGaps = 0; + } + } + if (inOrderAndNoGaps && buftype_is_contig) { + /* if these conditions exist then execute the P2PContig code else + * execute the original code */ + ADIOI_P2PContigWriteAggregation(fd, buf, error_code, st_offsets, end_offsets, fd_start, fd_end); + /* NOTE: we are skipping the rest of two-phase in this path */ + + FreeAdioiThree(offset_list, len_list, st_offsets); + FreeAdioiThree(end_offsets, fd_start, fd_end); + goto fn_exit; + } + } + + /* calculate what portions of the access requests of this process are + located in what file domains */ + + if (get_oceanfsmpio_tuneblocking()) { + ADIOI_OCEANFS_Calc_my_req(fd, offset_list, len_list, contig_access_count, min_st_offset, fd_start, + fd_end, fd_size, nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx); + } else { + ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count, min_st_offset, fd_start, + fd_end, fd_size, nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx); + } + /* based on everyone's my_req, calculate what requests of other + processes lie in this process's file domain. + count_others_req_procs = number of processes whose requests lie in + this process's file domain (including this process itself) + count_others_req_per_proc[i] indicates how many separate contiguous + requests of proc. i lie in this process's file domain. */ + + if (get_oceanfsmpio_tuneblocking()) { + ADIOI_OCEANFS_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, my_req, nprocs, myrank, + &count_others_req_procs, &others_req); + } else { + ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, my_req, nprocs, myrank, + &count_others_req_procs, &others_req); + } + ADIOI_Free(count_my_req_per_proc); + FreeAccess(my_req, nprocs); + + /* exchange data and write in sizes of no more than coll_bufsize. */ + ADIOI_Exch_and_write(fd, buf, datatype, nprocs, myrank, others_req, offset_list, len_list, contig_access_count, + min_st_offset, fd_size, fd_start, fd_end, buf_idx, error_code); + + /* free all memory allocated for collective I/O */ + if (!buftype_is_contig) { + OCEANFS_Delete_flattened(datatype); + } + + FreeAccessAll(others_req, nprocs); + + FreeAdioiThree(buf_idx, offset_list, len_list); + FreeAdioiFour(st_offsets, end_offsets, fd_start, fd_end); + +fn_exit: +#ifdef HAVE_STATUS_SET_BYTES + if (status) { + MPI_Count bufsize, size; + /* Don't set status if it isn't needed */ + MPI_Type_size_x(datatype, &size); + bufsize = size * count; + MPIR_Status_set_bytes(status, datatype, bufsize); + } + /* This is a temporary way of filling in status. The right way is to + keep track of how much data was actually written during collective I/O. */ +#endif + + fd->fp_sys_posn = -1; /* set it to null. */ + /* group lock, flush data */ + MPI_Barrier(fd->comm); + ADIO_Flush(fd, error_code); + +#ifdef AGGREGATION_PROFILE + MPE_Log_event(MPE_Log_ID_5013, 0, NULL); +#endif + ad_oceanfs_timing_report(fd, OCEANFSMPIO_CIO_W_STRIDED_COLL, t); +} + +/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error + * code is created and returned in error_code. 
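ADIOI_Exch_and_write, defined next, works through each aggregator's file domain in slices of cb_buffer_size and loops for the maximum number of slices over all ranks. A sketch of how that round count can be derived, using a hypothetical helper rather than the driver's own SetNtimesLocal:

    #include <mpi.h>

    /* Rounds needed to cover [st_loc, end_loc] in slices of coll_bufsize, maximised over
     * the communicator so every rank takes part in the same number of exchange phases. */
    static int calc_rounds(MPI_Comm comm, MPI_Offset st_loc, MPI_Offset end_loc, int coll_bufsize)
    {
        int ntimes = 0, max_ntimes = 0;
        if (end_loc >= st_loc)
            ntimes = (int)((end_loc - st_loc + coll_bufsize) / coll_bufsize);
        MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, comm);
        return max_ntimes;
    }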
+ */ +static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype datatype, int nprocs, int myrank, + ADIOI_Access *others_req, ADIO_Offset *offset_list, ADIO_Offset *len_list, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, OCEANFS_Int *buf_idx, + int *error_code) +{ + /* Send data to appropriate processes and write in sizes of no more + than coll_bufsize. + The idea is to reduce the amount of extra memory required for + collective I/O. If all data were written all at once, which is much + easier, it would require temp space more than the size of user_buf, + which is often unacceptable. For example, to write a distributed + array to a file, where each local array is 8Mbytes, requiring + at least another 8Mbytes of temp space is unacceptable. */ + + /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */ + ADIO_Offset size; + ADIO_Offset st_loc = -1; + ADIO_Offset end_loc = -1; + int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig; + ADIO_Offset off, done, req_off; + char *write_buf = NULL; + char *write_buf2 = NULL; + int *curr_offlen_ptr = NULL; + int *count = NULL; + int *send_size = NULL; + int req_len; + int *recv_size = NULL; + int *partial_recv = NULL; + int *sent_to_proc = NULL; + int *start_pos = NULL; + int flag; + int *send_buf_idx = NULL; + int *curr_to_proc = NULL; + int *done_to_proc = NULL; + MPI_Status status; + ADIOI_Flatlist_node *flat_buf = NULL; + MPI_Aint buftype_extent; + int info_flag, coll_bufsize; + char *value = NULL; + static char myname[] = "ADIOI_EXCH_AND_WRITE"; + pthread_t io_thread; + void *thread_ret = NULL; + ADIOI_IO_ThreadFuncData io_thread_args; + + size = 0; + *error_code = MPI_SUCCESS; /* changed below if error */ + /* only I/O errors are currently reported */ + + /* calculate the number of writes of size coll_bufsize + to be done by each process and the max among all processes. + That gives the no. of communication phases as well. */ + + value = (char *)ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); + coll_bufsize = atoi(value); + ADIOI_Free(value); + + if (get_oceanfsmpio_pthreadio() == 1) { + /* ROMIO will spawn an additional thread. both threads use separate + * halves of the collective buffer */ + coll_bufsize = coll_bufsize / MUL_2; + } + + CalcLoc(others_req, nprocs, &st_loc, &end_loc); + SetNtimesLocal(&ntimes, st_loc, end_loc, coll_bufsize); + + MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); + + write_buf = fd->io_buf; + if (get_oceanfsmpio_pthreadio() == 1) { + write_buf2 = fd->io_buf + coll_bufsize; + } + + curr_offlen_ptr = (int *)ADIOI_Calloc(nprocs, sizeof(int)); + /* its use is explained below. calloc initializes to 0. */ + + count = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + /* to store count of how many off-len pairs per proc are satisfied + in an iteration. */ + + partial_recv = (int *)ADIOI_Calloc(nprocs, sizeof(int)); + /* if only a portion of the last off-len pair is recd. from a process + in a particular iteration, the length recd. is stored here. + calloc initializes to 0. */ + + send_size = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + /* total size of data to be sent to each proc. in an iteration. + Of size nprocs so that I can use MPI_Alltoall later. */ + + recv_size = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + /* total size of data to be recd. from each proc. in an iteration. 
*/ + + sent_to_proc = (int *)ADIOI_Calloc(nprocs, sizeof(int)); + /* amount of data sent to each proc so far. Used in + ADIOI_Fill_send_buffer. initialized to 0 here. */ + + send_buf_idx = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + curr_to_proc = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + done_to_proc = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + /* Above three are used in ADIOI_Fill_send_buffer */ + + start_pos = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + /* used to store the starting value of curr_offlen_ptr[i] in + this iteration */ + + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); + if (!buftype_is_contig) { + flat_buf = ADIOI_Flatten_and_find(datatype); + } + MPI_Type_extent(datatype, &buftype_extent); + + /* I need to check if there are any outstanding nonblocking writes to + the file, which could potentially interfere with the writes taking + place in this collective write call. Since this is not likely to be + common, let me do the simplest thing possible here: Each process + completes all pending nonblocking operations before completing. */ + + done = 0; + off = st_loc; + + if (get_oceanfsmpio_pthreadio() == 1) { + io_thread = pthread_self(); + } + +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_14, 0, "end computation"); +#endif + + for (m = 0; m < ntimes; m++) { + /* go through all others_req and check which will be satisfied + by the current write */ + + /* Note that MPI guarantees that displacements in filetypes are in + monotonically nondecreasing order and that, for writes, the + filetypes cannot specify overlapping regions in the file. This + simplifies implementation a bit compared to reads. */ + + /* off = start offset in the file for the data to be written in + this iteration + size = size of data written (bytes) corresponding to off + req_off = off in file for a particular contiguous request + minus what was satisfied in previous iteration + req_size = size corresponding to req_off */ + + /* first calculate what should be communicated */ + +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_13, 0, "start computation"); +#endif + for (i = 0; i < nprocs; i++) { + count[i] = 0; + recv_size[i] = 0; + } + + size = ADIOI_MIN((unsigned)coll_bufsize, end_loc - st_loc + 1 - done); + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + start_pos[i] = curr_offlen_ptr[i]; + for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + if (partial_recv[i]) { + /* this request may have been partially + satisfied in the previous iteration. 
*/ + req_off = others_req[i].offsets[j] + partial_recv[i]; + req_len = others_req[i].lens[j] - partial_recv[i]; + partial_recv[i] = 0; + /* modify the off-len pair to reflect this change */ + others_req[i].offsets[j] = req_off; + others_req[i].lens[j] = req_len; + } else { + req_off = others_req[i].offsets[j]; + req_len = others_req[i].lens[j]; + } + if (req_off < off + size) { + count[i]++; + ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)write_buf) + req_off - off) == + (ADIO_Offset)(MPIU_Upint)(write_buf + req_off - off)); + MPI_Address(write_buf + req_off - off, &(others_req[i].mem_ptrs[j])); + ADIOI_Assert((off + size - req_off) == (int)(off + size - req_off)); + recv_size[i] += (int)(ADIOI_MIN(off + size - req_off, (unsigned)req_len)); + + if (off + size - req_off < (unsigned)req_len) { + partial_recv[i] = (int)(off + size - req_off); + + /* --BEGIN ERROR HANDLING-- */ + if ((j + 1 < others_req[i].count) && (others_req[i].offsets[j + 1] < off + size)) { + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, + MPI_ERR_ARG, + "Filetype specifies overlapping write regions (which is illegal according to the " + "MPI-2 specification)", + 0); + /* allow to continue since additional + * communication might have to occur + */ + } + /* --END ERROR HANDLING-- */ + break; + } + } else { + break; + } + } + curr_offlen_ptr[i] = j; + } + } + +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_14, 0, "end computation"); + MPE_Log_event(MPE_Log_ID_7, 0, "start communication"); +#endif + if (get_oceanfsmpio_comm() == 1) { + ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, len_list, send_size, recv_size, off, size, + count, start_pos, partial_recv, sent_to_proc, nprocs, myrank, buftype_is_contig, contig_access_count, + min_st_offset, fd_size, fd_start, fd_end, others_req, send_buf_idx, curr_to_proc, done_to_proc, &hole, + m, buftype_extent, buf_idx, error_code); + } else if (get_oceanfsmpio_comm() == 0) { + ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list, len_list, send_size, recv_size, + off, size, count, start_pos, partial_recv, sent_to_proc, nprocs, myrank, buftype_is_contig, + contig_access_count, min_st_offset, fd_size, fd_start, fd_end, others_req, send_buf_idx, curr_to_proc, + done_to_proc, &hole, m, buftype_extent, buf_idx, error_code); + } + if (*error_code != MPI_SUCCESS) { + return; + } +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_8, 0, "end communication"); +#endif + + flag = 0; + for (i = 0; i < nprocs; i++) { + if (count[i]) { + flag = 1; + break; + } + } + + if (flag) { + char round[50]; + sprintf(round, "two-phase-round=%d", m); + setenv("LIBIOLOG_EXTRA_INFO", round, 1); + ADIOI_Assert(size == (int)size); + if (get_oceanfsmpio_pthreadio() == 1) { + /* there is no such thing as "invalid pthread identifier", so + * we'll use pthread_self() instead. Before we do I/O we want + * to complete I/O from any previous iteration -- but only a + * previous iteration that had I/O work to do (i.e. 
set 'flag') + */ + if (!pthread_equal(io_thread, pthread_self())) { + pthread_join(io_thread, &thread_ret); + *error_code = *(int *)thread_ret; + if (*error_code != MPI_SUCCESS) + return; + io_thread = pthread_self(); + } + io_thread_args.fd = fd; + /* do a little pointer shuffling: background I/O works from one + * buffer while two-phase machinery fills up another */ + io_thread_args.buf = write_buf; + ADIOI_SWAP(write_buf, write_buf2, char *); + io_thread_args.io_kind = ADIOI_WRITE; + io_thread_args.size = size; + io_thread_args.offset = off; + io_thread_args.status = &status; + io_thread_args.error_code = *error_code; + if ((pthread_create(&io_thread, NULL, ADIOI_IO_Thread_Func, &(io_thread_args))) != 0) { + io_thread = pthread_self(); + } + } else { + ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, error_code); + if (*error_code != MPI_SUCCESS) { + return; + } + } + } + + off += size; + done += size; + } + if (get_oceanfsmpio_pthreadio() == 1 && !pthread_equal(io_thread, pthread_self())) { + pthread_join(io_thread, &thread_ret); + *error_code = *(int *)thread_ret; + } + + for (i = 0; i < nprocs; i++) { + count[i] = 0; + recv_size[i] = 0; + } +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_7, 0, "start communication"); +#endif + for (m = ntimes; m < max_ntimes; m++) { + /* nothing to recv, but check for send. */ + if (get_oceanfsmpio_comm() == 1) { + ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, len_list, send_size, recv_size, off, size, + count, start_pos, partial_recv, sent_to_proc, nprocs, myrank, buftype_is_contig, contig_access_count, + min_st_offset, fd_size, fd_start, fd_end, others_req, send_buf_idx, curr_to_proc, done_to_proc, &hole, + m, buftype_extent, buf_idx, error_code); + } else if (get_oceanfsmpio_comm() == 0) { + ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list, len_list, send_size, recv_size, + off, size, count, start_pos, partial_recv, sent_to_proc, nprocs, myrank, buftype_is_contig, + contig_access_count, min_st_offset, fd_size, fd_start, fd_end, others_req, send_buf_idx, curr_to_proc, + done_to_proc, &hole, m, buftype_extent, buf_idx, error_code); + } + } + if (*error_code != MPI_SUCCESS) { + goto EXIT; + } +#ifdef PROFILE + MPE_Log_event(MPE_Log_ID_8, 0, "end communication"); +#endif + + unsetenv("LIBIOLOG_EXTRA_INFO"); + +EXIT: + FreeAdioiFive(curr_offlen_ptr, count, partial_recv, send_size, recv_size); + FreeAdioiFive(sent_to_proc, start_pos, send_buf_idx, curr_to_proc, done_to_proc); +} + +/* Sets error_code to MPI_SUCCESS if successful, or creates an error code + * in the case of error. 
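Inside ADIOI_W_Exchange_data, each receiver describes the scattered landing spots in its collective buffer with an hindexed datatype whose displacements are absolute addresses, and then receives into MPI_BOTTOM. A simplified sketch of that receive-side pattern (hypothetical helper; the real code builds one datatype per sending rank and handles partial blocks):

    #include <mpi.h>

    static void post_recv_into_collective_buf(int src, int tag, MPI_Comm comm, int nblocks,
                                              const int *lens, const MPI_Aint *addrs,
                                              MPI_Request *req)
    {
        MPI_Datatype rtype;
        MPI_Type_create_hindexed(nblocks, (int *)lens, (MPI_Aint *)addrs, MPI_BYTE, &rtype);
        MPI_Type_commit(&rtype);
        MPI_Irecv(MPI_BOTTOM, 1, rtype, src, tag, comm, req);
        MPI_Type_free(&rtype);  /* legal: the datatype stays usable until the request completes */
    }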
+ */ +static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf, ADIOI_Flatlist_node *flat_buf, + ADIO_Offset *offset_list, ADIO_Offset *len_list, int *send_size, int *recv_size, ADIO_Offset off, int size, + int *count, int *start_pos, int *partial_recv, int *sent_to_proc, int nprocs, int myrank, int buftype_is_contig, + int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, + ADIOI_Access *others_req, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int *hole, int iter, + MPI_Aint buftype_extent, OCEANFS_Int *buf_idx, int *error_code) +{ + int i, j, k, sum; + int *tmp_len = NULL; + int nprocs_recv, nprocs_send, err; + char **send_buf = NULL; + MPI_Request *requests = NULL; + MPI_Request *send_req = NULL; + MPI_Datatype *recv_types = NULL; + MPI_Status *statuses = NULL; + MPI_Status status; + int *srt_len = NULL; + ADIO_Offset *srt_off = NULL; + static char myname[] = "ADIOI_W_EXCHANGE_DATA"; + + /* exchange recv_size info so that each process knows how much to + send to whom. */ + + MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm); + + /* create derived datatypes for recv */ + nprocs_recv = CalcCount(recv_size, nprocs); + + /* +1 to avoid a 0-size malloc */ + recv_types = (MPI_Datatype *)ADIOI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype)); + + tmp_len = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { + /* take care if the last off-len pair is a partial recv */ + if (partial_recv[i]) { + k = start_pos[i] + count[i] - 1; + tmp_len[i] = others_req[i].lens[k]; + others_req[i].lens[k] = partial_recv[i]; + } + ADIOI_Type_create_hindexed_x(count[i], &(others_req[i].lens[start_pos[i]]), + &(others_req[i].mem_ptrs[start_pos[i]]), MPI_BYTE, recv_types + j); + /* absolute displacements; use MPI_BOTTOM in recv */ + MPI_Type_commit(recv_types + j); + j++; + } + } + + /* To avoid a read-modify-write, check if there are holes in the + data to be written. For this, merge the (sorted) offset lists + others_req using a heap-merge. */ + + sum = 0; + for (i = 0; i < nprocs; i++) { + sum += count[i]; + } + srt_off = (ADIO_Offset *)ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset)); + srt_len = (int *)ADIOI_Malloc((sum + 1) * sizeof(int)); + + ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, nprocs, nprocs_recv, sum); + + /* for partial recvs, restore original lengths */ + for (i = 0; i < nprocs; i++) { + if (partial_recv[i]) { + k = start_pos[i] + count[i] - 1; + others_req[i].lens[k] = tmp_len[i]; + } + } + ADIOI_Free(tmp_len); + + /* check if there are any holes. If yes, must do read-modify-write. + * holes can be in three places. 'middle' is what you'd expect: the + * processes are operating on noncontigous data. But holes can also show + * up at the beginning or end of the file domain (see John Bent ROMIO REQ + * #835). Missing these holes would result in us writing more data than + * recieved by everyone else. 
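The hole test that follows coalesces the sorted offset/length pairs and flags a hole whenever the coverage does not start at off or does not add up to size. An equivalent stand-alone sketch of that check (hypothetical helper with plain integer types):

    /* Returns 1 if the sorted pieces (srt_off, srt_len) leave any byte of [off, off+size) uncovered. */
    static int range_has_hole(const long long *srt_off, const long long *srt_len, int n,
                              long long off, long long size)
    {
        long long covered_end;
        int i;
        if (n == 0 || srt_off[0] != off)
            return 1;                           /* hole at the front */
        covered_end = srt_off[0] + srt_len[0];
        for (i = 1; i < n; i++) {
            if (srt_off[i] > covered_end)
                return 1;                       /* gap between two pieces */
            if (srt_off[i] + srt_len[i] > covered_end)
                covered_end = srt_off[i] + srt_len[i];
        }
        return (covered_end - off) != size;     /* hole (or mismatch) at the end */
    }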
*/ + *hole = 0; + if (off != srt_off[0]) { /* hole at the front */ + *hole = 1; + } else { /* coalesce the sorted offset-length pairs */ + for (i = 1; i < sum; i++) { + if (srt_off[i] <= srt_off[0] + srt_len[0]) { + int new_len = srt_off[i] + srt_len[i] - srt_off[0]; + if (new_len > srt_len[0]) { + srt_len[0] = new_len; + } + } else { + break; + } + } + if (i < sum || size != srt_len[0]) { /* hole in middle or end */ + *hole = 1; + } + } + + FreeAdioiTwo(srt_off, srt_len); + + if (nprocs_recv) { + if (*hole) { + const char *stuff = "data-sieve-in-two-phase"; + setenv("LIBIOLOG_EXTRA_INFO", stuff, 1); + ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, &err); + /* --BEGIN ERROR HANDLING-- */ + if (err != MPI_SUCCESS) { + *error_code = + MPIO_Err_create_code(err, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**ioRMWrdwr", 0); + ADIOI_Free(recv_types); + return; + } + /* --END ERROR HANDLING-- */ + unsetenv("LIBIOLOG_EXTRA_INFO"); + } + } + + nprocs_send = CalcCount(send_size, nprocs); + + if (fd->atomicity) { + /* bug fix from Wei-keng Liao and Kenin Coloma */ + requests = (MPI_Request *)ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Request)); + send_req = requests; + } else { + requests = (MPI_Request *)ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request)); + /* +1 to avoid a 0-size malloc */ + + /* post receives */ + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { + MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + MUL_100 * iter, fd->comm, requests + j); + j++; + } + } + send_req = requests + nprocs_recv; + } + + /* post sends. if buftype_is_contig, data can be directly sent from + user buf at location given by buf_idx. else use send_buf. */ + +#ifdef AGGREGATION_PROFILE + MPE_Log_event(MPE_Log_ID_5032, 0, NULL); +#endif + if (buftype_is_contig) { + j = 0; + for (i = 0; i < nprocs; i++) { + if (send_size[i]) { + MPI_Isend(((char *)buf) + buf_idx[i], send_size[i], MPI_BYTE, i, myrank + i + MUL_100 * iter, fd->comm, + send_req + j); + j++; + buf_idx[i] += send_size[i]; + } + } + } else if (nprocs_send) { + /* buftype is not contig */ + send_buf = (char **)ADIOI_Malloc(nprocs * sizeof(char *)); + for (i = 0; i < nprocs; i++) { + if (send_size[i]) { + send_buf[i] = (char *)ADIOI_Malloc(send_size[i]); + } + } + ADIOI_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list, len_list, send_size, send_req, sent_to_proc, + nprocs, myrank, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, send_buf_idx, curr_to_proc, + done_to_proc, iter, buftype_extent, 1); + /* the send is done in ADIOI_Fill_send_buffer */ + } + + if (fd->atomicity) { + /* bug fix from Wei-keng Liao and Kenin Coloma */ + j = 0; + for (i = 0; i < nprocs; i++) { + MPI_Status wkl_status; + if (recv_size[i]) { + MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + MUL_100 * iter, fd->comm, &wkl_status); + j++; + } + } + } + + for (i = 0; i < nprocs_recv; i++) + MPI_Type_free(recv_types + i); + ADIOI_Free(recv_types); + + if (fd->atomicity) { + /* bug fix from Wei-keng Liao and Kenin Coloma */ + statuses = (MPI_Status *)ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Status)); + /* +1 to avoid a 0-size malloc */ + } else { + statuses = (MPI_Status *)ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Status)); + /* +1 to avoid a 0-size malloc */ + } + +#ifdef NEEDS_MPI_TEST + i = 0; + if (fd->atomicity) { + /* bug fix from Wei-keng Liao and Kenin Coloma */ + while (!i) { + MPI_Testall(nprocs_send, send_req, &i, statuses); + } + } else { + while 
(!i) { + MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses); + } + } +#else + if (fd->atomicity) { + /* bug fix from Wei-keng Liao and Kenin Coloma */ + MPI_Waitall(nprocs_send, send_req, statuses); + } else { + MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses); + } +#endif + +#ifdef AGGREGATION_PROFILE + MPE_Log_event(MPE_Log_ID_5033, 0, NULL); +#endif + ADIOI_Free(statuses); + ADIOI_Free(requests); + if (!buftype_is_contig && nprocs_send) { + for (i = 0; i < nprocs; i++) { + if (send_size[i]) { + ADIOI_Free(send_buf[i]); + } + } + ADIOI_Free(send_buf); + } +} + +#define ADIOI_BUF_COPY \ + { \ + while (size) { \ + size_in_buf = ADIOI_MIN(size, flat_buf_sz); \ + ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)buf) + user_buf_idx) == \ + (ADIO_Offset)(MPIU_Upint)((MPIU_Upint)buf + user_buf_idx)); \ + ADIOI_Assert(size_in_buf == (size_t)size_in_buf); \ + memcpy(&(send_buf[p][send_buf_idx[p]]), ((char *)buf) + user_buf_idx, size_in_buf); \ + send_buf_idx[p] += size_in_buf; \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + if (!flat_buf_sz) { \ + if (flat_buf_idx < (flat_buf->count - 1)) \ + flat_buf_idx++; \ + else { \ + flat_buf_idx = 0; \ + n_buftypes++; \ + } \ + user_buf_idx = \ + flat_buf->indices[flat_buf_idx] + (ADIO_Offset)n_buftypes * (ADIO_Offset)buftype_extent; \ + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ + } \ + size -= size_in_buf; \ + buf_incr -= size_in_buf; \ + } \ + AD_COLL_BUF_INCR \ + } + +#define ADIOI_BUF \ + do { \ + if (send_buf_idx[p] < send_size[p]) { \ + if (curr_to_proc[p] + len > done_to_proc[p]) { \ + if (done_to_proc[p] > curr_to_proc[p]) { \ + size = ADIOI_MIN(curr_to_proc[p] + len - done_to_proc[p], send_size[p] - send_buf_idx[p]); \ + buf_incr = done_to_proc[p] - curr_to_proc[p]; \ + AD_COLL_BUF_INCR \ + ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) \ + == (unsigned)(curr_to_proc[p] + len - done_to_proc[p])); \ + buf_incr = curr_to_proc[p] + len - done_to_proc[p]; \ + ADIOI_Assert((done_to_proc[p] + size) == (unsigned)(done_to_proc[p] + size)); \ + curr_to_proc[p] = done_to_proc[p] + size; \ + ADIOI_BUF_COPY \ + } else { \ + size = ADIOI_MIN(len, send_size[p] - send_buf_idx[p]); \ + buf_incr = len; \ + ADIOI_Assert((curr_to_proc[p] + size) == (unsigned)((ADIO_Offset)curr_to_proc[p] + size)); \ + curr_to_proc[p] += size; \ + ADIOI_BUF_COPY \ + } \ + if (bool_send && send_buf_idx[p] == send_size[p]) { \ + MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p, \ + myrank + p + MUL_100 * iter, fd->comm, requests + jj); \ + jj++; \ + } \ + } else { \ + ADIOI_Assert((curr_to_proc[p] + len) == (unsigned)((ADIO_Offset)curr_to_proc[p] + len)); \ + curr_to_proc[p] += (int)len; \ + buf_incr = len; \ + AD_COLL_BUF_INCR \ + } \ + } else { \ + buf_incr = len; \ + AD_COLL_BUF_INCR \ + } \ + } while (0) + +typedef struct { + ADIO_Offset *off_list; + ADIO_Offset *len_list; + int nelem; +} HeapStruct; + +static void HeapCopy(HeapStruct* dst, HeapStruct* src) +{ + dst->off_list = src->off_list; + dst->len_list = src->len_list; + dst->nelem = src->nelem; +} + +static void HeapSwap(HeapStruct* l, HeapStruct* r) +{ + HeapStruct tmp; + HeapCopy(&tmp, l); + HeapCopy(l, r); + HeapCopy(r, &tmp); +} + +static void HeapMerge(HeapStruct *a, int k, int heapsize) +{ + int l, r, smallest; + + while (1) { + l = MUL_2 * (k + 1) - 1; + r = MUL_2 * (k + 1); + + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) { + smallest = l; + } else { + smallest = k; + } + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) { + 
smallest = r; + } + + if (smallest != k) { + HeapSwap(a + k, a + smallest); + k = smallest; + } else { + break; + } + } +} + +static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, ADIO_Offset *srt_off, int *srt_len, int *start_pos, + int nprocs, int nprocs_recv, int total_elements) +{ + HeapStruct *a; + int i, j, heapsize; + + a = (HeapStruct *)ADIOI_Malloc((nprocs_recv + 1) * sizeof(HeapStruct)); + + j = 0; + for (i = 0; i < nprocs; i++) { + if (count[i]) { + a[j].off_list = &(others_req[i].offsets[start_pos[i]]); + a[j].len_list = &(others_req[i].lens[start_pos[i]]); + a[j].nelem = count[i]; + j++; + } + } + + /* build a heap out of the first element from each list, with + the smallest element of the heap at the root */ + + heapsize = nprocs_recv; + for (i = heapsize / MUL_2 - 1; i >= 0; i--) { + /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 + modified for a heap with smallest element at root. I have + removed the recursion so that there are no function calls. + Function calls are too expensive. */ + HeapMerge(a, i, heapsize); + } + + for (i = 0; i < total_elements; i++) { + /* extract smallest element from heap, i.e. the root */ + srt_off[i] = *(a[0].off_list); + srt_len[i] = *(a[0].len_list); + (a[0].nelem)--; + + if (!a[0].nelem) { + HeapCopy(a, a + heapsize - 1); + heapsize--; + } else { + (a[0].off_list)++; + (a[0].len_list)++; + } + + HeapMerge(a, 0, heapsize); + } + + ADIOI_Free(a); +} + +void ad_wrcoll_CheckHole(ADIO_Offset *srt_off, ADIO_Offset off, int *srt_len, int sum, int size, int* hole) +{ + int i; + *hole = 0; + /* See if there are holes before the first request or after the last request */ + if ((srt_off[0] > off) || ((srt_off[sum - 1] + srt_len[sum - 1]) < (off + size))) { + *hole = 1; + } else { /* See if there are holes between the requests, if there are more than one */ + for (i = 0; i < sum - 1; i++) { + if (srt_off[i] + srt_len[i] < srt_off[i + 1]) { + *hole = 1; + break; + } + } + } +} + +static void ADIOI_W_Exchange_data_alltoallv(ADIO_File fd, const void *buf, char *write_buf, /* 1 */ + ADIOI_Flatlist_node *flat_buf, ADIO_Offset *offset_list, ADIO_Offset *len_list, int *send_size, int *recv_size, + ADIO_Offset off, int size, /* 2 */ + int *count, int *start_pos, int *partial_recv, int *sent_to_proc, int nprocs, int myrank, int buftype_is_contig, + int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset *fd_start, ADIO_Offset *fd_end, + ADIOI_Access *others_req, int *send_buf_idx, int *curr_to_proc, /* 3 */ + int *done_to_proc, int *hole, /* 4 */ + int iter, MPI_Aint buftype_extent, OCEANFS_Int *buf_idx, int *error_code) +{ + int i, j, nprocs_recv, err; + int k; + int *tmp_len = NULL; + char **send_buf = NULL; + MPI_Request *send_req = NULL; + MPI_Status status; + int rtail, stail; + char *sbuf_ptr = NULL; + char *to_ptr = NULL; + int len, sum; + int *sdispls = NULL; + int *rdispls = NULL; + char *all_recv_buf = NULL; + char *all_send_buf = NULL; + int *srt_len = NULL; + ADIO_Offset *srt_off = NULL; + static char myname[] = "ADIOI_W_EXCHANGE_DATA"; + + /* exchange recv_size info so that each process knows how much to + send to whom. 
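The alltoallv variant below first learns the per-peer sizes, then turns them into running byte displacements and moves every contribution with one MPI_Alltoallv. A condensed sketch of that displacement construction and exchange (hypothetical helper using malloc instead of ADIOI_Malloc):

    #include <mpi.h>
    #include <stdlib.h>

    /* Exchange variable-sized byte blocks; returns a freshly malloc'd receive buffer laid
     * out rank by rank, matching the rdispls offsets computed here.  Caller frees it. */
    static char *exchange_blocks(MPI_Comm comm, int nprocs, char *send_data,
                                 int *send_size, int *recv_size)
    {
        int *sdispls = (int *)malloc(nprocs * sizeof(int));
        int *rdispls = (int *)malloc(nprocs * sizeof(int));
        int stail = 0, rtail = 0, i;
        char *recv_data;

        for (i = 0; i < nprocs; i++) {
            sdispls[i] = stail; stail += send_size[i];
            rdispls[i] = rtail; rtail += recv_size[i];
        }
        recv_data = (char *)malloc(rtail > 0 ? rtail : 1);
        MPI_Alltoallv(send_data, send_size, sdispls, MPI_BYTE,
                      recv_data, recv_size, rdispls, MPI_BYTE, comm);
        free(sdispls);
        free(rdispls);
        return recv_data;
    }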
*/ + MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm); + + nprocs_recv = CalcCount(recv_size, nprocs); + + /* receiver side data structures */ + rdispls = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + rtail = 0; + for (i = 0; i < nprocs; i++) { + rdispls[i] = rtail; + rtail += recv_size[i]; + } + + /* data buffer */ + all_recv_buf = (char *)ADIOI_Malloc(rtail); + + /* sender side data structures */ + sdispls = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + stail = 0; + for (i = 0; i < nprocs; i++) { + sdispls[i] = stail; + stail += send_size[i]; + } + + /* data buffer */ + all_send_buf = (char *)ADIOI_Malloc(stail); + if (buftype_is_contig) { + for (i = 0; i < nprocs; i++) { + if (send_size[i] == 0) { + continue; + } + sbuf_ptr = all_send_buf + sdispls[i]; + memcpy(sbuf_ptr, buf + buf_idx[i], send_size[i]); + buf_idx[i] += send_size[i]; + } + } else { + send_buf = (char **)ADIOI_Malloc(nprocs * sizeof(char *)); + for (i = 0; i < nprocs; i++) { + send_buf[i] = all_send_buf + sdispls[i]; + } + ADIOI_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list, len_list, send_size, send_req, + sent_to_proc, nprocs, myrank, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, send_buf_idx, + curr_to_proc, done_to_proc, iter, buftype_extent, 0); + ADIOI_Free(send_buf); + } + + /* alltoallv */ + MPI_Alltoallv(all_send_buf, send_size, sdispls, MPI_BYTE, all_recv_buf, recv_size, rdispls, MPI_BYTE, fd->comm); + + FreeAdioiTwo(all_send_buf, sdispls); + + /* data sieving pre-read */ + /* To avoid a read-modify-write, check if there are holes in the + data to be written. For this, merge the (sorted) offset lists + others_req using a heap-merge. */ + + sum = 0; + for (i = 0; i < nprocs; i++) { + sum += count[i]; + } + srt_off = (ADIO_Offset *)ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset)); + srt_len = (int *)ADIOI_Malloc((sum + 1) * sizeof(int)); + + ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, nprocs, nprocs_recv, sum); + + /* check if there are any holes */ + ad_wrcoll_CheckHole(srt_off, off, srt_len, sum, size, hole); + FreeAdioiTwo(srt_off, srt_len); + + if (nprocs_recv && *hole) { + ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, &err); + /* --BEGIN ERROR HANDLING-- */ + if (err != MPI_SUCCESS) { + *error_code = + MPIO_Err_create_code(err, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**ioRMWrdwr", 0); + return; + } + /* --END ERROR HANDLING-- */ + } + + /* scater all_recv_buf into 4M cb_buffer */ + tmp_len = (int *)ADIOI_Malloc(nprocs * sizeof(int)); + for (i = 0; i < nprocs; i++) { + if (recv_size[i] == 0) { + continue; + } + if (partial_recv[i]) { + k = start_pos[i] + count[i] - 1; + tmp_len[i] = others_req[i].lens[k]; + others_req[i].lens[k] = partial_recv[i]; + } + + sbuf_ptr = all_recv_buf + rdispls[i]; + for (j = 0; j < count[i]; j++) { + ADIOI_ENSURE_AINT_FITS_IN_PTR(others_req[i].mem_ptrs[start_pos[i] + j]); + to_ptr = (char *)ADIOI_AINT_CAST_TO_VOID_PTR(others_req[i].mem_ptrs[start_pos[i] + j]); + len = others_req[i].lens[start_pos[i] + j]; + memcpy(to_ptr, sbuf_ptr, len); + sbuf_ptr += len; + } + + /* restore */ + if (partial_recv[i]) { + k = start_pos[i] + count[i] - 1; + others_req[i].lens[k] = tmp_len[i]; + } + } + + FreeAdioiThree(tmp_len, all_recv_buf, rdispls); + return; +} + +void ad_wrcoll_InitVec(int nprocs, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int *sent_to_proc) +{ + int i; + for (i = 0; i < nprocs; i++) { + send_buf_idx[i] = 0; + curr_to_proc[i] = 0; + done_to_proc[i] = 
sent_to_proc[i]; + } +} + +void ad_wrcoll_CopyProcVec(int nprocs, int *send_size, int *sent_to_proc, int *curr_to_proc) +{ + int i; + for (i = 0; i < nprocs; i++) { + if (send_size[i]) { + sent_to_proc[i] = curr_to_proc[i]; + } + } +} + +static void ADIOI_Fill_send_buffer(ADIO_File fd, const void *buf, ADIOI_Flatlist_node *flat_buf, char **send_buf, + ADIO_Offset *offset_list, ADIO_Offset *len_list, int *send_size, MPI_Request *requests, int *sent_to_proc, + int nprocs, int myrank, int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int iter, + MPI_Aint buftype_extent, int bool_send) +{ + /* this function is only called if buftype is not contig */ + int i, p, jj, flat_buf_idx, n_buftypes; + ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size; + ADIO_Offset off, len, rem_len, user_buf_idx; + + /* curr_to_proc[p] = amount of data sent to proc. p that has already + been accounted for so far + done_to_proc[p] = amount of data already sent to proc. p in + previous iterations + user_buf_idx = current location in user buffer + send_buf_idx[p] = current location in send_buf of proc. p */ + ad_wrcoll_InitVec(nprocs, send_buf_idx, curr_to_proc, done_to_proc, sent_to_proc); + + user_buf_idx = flat_buf->indices[0]; + flat_buf_sz = flat_buf->blocklens[0]; + + jj = 0; + flat_buf_idx = 0; + n_buftypes = 0; + + /* flat_buf_idx = current index into flattened buftype + flat_buf_sz = size of current contiguous component in + flattened buf */ + for (i = 0; i < contig_access_count; i++) { + off = offset_list[i]; + rem_len = len_list[i]; + + /* this request may span the file domains of more than one process */ + while (rem_len != 0) { + len = rem_len; + /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no + * longer than the single region that processor "p" is responsible + * for. + */ + p = ADIOI_OCEANFS_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_start, fd_end); + + ADIOI_BUF; + off += len; + rem_len -= len; + } + } + + ad_wrcoll_CopyProcVec(nprocs, send_size, sent_to_proc, curr_to_proc); +} diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/mpi_fs_intf.c b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/mpi_fs_intf.c new file mode 100644 index 00000000000..1054a9a58cd --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/mpi_fs_intf.c @@ -0,0 +1,640 @@ +#include "mpi_fs_intf.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "ad_oceanfs_pub.h" +#include "adio.h" + + +#ifdef DEBUG +#define ASSERT(f) assert(f) +#else +#define ASSERT(f) ((void)0) +#endif + +void* mpi_zalloc(uint32_t n) +{ + if (n <= 0) { + return NULL; + } + + char* p = malloc(n); + if (p == NULL) { + return NULL; + } + memset(p, 0, n); + return p; +} + +#define mpi_free(p) \ + do { \ + free(p); \ + (p) = NULL; \ + } while (0) + +#define atomic_t int +#define atomic_inc(v) __sync_fetch_and_add(v, 1) +#define atomic_dec_and_test(v) (__sync_fetch_and_sub(v, 1) == 1) +#define atomic_set(v, i) ((*(v)) = (i)) + + +#define CHECK_NULL_POINTER(x, ret) \ + do { \ + if ((x) == NULL) \ + { \ + return (ret); \ + } \ + }while (0) + +#define TRUE 1 +#define FALSE 0 + +#ifndef MIN + #define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX + #define MAX(a, b) ((a) < (b) ? 
(b) : (a)) +#endif + + +#define MAX_VIEW_READ_SIZE (4 * 1024 * 1024) +#define MPI_FD_HANDLE_HASHTBL_SIZE 64 + +#define ODCS_IOC_MPI_VIEW_READ _IOWR('S', 101, mpi_fs_view_read_t) +#define ODCS_IOC_MPI_PREFETCH _IOWR('S', 102, mpi_fs_view_read_t) +#define ODCS_IOC_IOCTL_MPI_GROUP_ID _IOWR('S', 103, mpi_fs_group_id_t) +#define ODCS_IOC_IOCTL_MPI_SET_GROUP_ID _IOWR('S', 111, mpi_fs_group_id_t) + +static pthread_mutex_t g_init_mpi_fh_lock = PTHREAD_MUTEX_INITIALIZER; +static int g_init_mpi_fh_pid = 0; + +typedef struct mpi_fs_group_id { + int fd; + uint64_t group_id; +} mpi_fs_group_id_t; + +typedef enum mpi_fh_state { + MPI_FH_STATE_INUSE, + MPI_FH_STATE_DELETE +} mpi_fh_state_t; + +typedef struct mpi_fs_view { + off_t offset; + u32 count; + u32 *blocklens; + off_t *blockoffs; + off_t ub_off; + char data[0]; +} mpi_fs_view_t; + +typedef struct mpi_fs_view_read { + off_t offset; + uint32_t readLen; + uint32_t readRangeLen; + uint32_t readRangeCount; + uint32_t *blocklens; + off_t *blockoffs; + char data[0]; +} mpi_fs_view_read_t; + +typedef struct list_head { + struct list_head *next, *prev; +} list_head_t; + +typedef struct mpi_list_with_lock_s { + list_head_t list; +} mpi_list_with_lock; + +typedef struct mpi_fh_hashtbl { + mpi_list_with_lock ht[MPI_FD_HANDLE_HASHTBL_SIZE]; +} mpi_fh_hashtbl_t; + +static mpi_fh_hashtbl_t g_mpi_fh_hashtbl; + +#define INIT_LIST_HEAD(ptr) \ + do { \ + (ptr)->next = (ptr); \ + (ptr)->prev = (ptr); \ + } while (0) + +#define list_entry(ptr, type, member) \ + ((type *)(void *)((char *)(ptr) - offsetof(type, member))) + +static void list_add_tail(list_head_t *new_head, list_head_t *head) +{ + list_head_t *prev = head->prev; + list_head_t *next = head; + + next->prev = new_head; + new_head->next = next; + new_head->prev = prev; + prev->next = new_head; +} + +static void list_del_init(struct list_head *entry) +{ + list_head_t *prev = entry->prev; + list_head_t *next = entry->next; + + next->prev = prev; + prev->next = next; + + INIT_LIST_HEAD(entry); +} + +typedef struct mpi_fd_handle { + list_head_t node; + int fd; + pthread_mutex_t lock; + atomic_t ref_count; + mpi_fh_state_t state; + mpi_fs_view_t *view; +} mpi_fh_handle_t; + +static void init_mpi_hashtbl(mpi_list_with_lock *ht, int hash_size) +{ + if (ht == NULL) { + return; + } + + int i; + for (i = 0; i < hash_size; i++) { + INIT_LIST_HEAD(&ht[i].list); + } +} + +static int init_mpi_fh_table(void) +{ + pthread_mutex_lock(&g_init_mpi_fh_lock); + if (getpid() == g_init_mpi_fh_pid) { + pthread_mutex_unlock(&g_init_mpi_fh_lock); + return 0; + } + + init_mpi_hashtbl(g_mpi_fh_hashtbl.ht, MPI_FD_HANDLE_HASHTBL_SIZE); + + g_init_mpi_fh_pid = getpid(); + + pthread_mutex_unlock(&g_init_mpi_fh_lock); + + return 0; +} + +static int insert_mpi_fh_table(int fd) +{ + int ret = init_mpi_fh_table(); + if (ret != 0) { + return ret; + } + + if (fd < 0) { + return -1; + } + + mpi_fh_handle_t *fh = (mpi_fh_handle_t *)mpi_zalloc(sizeof(mpi_fh_handle_t)); + if (fh == NULL) { + return -1; + } + + fh->fd = fd; + atomic_set(&fh->ref_count, 1); + fh->state = MPI_FH_STATE_INUSE; + pthread_mutex_init(&fh->lock, NULL); + INIT_LIST_HEAD(&fh->node); + + int bucket = fd % MPI_FD_HANDLE_HASHTBL_SIZE; + pthread_mutex_lock(&g_init_mpi_fh_lock); + list_add_tail(&fh->node, &g_mpi_fh_hashtbl.ht[bucket].list); + pthread_mutex_unlock(&g_init_mpi_fh_lock); + + return 0; +} + +int mpi_fs_open(const char *pathname, int flags, mode_t mode) +{ + mode &= ~S_IFMT; + mode |= S_IFREG; + + int fd = open(pathname, flags, mode); + if (fd >= 0) { + if 
(insert_mpi_fh_table(fd) != 0) { + close(fd); + return -1; + } + } + + return fd; +} + +int mpi_fs_pread(int fd, void *buf, size_t count, off_t offset) +{ + return (int)pread(fd, buf, count, offset); +} + +int mpi_fs_pwrite(int fd, const void *buf, size_t count, off_t offset) +{ + return (int)pwrite(fd, buf, count, offset); +} + +int mpi_fs_stat(const char *pathname, struct stat *buf) +{ + return stat(pathname, buf); +} + +static mpi_fh_handle_t *find_mpi_fh_in_ht(int fd, int is_del) +{ + mpi_fh_handle_t *fh = NULL; + list_head_t *pos = NULL; + list_head_t *next = NULL; + + int bucket = fd % MPI_FD_HANDLE_HASHTBL_SIZE; + pthread_mutex_lock(&g_init_mpi_fh_lock); + + list_head_t* head = &g_mpi_fh_hashtbl.ht[bucket].list; + for (pos = head->next, next = pos->next; pos != head; pos = next, next = pos->next) { + fh = list_entry(pos, mpi_fh_handle_t, node); + if (fd == fh->fd) { + if (is_del) { + list_del_init(&fh->node); + } + atomic_inc(&fh->ref_count); + pthread_mutex_unlock(&g_init_mpi_fh_lock); + return fh; + } + } + pthread_mutex_unlock(&g_init_mpi_fh_lock); + + return NULL; +} + +static void mpi_fh_put(mpi_fh_handle_t *fh) +{ + if (fh == NULL) { + return; + } + + if (!atomic_dec_and_test(&fh->ref_count)) { + return; + } + + pthread_mutex_lock(&fh->lock); + mpi_fh_state_t status = fh->state; + pthread_mutex_unlock(&fh->lock); + + if (status == MPI_FH_STATE_INUSE) { + ASSERT(0); + return; + } + + if (status == MPI_FH_STATE_DELETE) { + mpi_free(fh->view); + mpi_free(fh); + } +} + +static void mpi_delete_fd_handle(int fd) +{ + int ret = init_mpi_fh_table(); + if (ret != 0) { + return; + } + + if (fd < 0) { + return; + } + + mpi_fh_handle_t *fh = find_mpi_fh_in_ht(fd, TRUE); + if (fh == NULL) { + return; + } + + pthread_mutex_lock(&fh->lock); + fh->state = MPI_FH_STATE_DELETE; + pthread_mutex_unlock(&fh->lock); + + mpi_fh_put(fh); + + mpi_fh_put(fh); +} + +int mpi_fs_close(int fd) +{ + int ret = close(fd); + if (ret == 0) { + mpi_delete_fd_handle(fd); + } + + return ret; +} + +int mpi_fs_ftruncate(int fd, off_t length) +{ + return (int)ftruncate(fd, length); +} + +off_t mpi_fs_lseek(int fd, off_t offset, int whence) +{ + return lseek(fd, offset, whence); +} + +static mpi_fs_view_t *AllocMPIFSView(uint32_t count) +{ + if (count == 0) { + return NULL; + } + + mpi_fs_view_t *view = mpi_zalloc(sizeof(mpi_fs_view_t) + count * sizeof(uint32_t) + count * sizeof(off_t)); + if (view == NULL) { + return NULL; + } + + view->count = count; + view->blocklens = (u32 *)view->data; + view->blockoffs = (off_t *)(view->data + count * sizeof(uint32_t)); + + return view; +} + +static mpi_fh_handle_t *mpi_fh_get(int fd) +{ + int ret = init_mpi_fh_table(); + if (ret != 0 || fd < 0) { + return NULL; + } + + return find_mpi_fh_in_ht(fd, FALSE); +} + +int mpi_fs_set_fileview(int fd, off_t offset, u32 count, u32 *blocklens, off_t *blockoffs, off_t ub_off) +{ + mpi_fh_handle_t *fh = mpi_fh_get(fd); + if (fh == NULL) { + errno = EINVAL; + return -1; + } + + mpi_fs_view_t *view = AllocMPIFSView(count); + if (view == NULL) { + mpi_fh_put(fh); + errno = EINVAL; + return -1; + } + + pthread_mutex_lock(&fh->lock); + mpi_free(fh->view); + + fh->view = view; + memcpy(fh->view->blocklens, blocklens, count * sizeof(uint32_t)); + memcpy(fh->view->blockoffs, blockoffs, count * sizeof(off_t)); + + fh->view->offset = offset; + fh->view->count = count; + fh->view->ub_off = ub_off; + pthread_mutex_unlock(&fh->lock); + + mpi_fh_put(fh); + return 0; +} + +static uint32_t GetBlockCntInMPIView(off_t readStart, uint32_t readLen, 
mpi_fs_view_t *mpiView) +{ + const uint32_t filetypeLen = mpiView->ub_off - mpiView->offset; + const off_t displacement = mpiView->offset; + + off_t posInFiletype = (readStart - displacement) % filetypeLen; + uint32_t cntBlockRead = 0; + uint32_t bytesLeft = readLen; + + uint32_t i; + uint32_t bytesToRead; + while (bytesLeft > 0) { + for (i = 0; i < mpiView->count; i++) { + if (bytesLeft == 0) { + break; + } + if (mpiView->blocklens[i] == 0) { + continue; + } + if (posInFiletype <= mpiView->blockoffs[i]) { + bytesToRead = mpiView->blocklens[i]; + bytesToRead = (bytesLeft > bytesToRead) ? bytesToRead : bytesLeft; + bytesLeft -= bytesToRead; + + posInFiletype += bytesToRead; + cntBlockRead++; + } else if (posInFiletype < (mpiView->blockoffs[i] + mpiView->blocklens[i])) { + bytesToRead = mpiView->blockoffs[i] + mpiView->blocklens[i] - posInFiletype; + bytesToRead = (bytesLeft > bytesToRead) ? bytesToRead : bytesLeft; + bytesLeft -= bytesToRead; + + posInFiletype += bytesToRead; + cntBlockRead++; + } else { + continue; + } + } + + posInFiletype = 0; + } + + return cntBlockRead; +} + +static mpi_fs_view_read_t *AllocMPIViewRead(off_t offset, uint32_t readLen, uint32_t readRangeCount) +{ + mpi_fs_view_read_t *viewRead = NULL; + uint32_t readRangeLen = readRangeCount * (sizeof(uint32_t) + sizeof(off_t)); + uint32_t viewReadSize = sizeof(mpi_fs_view_read_t) + readLen + readRangeLen; + + if (readRangeCount == 0) { + return NULL; + } + + if (readLen > MAX_VIEW_READ_SIZE || viewReadSize > MAX_VIEW_READ_SIZE) { + return NULL; + } + + viewRead = mpi_zalloc(viewReadSize); + if (viewRead == NULL) { + return NULL; + } + + viewRead->offset = offset; + viewRead->readLen = readLen; + viewRead->readRangeLen = readRangeLen; + viewRead->readRangeCount = readRangeCount; + viewRead->blocklens = (uint32_t *)(viewRead->data + readLen); + viewRead->blockoffs = (off_t *)(viewRead->data + readLen + (readRangeCount * sizeof(uint32_t))); + + return viewRead; +} + +void MakeMPIViewReadRange(mpi_fs_view_read_t *viewRead, mpi_fs_view_t *mpiView) +{ + uint32_t i; + off_t readStart = viewRead->offset; + uint32_t count = viewRead->readRangeCount; + + const uint32_t filetypeLen = mpiView->ub_off - mpiView->offset; + const off_t displacement = mpiView->offset; + + off_t posInFiletype = (readStart - displacement) % filetypeLen; + uint32_t posFiletype = (readStart - displacement) / filetypeLen; + uint32_t cntBlockRead = 0; + uint32_t bytesLeft = viewRead->readLen; + + uint32_t bytesToRead; + while (cntBlockRead < count && bytesLeft > 0) { + for (i = 0; i < mpiView->count; i++) { + if (cntBlockRead >= count || bytesLeft == 0) { + break; + } + if (mpiView->blocklens[i] == 0) { + continue; + } + if (posInFiletype <= mpiView->blockoffs[i]) { + bytesToRead = mpiView->blocklens[i]; + bytesToRead = (bytesLeft > bytesToRead) ? bytesToRead : bytesLeft; + bytesLeft -= bytesToRead; + + viewRead->blocklens[cntBlockRead] = bytesToRead; + viewRead->blockoffs[cntBlockRead] = displacement + posFiletype * filetypeLen + mpiView->blockoffs[i]; + + posInFiletype += bytesToRead; + cntBlockRead++; + } else if (posInFiletype < (mpiView->blockoffs[i] + mpiView->blocklens[i])) { + bytesToRead = mpiView->blockoffs[i] + mpiView->blocklens[i] - posInFiletype; + bytesToRead = (bytesLeft > bytesToRead) ? 
bytesToRead : bytesLeft; + bytesLeft -= bytesToRead; + + viewRead->blocklens[cntBlockRead] = bytesToRead; + viewRead->blockoffs[cntBlockRead] = displacement + posFiletype * filetypeLen + posInFiletype; + + posInFiletype += bytesToRead; + cntBlockRead++; + } else { + continue; + } + } + + posFiletype++; + posInFiletype = 0; + } +} + +static u32 CopyBuffToIov(const void *buff, u32 bufLen, struct iovec *iov, u32 iovLen) +{ + uint32_t pos = 0; + uint32_t i; + int ret = 0; + + CHECK_NULL_POINTER(buff, EINVAL); + CHECK_NULL_POINTER(iov, EINVAL); + + uint32_t left_len = (uint32_t)bufLen; + uint32_t copy_len = 0; + for (i = 0; i < iovLen; i++) { + copy_len = MIN(left_len, iov[i].iov_len); + left_len -= copy_len; + memcpy(iov[i].iov_base, (((char *)buff) + pos), copy_len); + iov[i].iov_len = copy_len; + pos += copy_len; + if (left_len == 0) { + break; + } + } + + return left_len; +} + +int mpi_fs_view_read(int fd, u32 iovcnt, struct iovec *iov, off_t offset) +{ + int ret = -1; + u32 leftLen = 0; + + if (iov == NULL || iovcnt == 0) { + errno = EINVAL; + return -1; + } + + mpi_fh_handle_t *fh = mpi_fh_get(fd); + if (fh == NULL) { + errno = EINVAL; + return -1; + } + + uint32_t readLen = 0; + uint32_t i; + for (i = 0; i < iovcnt; i++) { + readLen += iov[i].iov_len; + } + + pthread_mutex_lock(&fh->lock); + + uint32_t count; + count = GetBlockCntInMPIView(offset, readLen, fh->view); + if (count == 0) { + pthread_mutex_unlock(&fh->lock); + mpi_fh_put(fh); + return -1; + } + + mpi_fs_view_read_t *viewRead = AllocMPIViewRead(offset, readLen, count); + if (viewRead == NULL) { + pthread_mutex_unlock(&fh->lock); + mpi_fh_put(fh); + return -1; + } + + MakeMPIViewReadRange(viewRead, fh->view); + + pthread_mutex_unlock(&fh->lock); + + ret = ioctl(fd, ODCS_IOC_MPI_VIEW_READ, viewRead); + if (ret == 0) { + leftLen = CopyBuffToIov(viewRead->data, viewRead->readLen, iov, iovcnt); + if (leftLen != 0) { + ret = -1; + } + + ret = viewRead->readLen; + } + + mpi_free(viewRead); + + mpi_fh_put(fh); + return ret; +} + +int mpi_fs_get_group_id(int fd, uint64_t *group_id) +{ + if (group_id == NULL) { + errno = EINVAL; + return -1; + } + + mpi_fs_group_id_t group; + group.fd = fd; + int ret = ioctl(fd, ODCS_IOC_IOCTL_MPI_GROUP_ID, &group); + if (ret) { + *group_id = 0; + return ret; + } + + *group_id = group.group_id; + return ret; +} + +int mpi_fs_set_group_id(int fd, uint64_t group_id) +{ + mpi_fs_group_id_t group; + group.fd = fd; + group.group_id = group_id; + return ioctl(fd, ODCS_IOC_IOCTL_MPI_SET_GROUP_ID, &group); +} + diff --git a/ompi/mca/io/romio321/romio/adio/ad_oceanfs/mpi_fs_intf.h b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/mpi_fs_intf.h new file mode 100644 index 00000000000..8973994d0d5 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/ad_oceanfs/mpi_fs_intf.h @@ -0,0 +1,83 @@ +#ifndef _MPI_FS_INTERFACE_H_ +#define _MPI_FS_INTERFACE_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef unsigned int u32; + +/* +#define EPERM 1 Operation not permitted +#define ENOEN 2 No such file or directory +#define ESRCH 3 No such process +#define EINTR 4 Interrupted system call +#define EIO 5 I/O error +#define ENXIO 6 No such device or address +#define E2BIG 7 Argument list too long +#define ENOEX 8 Exec format error +#define EBADF 9 Bad file number +#define ECHIL 10 No child processes +#define EAGAI 11 Try again +#define ENOME 12 Out of memory +#define EACCE 13 Permission denied +#define EFAUL 14 Bad address +#define ENOTB 15 
Block device required +#define EBUSY 16 Device or resource busy +#define EEXIS 17 File exists +#define EXDEV 18 Cross-device link +#define ENODE 19 No such device +#define ENOTD 20 Not a directory +#define EISDI 21 Is a directory +#define EINVA 22 Invalid argument +#define ENFIL 23 File table overflow +#define EMFIL 24 Too many open files +#define ENOTT 25 Not a typewriter +#define ETXTB 26 Text file busy +#define EFBIG 27 File too large +#define ENOSP 28 No space left on device +#define ESPIP 29 Illegal seek +#define EROFS 30 Read-only file system +#define EMLIN 31 Too many links +#define EPIPE 32 Broken pipe +#define EDOM 33 Math argument out of domain of func +#define ERANG 34 Math result not representable +*/ + +#define FS_MAX_XATTR_NAME_LEN 256UL +#define FS_MAX_NAME_LEN 1024UL +#define FS_MAX_PATH_LENGTH 4096UL +#define FS_MAX_LONG_NAME_LEN 1024UL + +int mpi_fs_open(const char *pathname, int flags, mode_t mode); + +int mpi_fs_pread(int fd, void *buf, size_t count, off_t offset); + +int mpi_fs_pwrite(int fd, const void *buf, size_t count, off_t offset); + +int mpi_fs_stat(const char *pathname, struct stat *buf); + +int mpi_fs_close(int fd); + +int mpi_fs_ftruncate(int fd, off_t length); + +off_t mpi_fs_lseek(int fd, off_t offset, int whence); + +int mpi_fs_set_fileview(int fd, off_t offset, u32 count, u32 *blocklens, off_t *blockoffs, off_t ub_off); + +int mpi_fs_view_read(int fd, u32 iovcnt, struct iovec *iov, off_t offset); + +int mpi_fs_get_group_id(int fd, uint64_t *group_id); + +int mpi_fs_set_group_id(int fd, uint64_t group_id); + +#endif diff --git a/ompi/mca/io/romio321/romio/adio/common/Makefile.mk b/ompi/mca/io/romio321/romio/adio/common/Makefile.mk index 80194efe483..8343cc9e043 100644 --- a/ompi/mca/io/romio321/romio/adio/common/Makefile.mk +++ b/ompi/mca/io/romio321/romio/adio/common/Makefile.mk @@ -72,5 +72,6 @@ romio_other_sources += \ adio/common/ad_threaded_io.c \ adio/common/p2p_aggregation.c \ adio/common/onesided_aggregation.c \ - adio/common/utils.c + adio/common/utils.c \ + adio/common/ad_env.c diff --git a/ompi/mca/io/romio321/romio/adio/common/ad_env.c b/ompi/mca/io/romio321/romio/adio/common/ad_env.c new file mode 100644 index 00000000000..60fe2b2d743 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/common/ad_env.c @@ -0,0 +1,202 @@ +#include "ad_env.h" +#include "mpi.h" + +/* page mpiio_vars MPIIO Configuration section env_sec Environment Variables + * - OCEANFSMPIO_COMM - Define how data is exchanged on collective + * reads and writes. Possible values: + * - 0 - Use MPI_Alltoallv. + * - 1 - Use MPI_Isend/MPI_Irecv. + * - Default is 1. + * + * - OCEANFSMPIO_TIMING - collect timing breakdown for MPI I/O collective calls. + * Possible values: + * - 0 - Do not collect/report timing. + * - 1 - Collect/report timing. + * - Default is 0. + * + * - OCEANFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated + * for aggregator collective i/o. Possible values: + * - 0 - Use two MPI_Allgather's to collect starting and ending offsets. + * - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets. + * - Default is 0(as GEN). + * + * - OCEANFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are + * calculated (block size). Possible values: + * - 0 - Evenly calculate file domains across aggregators. Also use + * MPI_Isend/MPI_Irecv to exchange domain information. + * - 1 - Align file domains with the underlying file system's block size. Also use + * MPI_Alltoallv to exchange domain information. + * - Default is 0(as GEN). 
+ *
+ * - OCEANFSMPIO_PTHREADIO - Enables a very simple form of asynchronous I/O where a
+ * pthread is spawned to do the POSIX writes while the main thread does the
+ * data aggregation - useful for large files where multiple rounds are
+ * required (more than the cb_buffer_size of data per aggregator). Users
+ * must ensure there are hardware resources available for the thread to run. I
+ * am sure there is a better way to do this involving comm threads - this is
+ * just a start. NOTE: For some reason the stats collected when this is
+ * enabled miss some of the data so the data sizes are off a bit - this is
+ * a statistical issue only, the data is still accurately written out.
+ *
+ * - OCEANFSMPIO_P2PCONTIG - Does simple point-to-point communication between the
+ * aggregator and the procs that feed it. Performance could be enhanced by a
+ * one-sided put algorithm. Current implementation allows only 1 round of
+ * data. Useful/allowed only when:
+ * 1.) The datatype is contiguous.
+ * 2.) The offsets are increasing in rank-order.
+ * 3.) There are no gaps between the offsets.
+ * 4.) No single rank has a data size which spans multiple file domains.
+ *
+ * - OCEANFSMPIO_WRITE_AGGMETHOD/OCEANFSMPIO_READ_AGGMETHOD - Replaces the two-phase
+ * collective IO aggregation
+ * with a one-sided algorithm, significantly reducing communication and
+ * memory overhead. Fully
+ * supports all datasets and datatypes; the only caveat is that any holes in the data
+ * when writing to a pre-existing file are ignored -- there is no read-modify-write
+ * support to maintain the correctness of regions of pre-existing data, so every byte
+ * must be explicitly written to maintain correctness. Users must beware of middleware
+ * libraries like PNETCDF which may count on read-modify-write functionality for certain
+ * features (like fill values). Possible values:
+ * - 0 - Normal two-phase collective IO is used.
+ * - 1 - A separate one-sided MPI_Put or MPI_Get is used for each contiguous chunk of data
+ * for a compute process to write to or read from the collective buffer on the aggregator.
+ * - 2 - An MPI derived datatype is created using all the contiguous chunks and just one
+ * call to MPI_Put or MPI_Get is done with the derived datatype. On Blue Gene/Q
+ * optimal performance for this is achieved when paired with PAMID_TYPED_ONESIDED=1.
+ * - Default is 0(as GEN).
+ *
+ * - OCEANFSMPIO_ONESIDED_NO_RMW - For one-sided aggregation (OCEANFSMPIO_WRITE_AGGMETHOD = 1 or 2)
+ * disable the detection of holes in the data when writing to a pre-existing
+ * file requiring a read-modify-write, thereby avoiding the communication
+ * overhead for this detection.
+ * - 0 (hole detection enabled) or 1 (hole detection disabled)
+ * - Default is 0
+ *
+ * - OCEANFSMPIO_ONESIDED_INFORM_RMW - For one-sided aggregation
+ * (OCEANFSMPIO_AGGMETHOD = 1 or 2) generate an informational message informing
+ * the user whether holes exist in the data when writing to a pre-existing
+ * file requiring a read-modify-write, thereby educating the user to set
+ * OCEANFSMPIO_ONESIDED_NO_RMW=1 on a future run to avoid the communication
+ * overhead for this detection.
+ * - 0 (disabled) or 1 (enabled)
+ * - Default is 0
+ *
+ * - OCEANFSMPIO_DEVNULLIO - do everything *except* write to / read from the file
+ * system. When experimenting with different two-phase I/O strategies, it's
+ * helpful to remove the highly variable file system from the experiment.
+ * - 0 (disabled) or 1 (enabled) + * - Default is 0 + * + */ +static int g_init = 0; +static int g_oceanfsmpio_comm = 1; +static int g_oceanfsmpio_timing = 0; +static int g_oceanfsmpio_tunegather = 0; +static int g_oceanfsmpio_tuneblocking = 0; +static int g_oceanfsmpio_pthreadio = 0; +static int g_oceanfsmpio_p2pcontig = 0; +static int g_oceanfsmpio_write_aggmethod = 0; +static int g_oceanfsmpio_read_aggmethod = 0; +static int g_oceanfsmpio_onesided_no_rmw = 0; +static int g_oceanfsmpio_onesided_always_rmw = 0; +static int g_oceanfsmpio_onesided_inform_rmw = 0; +static int g_group_lock_enable = 0; +static int g_log_level = 1; + +static int safe_atoi(char *str, int def) +{ + return (NULL == str) ? def : atoi(str); +} + +void ad_oceanfs_get_env_vars() +{ + if (g_init > 0) { + return; + } + + g_init = 1; + + g_oceanfsmpio_comm = safe_atoi(getenv("OCEANFSMPIO_COMM"), 1); + g_oceanfsmpio_timing = safe_atoi(getenv("OCEANFSMPIO_TIMING"), 0); + g_oceanfsmpio_tunegather = safe_atoi(getenv("OCEANFSMPIO_TUNEGATHER"), 0); + g_oceanfsmpio_tuneblocking = safe_atoi(getenv("OCEANFSMPIO_TUNEBLOCKING"), 0); + g_oceanfsmpio_pthreadio = safe_atoi(getenv("OCEANFSMPIO_PTHREADIO"), 0); + g_oceanfsmpio_p2pcontig = safe_atoi(getenv("OCEANFSMPIO_P2PCONTIG"), 0); + g_oceanfsmpio_write_aggmethod = safe_atoi(getenv("OCEANFSMPIO_WRITE_AGGMETHOD"), 0); + g_oceanfsmpio_read_aggmethod = safe_atoi(getenv("OCEANFSMPIO_READ_AGGMETHOD"), 0); + g_oceanfsmpio_onesided_no_rmw = safe_atoi(getenv("OCEANFSMPIO_ONESIDED_NO_RMW"), 0); + g_oceanfsmpio_onesided_always_rmw = safe_atoi(getenv("OCEANFSMPIO_ONESIDED_ALWAYS_RMW"), 0); + g_oceanfsmpio_onesided_inform_rmw = safe_atoi(getenv("OCEANFSMPIO_ONESIDED_INFORM_RMW"), 0); + + if (g_oceanfsmpio_onesided_always_rmw) { + g_oceanfsmpio_onesided_no_rmw = 1; + } + + g_group_lock_enable = safe_atoi(getenv("OCEANFSMPIO_GROUP_LOCK"), 1); + g_log_level = safe_atoi(getenv("OCEANFSMPIO_LOG_LEVEL"), 1); +} + +int get_oceanfsmpio_timing() +{ + return g_oceanfsmpio_timing; +} +void set_oceanfsmpio_timing(int val) +{ + g_oceanfsmpio_timing = val; +} +int get_oceanfsmpio_comm() +{ + return g_oceanfsmpio_comm; +} +int get_oceanfsmpio_tunegather() +{ + return g_oceanfsmpio_tunegather; +} +int get_oceanfsmpio_tuneblocking() +{ + return g_oceanfsmpio_tuneblocking; +} +int get_oceanfsmpio_pthreadio() +{ + return g_oceanfsmpio_pthreadio; +} +int get_oceanfsmpio_p2pcontig() +{ + return g_oceanfsmpio_p2pcontig; +} +int get_oceanfsmpio_write_aggmethod() +{ + return g_oceanfsmpio_write_aggmethod; +} +int get_oceanfsmpio_read_aggmethod() +{ + return g_oceanfsmpio_read_aggmethod; +} +int get_oceanfsmpio_onesided_no_rmw() +{ + return g_oceanfsmpio_onesided_no_rmw; +} +void set_oceanfsmpio_onesided_no_rmw(int val) +{ + g_oceanfsmpio_onesided_no_rmw = val; +} +int get_oceanfsmpio_onesided_always_rmw() +{ + return g_oceanfsmpio_onesided_always_rmw; +} +void set_oceanfsmpio_onesided_always_rmw(int val) +{ + g_oceanfsmpio_onesided_always_rmw = val; +} +int get_oceanfsmpio_onesided_inform_rmw() +{ + return g_oceanfsmpio_onesided_inform_rmw; +} +int get_group_lock_enable() +{ + return g_group_lock_enable; +} +int get_log_level() +{ + return g_log_level; +} diff --git a/ompi/mca/io/romio321/romio/adio/common/ad_fstype.c b/ompi/mca/io/romio321/romio/adio/common/ad_fstype.c index 75fb24fee66..2891ac505dc 100644 --- a/ompi/mca/io/romio321/romio/adio/common/ad_fstype.c +++ b/ompi/mca/io/romio321/romio/adio/common/ad_fstype.c @@ -82,6 +82,10 @@ # define GPFS_SUPER_MAGIC 0x47504653 #endif +#if defined(ROMIO_OCEANFS) && 
!defined(OCEANFS_MAGIC) +# define OCEANFS_MAGIC 0xFFEA36969 +#endif + #ifdef ROMIO_HAVE_STRUCT_STATVFS_WITH_F_BASETYPE # ifdef HAVE_SYS_STATVFS_H # include @@ -413,6 +417,13 @@ static void ADIO_FileSysType_fncall(const char *filename, int *fstype, int *erro } # endif +# ifdef ROMIO_OCEANFS + if (fsbuf.f_type == OCEANFS_MAGIC) { + *fstype = ADIO_OCEANFS; + return; + } +# endif + # ifdef ROMIO_UFS /* if UFS support is enabled, default to that */ *fstype = ADIO_UFS; @@ -553,6 +564,10 @@ static void ADIO_FileSysType_prefix(const char *filename, int *fstype, int *erro !strncmp(filename, "ZOIDFS:", 7)) { *fstype = ADIO_ZOIDFS; } + else if (!strncmp(filename, "oceanfs:", 8)|| + !strncmp(filename, "OCEANFS:", 8)) { + *fstype = ADIO_OCEANFS; + } else if (!strncmp(filename, "testfs:", 7) || !strncmp(filename, "TESTFS:", 7)) { @@ -853,6 +868,14 @@ void ADIO_ResolveFileType(MPI_Comm comm, const char *filename, int *fstype, *ops = &ADIO_LUSTRE_operations; #endif } + if (file_system == ADIO_OCEANFS) { +#ifndef ROMIO_OCEANFS + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**iofstypeunsupported", 0); + return; +#else + *ops = &ADIO_OCEANFS_operations; +#endif + } if (file_system == ADIO_ZOIDFS) { #ifndef ROMIO_ZOIDFS *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, diff --git a/ompi/mca/io/romio321/romio/adio/common/ad_set_view.c b/ompi/mca/io/romio321/romio/adio/common/ad_set_view.c index 18becf269a9..39718f5fcd1 100644 --- a/ompi/mca/io/romio321/romio/adio/common/ad_set_view.c +++ b/ompi/mca/io/romio321/romio/adio/common/ad_set_view.c @@ -59,7 +59,9 @@ void ADIO_Set_view(ADIO_File fd, ADIO_Offset disp, MPI_Datatype etype, /* reset MPI-IO file pointer to point to the first byte that can be accessed in this view. 
*/ - +#ifdef ROMIO_OCEANFS + ADIOI_FILESYSTEM_VIEW(fd, error_code); +#endif ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if (filetype_is_contig) fd->fp_ind = disp; else { diff --git a/ompi/mca/io/romio321/romio/adio/common/onesided_aggregation.c b/ompi/mca/io/romio321/romio/adio/common/onesided_aggregation.c index 71a5b7bee6e..5cbc1bef2f9 100644 --- a/ompi/mca/io/romio321/romio/adio/common/onesided_aggregation.c +++ b/ompi/mca/io/romio321/romio/adio/common/onesided_aggregation.c @@ -15,6 +15,10 @@ int gpfsmpio_write_aggmethod = 0; int gpfsmpio_read_aggmethod = 0; int gpfsmpio_onesided_always_rmw = 0; #endif +#ifdef ROMIO_OCEANFS +/* extend this to oceanfs file systems */ +#include "../ad_oceanfs/ad_oceanfs_tuning.h" +#endif #include @@ -198,6 +202,13 @@ void ADIOI_OneSidedWriteAggregation(ADIO_File fd, { int i,j; /* generic iterators */ +#ifdef ROMIO_OCEANFS + /* replace gpfsmpioxxx with oceanfsmpioxxx */ + gpfsmpio_onesided_no_rmw = get_oceanfsmpio_onesided_no_rmw(); + gpfsmpio_write_aggmethod = get_oceanfsmpio_write_aggmethod(); + gpfsmpio_read_aggmethod = get_oceanfsmpio_read_aggmethod(); + gpfsmpio_onesided_always_rmw = get_oceanfsmpio_onesided_always_rmw(); +#endif #ifdef onesidedtrace if (buf == NULL) { printf("ADIOI_OneSidedWriteAggregation - buf is NULL contig_access_count is %d\n",contig_access_count); @@ -328,7 +339,13 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of coll_bufsize = (ADIO_Offset)(fd->hints->cb_buffer_size/2); } #endif +#ifdef ROMIO_OCEANFS + if(get_oceanfsmpio_pthreadio() == 1) { + /* split buffer in half for a kind of double buffering with the threads*/ + coll_bufsize = (ADIO_Offset)(fd->hints->cb_buffer_size / 2); + } +#endif /* This logic defines values that are used later to determine what offsets define the portion * of the file domain the agg is writing this round. */ @@ -765,7 +782,13 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of io_thread = pthread_self(); } #endif +#ifdef ROMIO_OCEANFS + if(get_oceanfsmpio_pthreadio() && (numberOfRounds > 1)) { + useIOBuffer = 1; + io_thread = pthread_self(); + } +#endif /* use the write buffer allocated in the file_open */ char *write_buf0 = fd->io_buf; char *write_buf1 = fd->io_buf + coll_bufsize; @@ -1233,6 +1256,13 @@ void ADIOI_OneSidedReadAggregation(ADIO_File fd, { int i,j; /* generic iterators */ +#ifdef ROMIO_OCEANFS + /* replace gpfsmpioxxx with oceanfsmpioxxx */ + gpfsmpio_onesided_no_rmw = get_oceanfsmpio_onesided_no_rmw(); + gpfsmpio_write_aggmethod = get_oceanfsmpio_write_aggmethod(); + gpfsmpio_read_aggmethod = get_oceanfsmpio_read_aggmethod(); + gpfsmpio_onesided_always_rmw = get_oceanfsmpio_onesided_always_rmw(); +#endif #ifdef onesidedtrace if (buf == NULL) { printf("ADIOI_OneSidedWriteAggregation - buf is NULL contig_access_count is %d\n",contig_access_count); @@ -1364,7 +1394,13 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of coll_bufsize = fd->hints->cb_buffer_size/2; } #endif +#ifdef ROMIO_OCEANFS + if(get_oceanfsmpio_pthreadio() == 1) { + /* split buffer in half for a kind of double buffering with the threads*/ + coll_bufsize = fd->hints->cb_buffer_size / 2; + } +#endif /* This logic defines values that are used later to determine what offsets define the portion * of the file domain the agg is reading this round. 
*/ @@ -1784,7 +1820,13 @@ printf("end_offsets[%d] is %ld st_offsets[%d] is %ld\n",j,end_offsets[j],j,st_of io_thread = pthread_self(); } #endif +#ifdef ROMIO_OCEANFS + if(get_oceanfsmpio_pthreadio() && (numberOfRounds > 1)) { + useIOBuffer = 1; + io_thread = pthread_self(); + } +#endif MPI_Win read_buf_window = fd->io_buf_window; ADIO_Offset currentRoundFDStart = 0, nextRoundFDStart = 0; diff --git a/ompi/mca/io/romio321/romio/adio/common/p2p_aggregation.c b/ompi/mca/io/romio321/romio/adio/common/p2p_aggregation.c index ac77a8f876e..ac9afae116d 100644 --- a/ompi/mca/io/romio321/romio/adio/common/p2p_aggregation.c +++ b/ompi/mca/io/romio321/romio/adio/common/p2p_aggregation.c @@ -7,6 +7,7 @@ #include "adio.h" #include "adio_extern.h" #include "../ad_gpfs/ad_gpfs_tuning.h" +#include "../ad_oceanfs/ad_oceanfs_tuning.h" #include @@ -53,7 +54,13 @@ void ADIOI_P2PContigWriteAggregation(ADIO_File fd, coll_bufsize = fd->hints->cb_buffer_size/2; } #endif +#ifdef ROMIO_OCEANFS + if(get_oceanfsmpio_pthreadio() == 1) { + /* split buffer in half for a kind of double buffering with the threads*/ + coll_bufsize = fd->hints->cb_buffer_size / 2; + } +#endif int j; for (j=0;jhints->ranklist[j] == myrank) { @@ -160,6 +167,12 @@ void ADIOI_P2PContigWriteAggregation(ADIO_File fd, io_thread = pthread_self(); } #endif +#ifdef ROMIO_OCEANFS + if(get_oceanfsmpio_pthreadio() && (numberOfRounds > 1)) { + useIOBuffer = 1; + io_thread = pthread_self(); + } +#endif ADIO_Offset currentRoundFDStart = 0; ADIO_Offset currentRoundFDEnd = 0; @@ -523,7 +536,11 @@ void ADIOI_P2PContigReadAggregation(ADIO_File fd, /* share buffer between working threads */ coll_bufsize = coll_bufsize/2; #endif - +#ifdef ROMIO_OCEANFS + if(get_oceanfsmpio_pthreadio() == 1) + /* share buffer between working threads */ + coll_bufsize = coll_bufsize / 2; +#endif int j; for (j=0;jhints->ranklist[j] == myrank) { @@ -641,7 +658,12 @@ void ADIOI_P2PContigReadAggregation(ADIO_File fd, io_thread = pthread_self(); } #endif - +#ifdef ROMIO_OCEANFS + if(get_oceanfsmpio_pthreadio() && (numberOfRounds > 1)) { + useIOBuffer = 1; + io_thread = pthread_self(); + } +#endif #ifdef ROMIO_GPFS endTimeBase = MPI_Wtime(); gpfsmpio_prof_cw[GPFSMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase); diff --git a/ompi/mca/io/romio321/romio/adio/include/ad_env.h b/ompi/mca/io/romio321/romio/adio/include/ad_env.h new file mode 100644 index 00000000000..129f5a94977 --- /dev/null +++ b/ompi/mca/io/romio321/romio/adio/include/ad_env.h @@ -0,0 +1,27 @@ +#ifndef AD_ENV_H_ +#define AD_ENV_H_ + +#include "adio.h" + +void ad_oceanfs_get_env_vars(); + +/* corresponds to environment variables to select optimizations and timing level */ +int get_oceanfsmpio_timing(); +void set_oceanfsmpio_timing(int val); +int get_oceanfsmpio_timing_cw_level(); +int get_oceanfsmpio_comm(); +int get_oceanfsmpio_tunegather(); +int get_oceanfsmpio_tuneblocking(); +int get_oceanfsmpio_pthreadio(); +int get_oceanfsmpio_p2pcontig(); +int get_oceanfsmpio_write_aggmethod(); +int get_oceanfsmpio_read_aggmethod(); +int get_oceanfsmpio_onesided_no_rmw(); +void set_oceanfsmpio_onesided_no_rmw(int val); +int get_oceanfsmpio_onesided_always_rmw(); +void set_oceanfsmpio_onesided_always_rmw(int val); +int get_oceanfsmpio_onesided_inform_rmw(); +int get_group_lock_enable(); +int get_log_level(); + +#endif /* AD_ENV_H_ */ diff --git a/ompi/mca/io/romio321/romio/adio/include/adio.h b/ompi/mca/io/romio321/romio/adio/include/adio.h index b320fe2788d..ac8da27173a 100644 --- a/ompi/mca/io/romio321/romio/adio/include/adio.h +++ 
b/ompi/mca/io/romio321/romio/adio/include/adio.h @@ -21,6 +21,12 @@ #ifndef ADIO_INCLUDE #define ADIO_INCLUDE +#include +#include +#include "ad_env.h" + + + #ifdef SPPUX #define _POSIX_SOURCE #endif @@ -305,6 +311,7 @@ typedef struct { #define ADIO_ZOIDFS 167 /* ZoidFS: the I/O forwarding fs */ /* #define ADIO_BG 168 */ #define ADIO_GPFS 168 +#define ADIO_OCEANFS 169 #define ADIO_SEEK_SET SEEK_SET #define ADIO_SEEK_CUR SEEK_CUR diff --git a/ompi/mca/io/romio321/romio/adio/include/adioi.h b/ompi/mca/io/romio321/romio/adio/include/adioi.h index 5656f10bae7..3949f75a90f 100644 --- a/ompi/mca/io/romio321/romio/adio/include/adioi.h +++ b/ompi/mca/io/romio321/romio/adio/include/adioi.h @@ -81,6 +81,9 @@ struct ADIOI_Hints_struct { index in bridgelist */ int numbridges; /* total number of bridges */ } bg; + struct { + int view_io; + } oceanfs; } fs_hints; }; @@ -868,8 +871,9 @@ int MPIOI_File_iread_all(MPI_File fh, char *myname, MPI_Request *request); - - +#ifdef ROMIO_OCEANFS +#define ADIOI_FILESYSTEM_VIEW(fd,error_code) ADIOI_OCEANFS_set_view(fd,error_code) +#endif /* Unix-style file locking */ #if (defined(ROMIO_HFS) || defined(ROMIO_XFS)) @@ -901,6 +905,16 @@ int MPIOI_File_iread_all(MPI_File fh, # define ADIOI_UNLOCK(fd, offset, whence, len) \ ADIOI_Set_lock((fd)->fd_sys, ADIOI_UNLOCK_CMD, LOCKFILE_FAIL_IMMEDIATELY, offset, whence, len) +#elif (defined(ROMIO_OCEANFS)) +#include "../ad_oceanfs/ad_oceanfs.h" + +# define ADIOI_WRITE_LOCK(fd, offset, whence, len) \ + ADIOI_OCEANFS_Set_lock((fd)->fd_sys, F_SETLKW, F_WRLCK, offset, whence, len) +# define ADIOI_READ_LOCK(fd, offset, whence, len) \ + ADIOI_OCEANFS_Set_lock((fd)->fd_sys, F_SETLKW, F_RDLCK, offset, whence, len) +# define ADIOI_UNLOCK(fd, offset, whence, len) \ + ADIOI_OCEANFS_Set_lock((fd)->fd_sys, F_SETLK, F_UNLCK, offset, whence, len) + #else #ifdef ADIOI_MPE_LOGGING diff --git a/ompi/mca/io/romio321/romio/adio/include/adioi_errmsg.h b/ompi/mca/io/romio321/romio/adio/include/adioi_errmsg.h index b1cdb4577a2..272807cae1d 100644 --- a/ompi/mca/io/romio321/romio/adio/include/adioi_errmsg.h +++ b/ompi/mca/io/romio321/romio/adio/include/adioi_errmsg.h @@ -60,6 +60,7 @@ MPI_ERR_IO MPIR_PREALLOC_PERM "To preallocate disk space, ROMIO needs to read the file and write it back, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR." 
MPIR_ERR_FILETYPE "Filetype must be constructed out of one or more etypes" MPIR_ERR_NO_TESTFS "ROMIO has not been configured to use the TESTFS file system" + MPIR_ERR_NO_OCEANFS "ROMIO has not been configured to use the OCEANFS burst buffer" MPIR_ERR_DEFERRED "independent IO attempted even though no_indep_rw hint given" MPIR_ERR_NO_BGL "ROMIO has not been configured to use the BGL file system" diff --git a/ompi/mca/io/romio321/romio/adio/include/adioi_fs_proto.h b/ompi/mca/io/romio321/romio/adio/include/adioi_fs_proto.h index e3af9170821..0aa4c57d554 100644 --- a/ompi/mca/io/romio321/romio/adio/include/adioi_fs_proto.h +++ b/ompi/mca/io/romio321/romio/adio/include/adioi_fs_proto.h @@ -89,4 +89,9 @@ extern struct ADIOI_Fns_struct ADIO_GRIDFTP_operations; extern struct ADIOI_Fns_struct ADIO_ZOIDFS_operations; #endif +#ifdef ROMIO_OCEANFS +/* prototypes are in adio/ad_oceanfs/ad_oceanfs.h */ +extern struct ADIOI_Fns_struct ADIO_OCEANFS_operations; +#endif + #endif diff --git a/ompi/mca/io/romio321/romio/adio/include/mpio_error.h b/ompi/mca/io/romio321/romio/adio/include/mpio_error.h index f9d22b17219..5d7c7c86189 100644 --- a/ompi/mca/io/romio321/romio/adio/include/mpio_error.h +++ b/ompi/mca/io/romio321/romio/adio/include/mpio_error.h @@ -64,6 +64,7 @@ #define MPIR_ERR_NO_TESTFS 36 #define MPIR_ERR_NO_LUSTRE 37 #define MPIR_ERR_NO_BGL 38 +#define MPIR_ERR_NO_OCEANFS 39 /* MPI_ERR_COMM */ #ifndef MPIR_ERR_COMM_NULL diff --git a/ompi/mca/io/romio321/romio/configure.ac b/ompi/mca/io/romio321/romio/configure.ac index 6d40359beb6..3b4f2cc8f3f 100644 --- a/ompi/mca/io/romio321/romio/configure.ac +++ b/ompi/mca/io/romio321/romio/configure.ac @@ -196,7 +196,7 @@ dnl An m4 macro for use with m4_foreach_w and friends. You should modify this dnl list if you want to add a known file system. The list is just whitespace dnl separated, so you can use newlines and tabs as well. m4_define([known_filesystems_m4_w], - [nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp lustre gpfs zoidfs hfs piofs sfs])dnl + [nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp lustre gpfs zoidfs hfs piofs sfs oceanfs])dnl dnl dnl An m4 macro for use with m4_foreach and friends. Expands to a quoted list of dnl quoted elements. A bit easier to use without unintended expansion than the @@ -1000,6 +1000,15 @@ if test -n "$file_system_zoidfs"; then ) fi +if test -n "$file_system_oceanfs"; then + CFLAGS="$CFLAGS -D_LARGEFILE64_SOURCE" + CPPFLAGS="$CPPFLAGS -I./adio/ad_oceanfs" + AC_CHECK_HEADERS(mpi_fs_intf.h, + AC_DEFINE(ROMIO_OCEANFS,1,[Define for ROMIO with OCEANFS]), + AC_MSG_ERROR([OCEANFS support requested but cannot find mpi_fs_intf.h header file]) + ) +fi + # # Verify presence of pvfs2.h #
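For reference, here is a minimal, self-contained sketch (not part of the patch) of how the mpi_fs_intf.h wrappers introduced above fit together: open a file, install a strided file view, and issue a view-based read. The file path, the two-block view layout, and the error handling are illustrative assumptions only; mpi_fs_view_read() can only succeed on an OceanFS mount whose kernel side implements the ODCS_IOC_MPI_VIEW_READ ioctl, so treat this as a sketch of the intended call sequence rather than a verified test program.

#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <fcntl.h>
#include "mpi_fs_intf.h"

int main(void)
{
    /* Hypothetical path on an OceanFS mount. */
    int fd = mpi_fs_open("/mnt/oceanfs/example.dat", O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
        return 1;
    }

    /* Hypothetical filetype: two 4-byte blocks at offsets 0 and 8 within a
     * 16-byte repeat unit (ub_off), starting at displacement 0. */
    u32 blocklens[2] = { 4, 4 };
    off_t blockoffs[2] = { 0, 8 };
    if (mpi_fs_set_fileview(fd, 0, 2, blocklens, blockoffs, 16) != 0) {
        mpi_fs_close(fd);
        return 1;
    }

    /* Read 8 bytes of view data starting at offset 0; the wrapper builds an
     * mpi_fs_view_read_t describing the strided ranges and hands it to the
     * file system via ioctl(). */
    char buf[8];
    struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
    int nread = mpi_fs_view_read(fd, 1, &iov, 0);
    if (nread >= 0) {
        printf("view read returned %d bytes\n", nread);
    }

    mpi_fs_close(fd);
    return 0;
}

The per-fd bookkeeping behind this sequence (the hash table of mpi_fh_handle_t entries keyed by fd) is maintained by mpi_fs_open() and mpi_fs_close(), so the installed view persists across calls without the caller carrying any extra state.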