diff --git a/Makefile.am b/Makefile.am
index 8e116c08a..6054407c4 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -21,6 +21,8 @@ EXTRA_DIST = COPYRIGHT \
              README \
              RELEASE_NOTES \
              m4/foreach.m4 \
+             m4/foreach_idx.m4 \
+             m4/list_len.m4 \
              m4/utils.m4

 # Below is a trick to build all test executables, without running them
diff --git a/configure.ac b/configure.ac
index a04d669c6..e65563916 100644
--- a/configure.ac
+++ b/configure.ac
@@ -141,6 +141,9 @@ dnl AH_TEMPLATE([ENABLE_IN_PLACE_SWAP], [Define if to enable in-place byte swap]
 dnl AH_TEMPLATE([DISABLE_IN_PLACE_SWAP],[Define if to disable in-place byte swap])
 AH_TEMPLATE([ENABLE_SUBFILING], [Define if to enable subfiling feature])
 AH_TEMPLATE([ENABLE_NETCDF4], [Define if to enable NetCDF-4 support])
+AH_TEMPLATE([ENABLE_CHUNKING], [Define if to enable chunked storage layout and chunking feature])
+AH_TEMPLATE([ENABLE_ZLIB], [Define if to enable zlib chunking method])
+AH_TEMPLATE([ENABLE_SZ], [Define if to enable sz chunking method])
 AH_TEMPLATE([ENABLE_ADIOS], [Define if to enable ADIOS BP read feature])
 AH_TEMPLATE([HDF5_VER_GE_1_10_4], [Define if HDF5 version is at least 1.10.4])
 AH_TEMPLATE([NETCDF_GE_4_5_0], [Define if NetCDF version is at least 4.5.0])
@@ -2506,6 +2509,135 @@ fi
 AC_SUBST(ENABLE_BURST_BUFFER)
 AM_CONDITIONAL(ENABLE_BURST_BUFFER, [test x$enable_bbdriver = xyes])

+AC_ARG_ENABLE([chunking],
+    [AS_HELP_STRING([--enable-chunking],
+        [Enable chunking driver support. @<:@default: disabled@:>@])],
+    [enable_chunking=${enableval}], [enable_chunking=no]
+)
+
+ENABLE_CHUNKING=0
+if test "x$enable_chunking" = "xyes" ; then
+    AC_DEFINE(ENABLE_CHUNKING)
+    ENABLE_CHUNKING=1
+fi
+AC_SUBST(ENABLE_CHUNKING)
+AM_CONDITIONAL(ENABLE_CHUNKING, [test x$enable_chunking = xyes])
+
+AC_ARG_ENABLE([zlib],
+    [AS_HELP_STRING([--enable-zlib],
+        [Enable zlib chunking method support. @<:@default: disabled@:>@])],
+    [enable_zlib=${enableval}], [enable_zlib=no]
+)
+
+ENABLE_ZLIB=0
+if test "x$enable_zlib" = "xyes" ; then
+    AC_DEFINE(ENABLE_ZLIB)
+    ENABLE_ZLIB=1
+fi
+AC_SUBST(ENABLE_ZLIB)
+AM_CONDITIONAL(ENABLE_ZLIB, [test x$enable_zlib = xyes])
+
+if test "x$enable_zlib" = "xyes" ; then
+    ZLIB_INSTALL=""
+    AC_ARG_WITH(zlib,
+        [AS_HELP_STRING([--with-zlib=/path/to/implementation],
+                        [installation prefix for zlib implementation])],
+        if test "x${withval}" = xyes; then
+            AC_MSG_ERROR(--with-zlib is set but the value is NULL)
+        else
+            ZLIB_INSTALL=${withval}
+        fi
+    )
+
+    if test "x${ZLIB_INSTALL}" != x ; then
+        CPPFLAGS+=" -I${ZLIB_INSTALL}/include"
+        LDFLAGS+=" -L${ZLIB_INSTALL}/lib"
+        LIBS+=" -lz"
+    fi
+
+    LIBS+=" -lm -ldl"
+
+    have_zlib=no
+    AC_MSG_CHECKING(ZLIB library)
+    AC_SEARCH_LIBS([deflate], [z], [have_zlib=yes], [have_zlib=no])
+    if test "x${have_zlib}" = xyes; then
+        AC_CHECK_HEADERS([zlib.h], [], [have_zlib=no])
+    fi
+
+    if test "x${have_zlib}" = xno; then
+        AC_MSG_ERROR([
+        ------------------------------------------------------------
+         The ZLIB library and header file are required to build
+         PnetCDF with ZLIB chunking support. Use option
+             --with-zlib=/path/to/implementation
+         to specify the location of ZLIB build.
+         Stopping ...
+         Check 'config.log' for more information.
+        ------------------------------------------------------------])
+    fi
+fi
+
+AC_ARG_ENABLE([sz],
+    [AS_HELP_STRING([--enable-sz],
+        [Enable sz chunking method support. @<:@default: disabled@:>@])],
+    [enable_sz=${enableval}], [enable_sz=no]
+)
+
+ENABLE_SZ=0
+if test "x$enable_sz" = "xyes" ; then
+    AC_DEFINE(ENABLE_SZ)
+    ENABLE_SZ=1
+fi
+AC_SUBST(ENABLE_SZ)
+AM_CONDITIONAL(ENABLE_SZ, [test x$enable_sz = xyes])
+
+has_compression=0
+if test "x${have_zlib}" = xyes || test "x$enable_sz" = "xyes" ; then
+    has_compression=1
+fi
+AC_DEFINE(ENABLE_COMPRESSION, [$has_compression], [Define if compression is enabled])
+AM_CONDITIONAL(ENABLE_COMPRESSION, [test x$has_compression = x1])
+
+if test "x$enable_sz" = "xyes" ; then
+    SZ_INSTALL=""
+    AC_ARG_WITH(sz,
+        [AS_HELP_STRING([--with-sz=/path/to/implementation],
+                        [installation prefix for sz implementation])],
+        if test "x${withval}" = xyes; then
+            AC_MSG_ERROR(--with-sz is set but the value is NULL)
+        else
+            SZ_INSTALL=${withval}
+        fi
+    )
+
+    if test "x${SZ_INSTALL}" != x ; then
+        CPPFLAGS+=" -I${SZ_INSTALL}/include"
+        LDFLAGS+=" -L${SZ_INSTALL}/lib"
+        LIBS+=" -lSZ -lzstd"
+    fi
+
+    LIBS+=" -lm -ldl"
+
+    have_sz=no
+    AC_MSG_CHECKING(SZ library)
+    AC_SEARCH_LIBS([SZ_compress], [SZ], [have_sz=yes], [have_sz=no])
+    if test "x${have_sz}" = xyes; then
+        AC_CHECK_HEADERS([sz.h], [], [have_sz=no])
+    fi
+
+    if test "x${have_sz}" = xno; then
+        AC_MSG_ERROR([
+        ------------------------------------------------------------
+         The SZ library and header file are required to build
+         PnetCDF with SZ chunking support. Use option
+             --with-sz=/path/to/implementation
+         to specify the location of SZ build.
+         Stopping ...
+         Check 'config.log' for more information.
+        ------------------------------------------------------------])
+    fi
+fi
+
 ADIOS_INSTALL=""
 AC_ARG_WITH(adios,
             [AS_HELP_STRING([--with-adios@<:@=DIR@:>@],
@@ -2827,6 +2959,7 @@ AC_CONFIG_FILES(Makefile \
                 src/drivers/nc4io/Makefile \
                 src/drivers/ncadios/Makefile \
                 src/drivers/ncbbio/Makefile \
+                src/drivers/ncchunkio/Makefile \
                 src/drivers/ncfoo/Makefile \
                 src/binding/Makefile \
                 src/binding/cxx/Makefile \
diff --git a/doc/README.Chunk.md b/doc/README.Chunk.md
new file mode 100644
index 000000000..fea6255cf
--- /dev/null
+++ b/doc/README.Chunk.md
@@ -0,0 +1,119 @@
+# Support variable chunking and compression
+
+PnetCDF contains an experimental variable chunking and compression feature
+for classic NetCDF files.
+
+For details about its design and implementation, please refer to:
+Hou, Kaiyuan, et al. "Supporting Data Compression in PnetCDF."
+2021 IEEE International Conference on Big Data (Big Data). IEEE, 2021.
+
+## Enable variable chunking support
+
+* To build PnetCDF with variable chunking support
+
+  Add `--enable-chunking` option at the configure command line. For example,
+  ```
+  ./configure --prefix=/PnetCDF/install/path --enable-chunking
+  ```
+* To build deflate filter support for chunked variables
+
+  Add `--enable-zlib` option at the configure command line. Option
+  `--with-zlib` can also be used to specify the installation path of
+  zlib if it is not in the standard locations. For example,
+  ```
+  ./configure --prefix=/PnetCDF/install/path --enable-chunking --enable-zlib \
+              --with-zlib=/zlib/install/path
+  ```
+* To build sz filter support for chunked variables
+
+  Add `--enable-sz` option at the configure command line. Option
+  `--with-sz` can also be used to specify the installation path of
+  sz if it is not in the standard locations.
+  For example,
+  ```
+  ./configure --prefix=/PnetCDF/install/path --enable-chunking --enable-sz \
+              --with-sz=/sz/install/path
+  ```
+
+## Enable variable chunking
+
+To enable the chunked storage layout for variables, set the file info
+"nc_chunking" to "enable". The chunking feature requires the 64-bit NetCDF
+format (CDF-5). For example,
+```
+    MPI_Info_create(&info);
+    MPI_Info_set(info, "nc_chunking", "enable");
+    ncmpi_create(MPI_COMM_WORLD, fname, NC_64BIT_DATA, info, &ncid);
+```
+Alternatively, the file info can be set through the environment variable
+"PNETCDF_HINTS".
+```
+export PNETCDF_HINTS="nc_chunking=enable"
+```
+When chunking is enabled, all non-scalar variables will be stored in a chunked
+storage layout. Scalar variables are not chunked.
+
+Users can also set the default filter for chunked variables. For example,
+```
+    MPI_Info_set(info, "nc_chunk_default_filter", "zlib");
+```
+or
+```
+export PNETCDF_HINTS="nc_chunking=enable;nc_chunk_default_filter=zlib"
+```
+The available filter options are: none (default), zlib (deflate), and sz.
+
+## Define chunk dimensions of variables
+
+Applications can use the following APIs to set and get the chunk dimensions of
+a variable.
+```
+    int ncmpi_var_set_chunk (int ncid, int varid, int *chunk_dim);
+    int ncmpi_var_get_chunk (int ncid, int varid, int *chunk_dim);
+```
+For example:
+```
+    int dim[2] = {100, 100};
+    int chunk_dim[2] = {10, 10};
+    ncmpi_def_var (ncid, name, type, 2, dim, &varid);
+    ncmpi_var_set_chunk (ncid, varid, chunk_dim);
+```
+For record variables, the chunk dimension along the record dimension is always
+1. The default chunk dimensions are the variable's own dimensions, excluding
+the record dimension. In other words, by default PnetCDF creates one chunk per
+record for record variables and one chunk per fixed-size variable.
+
+## Define filter for chunked variables
+
+Applications can use the following APIs to set and get the filter of a
+chunked variable.
+```
+#define NC_FILTER_NONE    0
+#define NC_FILTER_DEFLATE 2
+#define NC_FILTER_SZ      3
+int ncmpi_var_set_filter (int ncid, int varid, int filter);
+int ncmpi_var_get_filter (int ncid, int varid, int *filter);
+```
+For example:
+```
+    ncmpi_var_set_filter (ncid, varid, NC_FILTER_DEFLATE);
+```
+Valid filter values are NC_FILTER_NONE (none), NC_FILTER_DEFLATE (zlib), and
+NC_FILTER_SZ (sz).
+
+A combined example using the hints and APIs above appears in the appendix at
+the end of this file.
+
+## Known problems
+
+There are some limitations of the experimental variable chunking feature.
+
+* Only one filter can be applied to a chunked variable. Unlike HDF5, which
+  allows stacking multiple filters on chunked datasets, the current
+  implementation in PnetCDF only allows a single filter to be applied to a
+  variable.
+* No per-variable option for variable chunking. If chunking is enabled, all
+  non-scalar variables will be chunked even if the chunk dimension is not
+  defined.
+* Independent variable I/O is not supported. Variable read/write (get/put)
+  must be collective in order to maintain data consistency of filtered chunks.
+  Non-blocking APIs can be used to mitigate the impact of this limitation.
+
+Copyright (C) 2022, Northwestern University and Argonne National Laboratory
+
+See the COPYRIGHT notice in the top-level directory.
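+
+## Appendix: a combined example
+
+The sketch below pulls the pieces above into one minimal program. It is only
+an illustration: the dimension and variable names (`Y`, `X`, `var`), the
+10x10 array size, and the row partitioning (which assumes the number of
+processes divides 10 evenly) are made up for this example, and error checking
+is omitted for brevity.
+```
+#include <stdlib.h>
+#include <mpi.h>
+#include <pnetcdf.h>
+
+int main(int argc, char **argv) {
+    int rank, nprocs, ncid, dimid[2], varid, i;
+    int chunk_dim[2] = {10, 10};   /* one 10x10 chunk for the whole variable */
+    MPI_Offset start[2], count[2];
+    MPI_Info info;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    /* enable the chunking driver and pick zlib as the default filter */
+    MPI_Info_create(&info);
+    MPI_Info_set(info, "nc_chunking", "enable");
+    MPI_Info_set(info, "nc_chunk_default_filter", "zlib");
+
+    /* the chunking feature requires the CDF-5 format */
+    ncmpi_create(MPI_COMM_WORLD, "testfile.nc", NC_64BIT_DATA, info, &ncid);
+    MPI_Info_free(&info);
+
+    ncmpi_def_dim(ncid, "Y", 10, &dimid[0]);
+    ncmpi_def_dim(ncid, "X", 10, &dimid[1]);
+    ncmpi_def_var(ncid, "var", NC_INT, 2, dimid, &varid);
+
+    /* per-variable chunk dimensions and filter override the defaults */
+    ncmpi_var_set_chunk(ncid, varid, chunk_dim);
+    ncmpi_var_set_filter(ncid, varid, NC_FILTER_DEFLATE);
+
+    ncmpi_enddef(ncid);
+
+    /* block-partition the rows among processes; writes must be collective
+     * (assumes nprocs divides 10 evenly) */
+    count[0] = 10 / nprocs;      count[1] = 10;
+    start[0] = rank * count[0];  start[1] = 0;
+    int *buf = (int*) malloc(sizeof(int) * count[0] * count[1]);
+    for (i=0; i<count[0]*count[1]; i++) buf[i] = rank;
+    ncmpi_put_vara_int_all(ncid, varid, start, count, buf);
+
+    ncmpi_close(ncid);
+    free(buf);
+    MPI_Finalize();
+    return 0;
+}
+```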
+
diff --git a/examples/C/Makefile.am b/examples/C/Makefile.am
index a987e578f..e940ab09f 100644
--- a/examples/C/Makefile.am
+++ b/examples/C/Makefile.am
@@ -42,6 +42,10 @@ check_PROGRAMS = collective_write \
                  time_var \
                  create_from_cdl

+if ENABLE_COMPRESSION
+   check_PROGRAMS += chunk_compress chunk_io chunk_2D
+endif
+
 if INSTALL_EXAMPLES
    example_execbin_PROGRAMS = $(check_PROGRAMS)
    example_execbindir = $(exec_prefix)/pnetcdf_examples/C
@@ -84,7 +88,8 @@ NC_FILES = $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc) \
 CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out \
              $(NC_FILES) $(TESTOUTDIR)/pthread.nc.* $(TESTOUTDIR)/testfile.nc

-EXTRA_DIST = run_c_examples.sh cdl_header.txt
+EXTRA_DIST = run_c_examples.sh cdl_header.txt \
+             parallel_run.sh chunk_compress_FLDS.c

 ptest ptest4: $(check_PROGRAMS)
	@echo "==========================================================="
diff --git a/examples/C/chunk_2D.c b/examples/C/chunk_2D.c
new file mode 100644
index 000000000..c199531c4
--- /dev/null
+++ b/examples/C/chunk_2D.c
@@ -0,0 +1,628 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ *
+ *  Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ *  See COPYRIGHT notice in top-level directory.
+ *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * This example shows how to use the chunking and compression features of
+ * PnetCDF to write a 3D record variable of integer data type in parallel. It
+ * first defines netCDF variables, each of size NTIMES x NY x NX, where NTIMES,
+ * NY, and NX are predefined constants.
+ *
+ * The data partitioning pattern is a checkerboard style, along both Y and X
+ * dimensions. Each process writes a subarray per time record.
+ *
+ * To compile:
+ *        mpicc -O2 chunk_2D.c -o chunk_2D \
+ *              -I/path/to/PnetCDF/include \
+ *              -I/path/to/ZLIB/include \
+ *              -I/path/to/SZ/include \
+ *              -L/path/to/PnetCDF/lib \
+ *              -L/path/to/ZLIB/lib \
+ *              -L/path/to/SZ/lib \
+ *              -lpnetcdf -lz -lm -ldl -lSZ -lzstd
+ *
+ * Example commands for MPI run and outputs from running ncmpidump on the
+ * output netCDF file produced by this example program:
+ *
+ *    % mpiexec -n 4 ./chunk_2D testfile.nc
+ *
+ *    % ncmpidump testfile.nc
+ *    netcdf testfile {
+ *    // file format: CDF-5 (big variables)
+ *    dimensions:
+ *            time = UNLIMITED ; // (0 currently)
+ *            Y = 10 ;
+ *            X = 10 ;
+ *            _datablock_dim_0 = 131484 ;
+ *            _datablock_dim_1 = 412 ;
+ *            _datablock_dim_2 = 412 ;
+ *            _datablock_dim_3 = 412 ;
+ *            _datablock_dim_4 = 412 ;
+ *            _datablock_dim_5 = 412 ;
+ *            _datablock_dim_6 = 412 ;
+ *            _datablock_dim_7 = 412 ;
+ *    variables:
+ *            int var_0 ;
+ *                    var_0:_ndim = 3 ;
+ *                    var_0:_dimids = 0, 1, 2 ;
+ *                    var_0:_datatype = 4 ;
+ *                    var_0:_varkind = 1 ;
+ *                    var_0:_chunkdim = 1, 5, 5 ;
+ *                    var_0:_filter = 2 ;
+ *                    var_0:_metaoffset = 8LL ;
+ *            int var_1 ;
+ *                    var_1:_ndim = 3 ;
+ *                    var_1:_dimids = 0, 1, 2 ;
+ *                    var_1:_datatype = 4 ;
+ *                    var_1:_varkind = 1 ;
+ *                    var_1:_chunkdim = 1, 5, 5 ;
+ *                    var_1:_filter = 2 ;
+ *                    var_1:_metaoffset = 65544LL ;
+ *            byte _datablock_0(_datablock_dim_0) ;
+ *                    _datablock_0:_varkind = 2 ;
+ *            byte _datablock_1(_datablock_dim_1) ;
+ *                    _datablock_1:_varkind = 2 ;
+ *            byte _datablock_2(_datablock_dim_2) ;
+ *                    _datablock_2:_varkind = 2 ;
+ *            byte _datablock_3(_datablock_dim_3) ;
+ *                    _datablock_3:_varkind = 2 ;
+ *            byte _datablock_4(_datablock_dim_4) ;
+ *                    _datablock_4:_varkind = 2 ;
+ *            byte _datablock_5(_datablock_dim_5) ;
+ *                    _datablock_5:_varkind = 2 ;
+ *            byte _datablock_6(_datablock_dim_6) ;
+ *                    _datablock_6:_varkind = 2 ;
+ *            byte _datablock_7(_datablock_dim_7) ;
+ *                    _datablock_7:_varkind = 2 ;
+ *
+ *    // global attributes:
+ *                    :_comressed = 1 ;
+ *                    :_nwrite = 8 ;
+ *                    :_recsize = 8LL ;
+ *    }
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* strcpy(), strncpy() */
+#include <unistd.h> /* getopt() */
+#include <mpi.h>
+#include <pnetcdf.h>
+
+#define NTIMES 8
+#define NY 10
+#define NX 10
+#define NVARS 2
+
+static int verbose;
+
+#define PNC_ERR(fname) {                                                    \
+    if (err != NC_NOERR) {                                                  \
+        printf("Error at %s:%d when calling %s (%s)\n", __FILE__,__LINE__,  \
+               fname, ncmpi_strerror(err));                                 \
+        nerrs++;                                                            \
+        goto err_out;                                                       \
+    }                                                                       \
+}
+
+#define MPI_ERROR(fname) {                                                  \
+    if (err != MPI_SUCCESS) {                                               \
+        int errorStringLen;                                                 \
+        char errorString[MPI_MAX_ERROR_STRING];                             \
+        MPI_Error_string(err, errorString, &errorStringLen);                \
+        printf("Error at %s:%d when calling %s (%s)\n", __FILE__,__LINE__,  \
+               fname, errorString);                                         \
+        nerrs++;                                                            \
+        goto err_out;                                                       \
+    }                                                                       \
+}
+
+#define CALC_START_COUNT(len, nprocs, rank, start, count) {                 \
+    count = len / nprocs;                                                   \
+    start = count * rank;                                                   \
+    if (rank < len % nprocs) {                                              \
+        start += rank;                                                      \
+        count++;                                                            \
+    }                                                                       \
+    else {                                                                  \
+        start += len % nprocs;                                              \
+    }                                                                       \
+}
+
+static void
+usage(char *argv0)
+{
+    char *help =
+    "Usage: %s [-h] | [-q] [-k format] [file_name]\n"
+    "       [-h] Print help\n"
+    "       [-q] Quiet mode (reports when fail)\n"
+    "       [-k format] file format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5\n"
+    "       [filename] output netCDF file name\n";
+    fprintf(stderr, help, argv0);
+}
+
+/*----< pnetcdf_check_mem_usage() >------------------------------------------*/
+/* check PnetCDF library internal memory usage */
+static int
+pnetcdf_check_mem_usage(MPI_Comm comm)
+{
+    int err, nerrs=0, rank;
MPI_Offset malloc_size, sum_size; + + MPI_Comm_rank(comm, &rank); + + /* print info about PnetCDF internal malloc usage */ + err = ncmpi_inq_malloc_max_size(&malloc_size); + if (err == NC_NOERR) { + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && verbose) + printf("maximum heap memory allocated by PnetCDF internally is %lld bytes\n", + sum_size); + + /* check if there is any PnetCDF internal malloc residue */ + err = ncmpi_inq_malloc_size(&malloc_size); + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && sum_size > 0) + printf("heap memory allocated by PnetCDF internally has %lld bytes yet to be freed\n", + sum_size); + } + else if (err != NC_ENOTENABLED) { + printf("Error at %s:%d: %s\n", __FILE__,__LINE__,ncmpi_strerror(err)); + nerrs++; + } + return nerrs; +} + +/*----< compress() >--------------------------------------------------------*/ +static int +compress(MPI_Comm comm, char *filename, int cmode) +{ + char name[64]; + int i, j, rank, nprocs, err, nerrs=0, ncid, varid[NVARS]; + int dimid[3], psize[2], rank_y, rank_x; + MPI_Offset start[3], count[3]; + MPI_Info info; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + /* Creates a division of processors in a Cartesian grid */ + psize[0] = psize[1] = 0; + err = MPI_Dims_create(nprocs, 2, psize); + MPI_ERROR("MPI_Dims_create"); + if (verbose && rank == 0) + printf("MPI_Dims_create() 2D: psize=%d %d\n", psize[0],psize[1]); + + /* set rank along X and Y */ + rank_y = rank / psize[1]; + rank_x = rank % psize[1]; + if (verbose && rank == 0) + printf("Local rank 2D: rank_y=%d rank_x=%d\n", rank_y, rank_x); + + /* set chunking (1st dimension should always be 1 for record variable) */ + int chunk_dim[3]; + chunk_dim[0] = 1; + chunk_dim[1] = NY / psize[0]; + if (NY % psize[0]) chunk_dim[1]++; + chunk_dim[2] = NX / psize[1]; + if (NX % psize[1]) chunk_dim[2]++; + if (verbose && rank == 0) + printf("chunk_dim: %d %d %d\n", chunk_dim[0],chunk_dim[1],chunk_dim[2]); + + /* set subarray start and count. Each rank writes a subarray of size + * count[0] x count[1] from offset start[0], start[1], a checkerboard + * partitioning pattern. 
+ */ + CALC_START_COUNT(NY, psize[0], rank_y, start[1], count[1]) + CALC_START_COUNT(NX, psize[1], rank_x, start[2], count[2]) + start[0] = 0; + count[0] = 1; + if (verbose) + printf("rank %d: start=%lld %lld %lld count=%lld %lld %lld\n", rank, + start[0],start[1],start[2], count[0],count[1],count[2]); + + /* allocate write buffer of size count[1] x count[2] */ + int *buf = (int*) malloc(sizeof(int) * count[1] * count[2]); + for (i=0; i-------------------------------------------------------*/ +static int +decompress(MPI_Comm comm, char *filename) +{ + char name[64]; + int i, j, rank, nprocs, err, nerrs=0, ncid, *varid, ulimit_dimid; + int nvars, dimids[3], filter, chunk_dim[3], psize[2], rank_y, rank_x; + MPI_Offset nrecs, global_ny, global_nx; + MPI_Offset start[3], count[3]; + MPI_Info info; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + /* Creates a division of processors in a Cartesian grid */ + psize[0] = psize[1] = 0; + err = MPI_Dims_create(nprocs, 2, psize); + MPI_ERROR("MPI_Dims_create"); + if (verbose && rank == 0) + printf("MPI_Dims_create() 2D: psize=%d %d\n", psize[0],psize[1]); + + /* set rank along X and Y */ + rank_y = rank / psize[1]; + rank_x = rank % psize[1]; + if (verbose && rank == 0) + printf("Local rank 2D: rank_y=%d rank_x=%d\n", rank_y, rank_x); + + /* open the file for reading with chunking and compression enabled */ + MPI_Info_create(&info); + MPI_Info_set(info, "nc_chunking", "enable"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + err = ncmpi_open(comm, filename, NC_NOWRITE, info, &ncid); + PNC_ERR("ncmpi_open") + + MPI_Info_free(&info); + + /* obtain dimension info */ + err = ncmpi_inq_dimid(ncid, "Y", &dimids[1]); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimid(ncid, "X", &dimids[2]); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimlen(ncid, dimids[1], &global_ny); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimlen(ncid, dimids[2], &global_nx); + PNC_ERR("ncmpi_inq_dimlen") + + /* obtain the number of record variables */ + err = ncmpi_inq_num_rec_vars(ncid, &nvars); + PNC_ERR("ncmpi_inq_num_rec_vars") + if (verbose && rank == 0) + printf("Number of record variables = %d\n", nvars); + + varid = (int*) malloc(sizeof(int) * nvars); + + /* obtain variable ID and dimension info */ + for (i=0; i---------------------------------------------------*/ +/* Use block-partitioning along time dimension only, i.e. each entire record of + * a variable is read by one process only. Each process may read one or more + * time records of a variable. 
+ */ +static int +partition_time(MPI_Comm comm, char *filename) +{ + char name[64]; + int i, rank, nprocs, err, nerrs=0, ncid, *varid, ulimit_dimid; + int nvars, dimids[3], filter, chunk_dim[3]; + MPI_Offset nrecs, global_ny, global_nx; + MPI_Offset start[3], count[3]; + MPI_Info info; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + /* open the file for reading with chunking and compression enabled */ + MPI_Info_create(&info); + MPI_Info_set(info, "nc_chunking", "enable"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + err = ncmpi_open(comm, filename, NC_NOWRITE, info, &ncid); + PNC_ERR("ncmpi_open") + + MPI_Info_free(&info); + + /* obtain dimension info */ + err = ncmpi_inq_dimid(ncid, "Y", &dimids[1]); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimid(ncid, "X", &dimids[2]); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimlen(ncid, dimids[1], &global_ny); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimlen(ncid, dimids[2], &global_nx); + PNC_ERR("ncmpi_inq_dimlen") + + /* obtain the number of record variables */ + err = ncmpi_inq_num_rec_vars(ncid, &nvars); + PNC_ERR("ncmpi_inq_num_rec_vars") + if (verbose && rank == 0) + printf("Number of record variables = %d\n", nvars); + + varid = (int*) malloc(sizeof(int) * nvars); + + /* obtain variable ID and dimension info */ + for (i=0; i +#include +#include /* strcpy(), strncpy() */ +#include /* getopt() */ +#include /* time() localtime(), asctime() */ +#include +#include + +#define NY 10 +#define NX 4 + +static int verbose; + +#define ERR {if(err!=NC_NOERR){printf("Error at %s:%d : %s\n", __FILE__,__LINE__, ncmpi_strerror(err));nerrs++;}} + +static void +usage(char *argv0) +{ + char *help = + "Usage: %s [-h] | [-q] [-k format] [file_name]\n" + " [-h] Print help\n" + " [-q] Quiet mode (reports when fail)\n" + " [-k format] file format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5\n" + " [filename] output netCDF file name\n"; + fprintf(stderr, help, argv0); +} + +/*----< pnetcdf_check_mem_usage() >------------------------------------------*/ +/* check PnetCDF library internal memory usage */ +static int +pnetcdf_check_mem_usage(MPI_Comm comm) +{ + int err, nerrs=0, rank; + MPI_Offset malloc_size, sum_size; + + MPI_Comm_rank(comm, &rank); + + /* print info about PnetCDF internal malloc usage */ + err = ncmpi_inq_malloc_max_size(&malloc_size); + if (err == NC_NOERR) { + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && verbose) + printf("maximum heap memory allocated by PnetCDF internally is %lld bytes\n", + sum_size); + + /* check if there is any PnetCDF internal malloc residue */ + err = ncmpi_inq_malloc_size(&malloc_size); + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && sum_size > 0) + printf("heap memory allocated by PnetCDF internally has %lld bytes yet to be freed\n", + sum_size); + } + else if (err != NC_ENOTENABLED) { + printf("Error at %s:%d: %s\n", __FILE__,__LINE__,ncmpi_strerror(err)); + nerrs++; + } + return nerrs; +} + +/*----< pnetcdf_io() >-------------------------------------------------------*/ +static int +pnetcdf_io(MPI_Comm comm, char *filename, int cmode) +{ + int i, j, rank, nprocs, err, nerrs=0; + int ncid, varid, dimid[3], buf[NY][NX]; + MPI_Offset global_ny, global_nx; + MPI_Offset start[3], count[3]; + MPI_Info info; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + MPI_Info_create(&info); + MPI_Info_set(info, 
"nc_chunking", "enable"); + MPI_Info_set(info, "nc_chunk_default_filter", "zlib"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + /* create a new file for writing ----------------------------------------*/ + cmode |= NC_CLOBBER; + err = ncmpi_create(comm, filename, cmode, info, &ncid); ERR + MPI_Info_free(&info); + + /* the global array is NY * (NX * nprocs) */ + global_ny = NY; + global_nx = NX * nprocs; + + for (i=0; i 0); +} + diff --git a/examples/C/chunk_compress_FLDS.c b/examples/C/chunk_compress_FLDS.c new file mode 100644 index 000000000..58a5493c0 --- /dev/null +++ b/examples/C/chunk_compress_FLDS.c @@ -0,0 +1,362 @@ +/********************************************************************* + * + * Copyright (C) 2026, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + * + * This examples reads the variable FLDS from an output file generated from a + * production run of E3SM Land Model, then compress it using ZLIB and writes + * to a new file. This program is designed to demonstrate the usage of the + * chunking-compression feature of PnetCDF. Note the input file can also be a + * chunked-compressed PnetCDF file. + * + * The FLDS input file has the following metadata. + * + * // file format: CDF-5 (big variables) + * dimensions: + * x = 7814 ; + * y = 8075 ; + * time = UNLIMITED ; // (248 currently) + * variables: + * float x(x) ; + * float y(y) ; + * float time(time) ; + * float FLDS(time, y, x) ; + * FLDS:long_name = "incident longwave radiation" ; + * FLDS:units = "W/m**2" ; + * } + *********************************************************************/ + +#include +#include +#include +#include /* getopt() */ +#include +#include + +static int verbose; + +#define ERR { \ + if (err != NC_NOERR) { \ + printf("Error at %s:%d : %s\n", __FILE__,__LINE__, \ + ncmpi_strerror(err)); \ + nerrs++; \ + goto err_out; \ + } \ +} + +static void +usage(char *argv0) +{ + char *help = + "Usage: %s [-h | -q | -t | -c] [-k format] -i in_file -o out_file]n" + " [-h] Print help\n" + " [-q] Quiet mode (reports when fail)\n" + " [-k format] file format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5\n" + " [-t]: data partitioning along time dimension (default: no)\n" + " [-c]: use cyclic partitioning pattern, only relevant when -t is used (default: block)\n" + " -i filename: input netCDF file name\n" + " -o filename: output netCDF file name\n"; + fprintf(stderr, help, argv0); +} + +/*----< pnetcdf_io() >-------------------------------------------------------*/ +static int +pnetcdf_io(MPI_Comm comm, + const char *in_path, + const char *out_path, + int cmode, + int div_time, + int parti) +{ + int i, rank, nprocs, err, nerrs=0; + int ncid, varid, dimid[3], ntimes, chunk_dim[3]; + float *buf = NULL, *buf_ptr; + double timing[2], max_t[2]; + MPI_Offset tlen, ylen, xlen, start[3], count[3], amnt[2], sum_amnt[2]; + MPI_Info info=MPI_INFO_NULL; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + MPI_Info_create(&info); + MPI_Info_set(info, "nc_chunking", "enable"); + MPI_Info_set(info, "nc_chunk_default_filter", "zlib"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + /* open the input file file */ + err = ncmpi_open(comm, in_path, NC_NOWRITE, info, &ncid); ERR + err = ncmpi_inq_dimid(ncid, "time", &dimid[0]); ERR + err = ncmpi_inq_dimid(ncid, "y", &dimid[1]); ERR + err = ncmpi_inq_dimid(ncid, "x", &dimid[2]); ERR + + err = 
ncmpi_inq_dimlen(ncid, dimid[0], &tlen); ERR + err = ncmpi_inq_dimlen(ncid, dimid[1], &ylen); ERR + err = ncmpi_inq_dimlen(ncid, dimid[2], &xlen); ERR + + err = ncmpi_inq_varid(ncid, "FLDS", &varid); ERR + + if (div_time) { + /* partition along time dimension */ + chunk_dim[0] = 1; + chunk_dim[1] = ylen; + chunk_dim[2] = xlen; + + ntimes = tlen / nprocs; + if (rank < tlen % nprocs) + ntimes++; + + if (parti) { /* block partitioning */ + start[0] = (tlen / nprocs) * rank; + if (rank < tlen % nprocs) + start[0] += rank; + else + start[0] += tlen % nprocs; + } + else + start[0] = rank; /* cyclic partitioning */ + + start[1] = 0; + start[2] = 0; + + count[0] = 1; + count[1] = ylen; + count[2] = xlen; + } + else { + /* checkerboard partitioning on every time step */ + int psize[2], yrank, xrank; + + chunk_dim[0] = 1; + chunk_dim[1] = 1010; + chunk_dim[2] = 977; + + /* Creates a division of processors in a Cartesian grid */ + psize[0] = psize[1] = 0; + MPI_Dims_create(nprocs, 2, psize); + + yrank = rank / psize[1]; + xrank = rank % psize[1]; + + if (verbose) { + if (rank == 0) printf("psize %d %d\n", psize[0],psize[1]); + printf("%2d: yrank %d xrank %d\n", rank, yrank, xrank); + } + + ntimes = tlen; + + start[0] = 0; + count[0] = 1; + + count[1] = ylen / psize[0]; + start[1] = count[1] * yrank; + if (yrank < ylen % psize[0]) { + start[1] += yrank; + count[1]++; + } + else + start[1] += ylen % psize[0]; + + count[2] = xlen / psize[1]; + start[2] = count[2] * xrank; + if (xrank < xlen % psize[1]) { + start[2] += xrank; + count[2]++; + } + else + start[2] += xlen % psize[1]; + } + + if (verbose) { + printf("%2d: ntimes %d start %lld %lld %lld count %lld %lld %lld end %lld %lld\n", + rank,ntimes,start[0],start[1],start[2],count[0],count[1],count[2], + start[1]+count[1],start[2]+count[2]); + fflush(stdout); + } + + /* allocate read buffer */ + buf = (float*) malloc(sizeof(float) * ntimes*count[1]*count[2]); + + MPI_Barrier(MPI_COMM_WORLD); + timing[0] = MPI_Wtime(); + + buf_ptr = buf; + for (i=0; i 0); +} + diff --git a/examples/C/chunk_io.c b/examples/C/chunk_io.c new file mode 100644 index 000000000..82d964b54 --- /dev/null +++ b/examples/C/chunk_io.c @@ -0,0 +1,332 @@ +/********************************************************************* + * + * Copyright (C) 2013, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + * + *********************************************************************/ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * This example shows how to use the chunking and compression features of + * PnetCDF to write a 3D record variable of integer data type in parallel. It + * first defines a netCDF variable of size + * (NTIMES * nprocs) x (NY * nprocs) x NX + * where NTIMES, NY, and NX are predefined constant. + * The data partitioning pattern for write is along Y dimension. Each process + * writes a subarray of size (NY * NX) per record. 
+ *
+ * To compile:
+ *        mpicc -O2 chunk_io.c -o chunk_io -lpnetcdf
+ *
+ * Example commands for MPI run and outputs from running ncmpidump on the
+ * output netCDF file produced by this example program:
+ *
+ *    % mpiexec -n 4 ./chunk_io testfile.nc
+ *
+ *    % ncmpidump testfile.nc
+ *    netcdf testfile {
+ *    // file format: CDF-5 (big variables)
+ *    dimensions:
+ *            time = UNLIMITED ; // (0 currently) <-- Not used anymore
+ *            Y = 8 ;
+ *            X = 10 ;
+ *            _datablock_dim_0 = 65721 ;
+ *            _datablock_dim_1 = 185 ;
+ *            _datablock_dim_2 = 185 ;
+ *            _datablock_dim_3 = 185 ;
+ *            _datablock_dim_4 = 185 ;
+ *            _datablock_dim_5 = 185 ;
+ *            _datablock_dim_6 = 185 ;
+ *            _datablock_dim_7 = 185 ;
+ *    variables:
+ *            int var ;
+ *                    var:_ndim = 3 ;
+ *                    var:_dimids = 0, 1, 2 ;
+ *                    var:_datatype = 4 ;
+ *                    var:_varkind = 1 ;
+ *                    var:_chunkdim = 1, 2, 10 ;
+ *                    var:_filter = 2 ;
+ *                    var:_metaoffset = 4LL ;
+ *            byte _datablock_0(_datablock_dim_0) ;
+ *                    _datablock_0:_varkind = 2 ;
+ *            byte _datablock_1(_datablock_dim_1) ;
+ *                    _datablock_1:_varkind = 2 ;
+ *            byte _datablock_2(_datablock_dim_2) ;
+ *                    _datablock_2:_varkind = 2 ;
+ *            byte _datablock_3(_datablock_dim_3) ;
+ *                    _datablock_3:_varkind = 2 ;
+ *            byte _datablock_4(_datablock_dim_4) ;
+ *                    _datablock_4:_varkind = 2 ;
+ *            byte _datablock_5(_datablock_dim_5) ;
+ *                    _datablock_5:_varkind = 2 ;
+ *            byte _datablock_6(_datablock_dim_6) ;
+ *                    _datablock_6:_varkind = 2 ;
+ *            byte _datablock_7(_datablock_dim_7) ;
+ *                    _datablock_7:_varkind = 2 ;
+ *
+ *    // global attributes:
+ *                    :_comressed = 1 ;
+ *                    :_nwrite = 8 ;
+ *                    :_recsize = 8LL ;
+ *    }
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* strcpy(), strncpy() */
+#include <unistd.h> /* getopt() */
+#include <time.h>   /* time() localtime(), asctime() */
+#include <mpi.h>
+#include <pnetcdf.h>
+
+#define NTIMES 2
+#define NY 2
+#define NX 10
+
+static int verbose;
+
+#define ERR {if(err!=NC_NOERR){printf("Error at %s:%d : %s\n", __FILE__,__LINE__, ncmpi_strerror(err));nerrs++;}}
+
+static void
+usage(char *argv0)
+{
+    char *help =
+    "Usage: %s [-h] | [-q] [-k format] [file_name]\n"
+    "       [-h] Print help\n"
+    "       [-q] Quiet mode (reports when fail)\n"
+    "       [-k format] file format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5\n"
+    "       [filename] output netCDF file name\n";
+    fprintf(stderr, help, argv0);
}
+
+/*----< pnetcdf_check_mem_usage() >------------------------------------------*/
+/* check PnetCDF library internal memory usage */
+static int
+pnetcdf_check_mem_usage(MPI_Comm comm)
+{
+    int err, nerrs=0, rank;
+    MPI_Offset malloc_size, sum_size;
+
+    MPI_Comm_rank(comm, &rank);
+
+    /* print info about PnetCDF internal malloc usage */
+    err = ncmpi_inq_malloc_max_size(&malloc_size);
+    if (err == NC_NOERR) {
+        MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD);
+        if (rank == 0 && verbose)
+            printf("maximum heap memory allocated by PnetCDF internally is %lld bytes\n",
+                   sum_size);
+
+        /* check if there is any PnetCDF internal malloc residue */
+        err = ncmpi_inq_malloc_size(&malloc_size);
+        MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD);
+        if (rank == 0 && sum_size > 0)
+            printf("heap memory allocated by PnetCDF internally has %lld bytes yet to be freed\n",
+                   sum_size);
+    }
+    else if (err != NC_ENOTENABLED) {
+        printf("Error at %s:%d: %s\n", __FILE__,__LINE__,ncmpi_strerror(err));
+        nerrs++;
+    }
+    return nerrs;
+}
+
+/*----< compress() >--------------------------------------------------------*/
+static int
+compress(MPI_Comm comm, char *filename, int cmode)
+{
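+    /* Enable the chunking driver through the "nc_chunking" hint and select
+     * zlib as the default filter, then create the file and write one
+     * NY x NX subarray per record collectively, partitioned along the Y
+     * dimension as described in the header comment above. */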
+    int i, j, rank, nprocs, err, nerrs=0;
+    int ncid, varid, dimid[3];
+    MPI_Offset global_ny, global_nx;
+    MPI_Offset start[3], count[3];
+    MPI_Info info;
+
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &nprocs);
+
+    MPI_Info_create(&info);
+    MPI_Info_set(info, "nc_chunking", "enable");
+    MPI_Info_set(info, "nc_chunk_default_filter", "zlib");
+
+    /* chunking is supported only when MPI-IO driver is used */
+    MPI_Info_set(info, "nc_pncio", "disable");
+
+    /* the global array is (NTIMES * nprocs) x (NY * nprocs) x NX */
+
+    /* set chunking (1st dimension should always be 1 for record variable) */
+
+    int *buf = (int*) malloc(sizeof(int) * NY * NX);
+    for (i=0; i-------------------------------------------------------*/
+static int
+decompress(MPI_Comm comm, char *filename)
+{
+    int i, j, rank, nprocs, err, nerrs=0;
+    int ncid, varid, dimid, filter, chunk_dim[3];
+    MPI_Offset global_ny, global_nx;
+    MPI_Offset start[3], count[3];
+    MPI_Info info;
+
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &nprocs);
+
+    MPI_Info_create(&info);
+    MPI_Info_set(info, "nc_chunking", "enable");
+
+    /* chunking is supported only when MPI-IO driver is used */
+    MPI_Info_set(info, "nc_pncio", "disable");
+
+    /* the global array is (NTIMES * nprocs) x (NY * nprocs) x NX */
+
+    /* open the file for reading ----------------------------------------*/
+    err = ncmpi_open(comm, filename, NC_NOWRITE, info, &ncid); ERR
+
+    err = ncmpi_inq_varid(ncid, "var", &varid); ERR
+
+    /* check the current record dimension size */
+    MPI_Offset dim_len;
+    err = ncmpi_inq_unlimdim(ncid, &dimid); ERR
+    err = ncmpi_inq_dimlen(ncid, dimid, &dim_len); ERR
+    if (verbose && rank == 0)
+        printf("Time dimension length = %lld\n", dim_len);
+
+    /* get chunking */
+    err = ncmpi_var_get_chunk(ncid, varid, chunk_dim); ERR
+    if (verbose && rank == 0)
+        printf("chunk_dim[3]=%d %d %d\n",
+               chunk_dim[0],chunk_dim[1],chunk_dim[2]);
+
+    /* get filter */
+    err = ncmpi_var_get_filter(ncid, varid, &filter); ERR
+    if (verbose && rank == 0)
+        printf("filter is %s\n", (filter == NC_FILTER_DEFLATE) ?
+               "NC_FILTER_DEFLATE": (filter == NC_FILTER_SZ) ?
+               "NC_FILTER_SZ" : "UNKNOWN");
+
+    /* set subarray start and count. Each rank reads a whole record at a time
+     * for NTIMES times. Each process reads different records.
+ */ + start[0] = rank; start[1] = 0; start[2] = 0; + count[0] = 1; count[1] = NY*nprocs; count[2] = NX; + + if (verbose) + printf("%d: start=%lld %lld %lld count=%lld %lld %lld\n", rank, + start[0], start[1], start[2], count[0], count[1], count[2]); + + int *buf; + buf = (int*) malloc(sizeof(int) * count[0] * count[1] * count[2]); + for (j=0; j 0); +} + diff --git a/m4/foreach_idx.m4 b/m4/foreach_idx.m4 new file mode 100644 index 000000000..fccde380c --- /dev/null +++ b/m4/foreach_idx.m4 @@ -0,0 +1,7 @@ +divert(`-1') +# foreach_idx(x, idx, (item_1, item_2, ..., item_n), stmt) +# parenthesized list, simple version +define(`foreach_idx', `pushdef(`$1')pushdef(`$2')_foreach_idx($@,0)popdef(`$2')popdef(`$1')') +define(`_arg1', `$1') +define(`_foreach_idx', `ifelse(`$3', `()', `',`define(`$1', _arg1$3)define(`$2', `$5')$4`'$0(`$1', `$2', (shift$3), `$4',incr($5))')') +divert`'dnl \ No newline at end of file diff --git a/m4/libtool.m4 b/m4/libtool.m4 index 707b20f3e..8d323b3ee 100644 --- a/m4/libtool.m4 +++ b/m4/libtool.m4 @@ -115,60 +115,10 @@ func_cc_basename () compile | *[[\\/]]compile | ccache | *[[\\/]]ccache ) ;; distcc | *[[\\/]]distcc | purify | *[[\\/]]purify ) ;; \-*) ;; - mpicc | mpicxx | mpif77 | mpif90 | mpifort | *[[\\/]]mpicc | *[[\\/]]mpicxx | *[[\\/]]mpif77 | *[[\\/]]mpif90 | *[[\\/]]mpifort ) - # MPICH compilers - # eval "$cc_temp -show" < /dev/null >& conftest.ver - # func_cc_basename_result=`head -n1 conftest.ver |cut -d' ' -f1` - # ${RM} -f conftest.ver - func_cc_basename_result=`$cc_temp -show | cut -d' ' -f1 | xargs basename` - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" - return - ;; - mpifccpx | mpiFCCpx | mpifrtpx | *[[\\/]]mpifccpx | *[[\\/]]mpiFCCpx | *[[\\/]]mpifrtpx ) - # MPI compilers based on Fujitsu compilers: fccpx, FCCpx, frtpx - func_cc_basename_result=`$cc_temp -showme | cut -d' ' -f1 | xargs basename` - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" - return - ;; - cc | CC | ftn | *[[\\/]]cc | *[[\\/]]CC | *[[\\/]]ftn ) - # For Cray PrgEnv-intel, cc is a wrapper of icc - # For Cray PrgEnv-gnu, cc is a wrapper of gcc - # func_cc_basename_result=`$cc_temp --version |& head -n 1 | cut -d' ' -f1 | xargs basename` - eval "$cc_temp --version" < /dev/null >& conftest.ver - func_cc_basename_result=`head -n1 conftest.ver |cut -d' ' -f1` - ${RM} -f conftest.ver - if test "x${func_cc_basename_result}" = xicc || - test "x${func_cc_basename_result}" = xicpc || - test "x${func_cc_basename_result}" = xifort || - test "x${func_cc_basename_result}" = xgcc || - test "x${func_cc_basename_result}" = xg++ || - test "x${func_cc_basename_result}" = xgfortran || - test "x${func_cc_basename_result}" = xGNU ; then - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" - return - fi - # For Cray PrgEnv-cray, cc is a wrapper of Cray CC - # Cray cc -V sends the output to stderr. 
- # func_cc_basename_result=`$cc_temp -V |& head -n 1 | cut -d' ' -f1 | xargs basename` - eval "$cc_temp -V" < /dev/null >& conftest.ver - func_cc_basename_result=`head -n1 conftest.ver |cut -d' ' -f1` - ${RM} -f conftest.ver - if test "x${func_cc_basename_result}" = xCray ; then - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" - return - fi - return - ;; - mpixlc | mpixlcxx | mpixlf77 | mpixlf90 | *[[\\/]]mpixlc | *[[\\/]]mpixlcxx | *[[\\/]]mpixlf77 | *[[\\/]]mpixlf90 ) - func_cc_basename_result=`$cc_temp -show | cut -d' ' -f1 | xargs basename` - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" - return - ;; *) break;; esac done func_cc_basename_result=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"` - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" } ])# _LT_PREPARE_CC_BASENAME @@ -775,7 +725,7 @@ _LT_CONFIG_SAVE_COMMANDS([ cfgfile=${ofile}T trap "$RM \"$cfgfile\"; exit 1" 1 2 15 - $RM -f "$cfgfile" + $RM "$cfgfile" cat <<_LT_EOF >> "$cfgfile" #! $SHELL @@ -1007,7 +957,7 @@ ac_outfile=conftest.$ac_objext echo "$lt_simple_compile_test_code" >conftest.$ac_ext eval "$ac_compile" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err _lt_compiler_boilerplate=`cat conftest.err` -$RM -f conftest* +$RM conftest* ])# _LT_COMPILER_BOILERPLATE @@ -1062,7 +1012,7 @@ m4_defun_once([_LT_REQUIRED_DARWIN_CHECKS],[ _lt_result=$? # If there is a non-empty error log, and "single_module" # appears in it, assume the flag caused a linker warning - if test -s conftest.err && $GREP single_module conftest.err > /dev/null ; then + if test -s conftest.err && $GREP single_module conftest.err; then cat conftest.err >&AS_MESSAGE_LOG_FD # Otherwise, if the output was created with a 0 exit code from # the compiler, it worked. 
@@ -1200,10 +1150,7 @@ m4_defun([_LT_DARWIN_LINKER_FEATURES], _LT_TAGVAR(link_all_deplibs, $1)=yes _LT_TAGVAR(allow_undefined_flag, $1)=$_lt_dar_allow_undefined case $cc_basename in - ifort*|nagfor*) - _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' - _lt_dar_can_shared=yes - ;; + ifort*|nagfor*) _lt_dar_can_shared=yes ;; *) _lt_dar_can_shared=$GCC ;; esac if test yes = "$_lt_dar_can_shared"; then @@ -1214,8 +1161,8 @@ m4_defun([_LT_DARWIN_LINKER_FEATURES], _LT_TAGVAR(module_expsym_cmds, $1)="$SED -e 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags$_lt_dar_export_syms$_lt_dsymutil" m4_if([$1], [CXX], [ if test yes = "$_lt_dar_needs_single_mod" -a yes != "$lt_cv_apple_cc_single_mod"; then - _LT_TAGVAR(archive_cmds, $1)="\$CC -r -keep_private_externs $nostdlib_flag -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dsymutil" - _LT_TAGVAR(archive_expsym_cmds, $1)="$SED 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC -r -keep_private_externs $nostdlib_flag -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dar_export_syms$_lt_dsymutil" + _LT_TAGVAR(archive_cmds, $1)="\$CC -r -keep_private_externs -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dsymutil" + _LT_TAGVAR(archive_expsym_cmds, $1)="$SED 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC -r -keep_private_externs -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dar_export_syms$_lt_dsymutil" fi ],[]) else @@ -2190,7 +2137,7 @@ AC_CACHE_CHECK([if $compiler supports -c -o file.$ac_objext], $RM out/* && rmdir out cd .. $RM -r conftest - $RM -f conftest* + $RM conftest* ]) _LT_TAGDECL([compiler_c_o], [lt_cv_prog_compiler_c_o], [1], [Does compiler simultaneously support -c and -o options?]) @@ -2400,10 +2347,6 @@ if test yes = "$GCC"; then *) lt_sed_strip_eq='s|=/|/|g' ;; esac lt_search_path_spec=`$CC -print-search-dirs | awk $lt_awk_arg | $SED -e "s/^libraries://" -e $lt_sed_strip_eq` - case $cc_basename in - fccpx* | FCCpx* ) lt_search_path_spec=`$CC --showme:libdirs` ;; - *) lt_search_path_spec=`$CC -print-search-dirs | awk $lt_awk_arg | $SED -e "s/^libraries://" -e $lt_sed_strip_eq` ;; - esac case $lt_search_path_spec in *\;*) # if the path contains ";" then we assume it to be the separator @@ -3437,19 +3380,8 @@ AC_ARG_WITH([gnu-ld], [test no = "$withval" || with_gnu_ld=yes], [with_gnu_ld=no])dnl -_LT_CC_BASENAME($CC) - ac_prog=ld -# special care for Fujitsu C or C++ compilers -if test "$cc_basename" = fccpx || test "$cc_basename" = FCCpx ; then - if test yes = "$with_gnu_ld" || test "$host_os" = linux-gnu ; then - ac_prog=`($CC -Xg -print-prog-name=ld) 2>&5` - test -z "$LD" && LD=$ac_prog - with_gnu_ld=yes - fi -fi - -if test "$ac_prog" = ld && test yes = "$GCC" ; then +if test yes = "$GCC"; then # Check if gcc -print-prog-name=ld gives a path. 
AC_MSG_CHECKING([for ld used by $CC]) case $host in @@ -3569,7 +3501,7 @@ case $host_os in ;; darwin*) if test yes = "$GCC"; then - reload_cmds='$LTCC $LTCFLAGS $nostdlib_flag -nostdlib $wl-r -o $output$reload_objs' + reload_cmds='$LTCC $LTCFLAGS -nostdlib $wl-r -o $output$reload_objs' else reload_cmds='$LD$reload_flag -o $output$reload_objs' fi @@ -4406,12 +4338,7 @@ m4_if([$1], [CXX], [ # AIX 5 now supports IA64 processor _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' fi - if test "$cc_basename" = FCCpx ; then # Fujitsu C++ - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-Xg -KPIC' - _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' - else - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' - fi + _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; amigaos*) @@ -4609,12 +4536,6 @@ m4_if([$1], [CXX], [ _LT_TAGVAR(lt_prog_compiler_pic, $1)='-qpic' _LT_TAGVAR(lt_prog_compiler_static, $1)='-qstaticlink' ;; - FCCpx* ) - # Fujitsu C++ - _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-Xg -KPIC' - _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' - ;; *) case `$CC -V 2>&1 | $SED 5q` in *Sun\ C*) @@ -4842,10 +4763,6 @@ m4_if([$1], [CXX], [ _LT_TAGVAR(lt_prog_compiler_pic, $1)="-Xcompiler $_LT_TAGVAR(lt_prog_compiler_pic, $1)" fi ;; - fccpx*) # Fujitsu C Compiler - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-Xg -KPIC' - _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' - ;; esac else # PORTME Check for flag to pass linker flags through the system compiler. @@ -4935,18 +4852,6 @@ m4_if([$1], [CXX], [ _LT_TAGVAR(lt_prog_compiler_pic, $1)='--shared' _LT_TAGVAR(lt_prog_compiler_static, $1)='--static' ;; - frtpx* ) - # Fujitsu Fortran compiler - _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' - _LT_TAGVAR(lt_prog_compiler_static, $1)='-Kstatic_fjlib' - ;; - fccpx* | FCCpx* ) - # Fujitsu C or C++ compiler - _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-Xg -KPIC' - _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' - ;; nagfor*) # NAG Fortran compiler _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,-Wl,,' @@ -5454,8 +5359,6 @@ _LT_EOF _LT_TAGVAR(whole_archive_flag_spec, $1)= tmp_sharedflag='--shared' ;; nagfor*) # NAGFOR 5.3 - _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`for conv in $convenience\"\"; do test -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive' - _LT_TAGVAR(compiler_needs_object, $1)=yes tmp_sharedflag='-Wl,-shared' ;; xl[[cC]]* | bgxl[[cC]]* | mpixl[[cC]]*) # IBM XL C 8.0 on PPC (deal with xlf below) tmp_sharedflag='-qmkshrobj' @@ -6344,7 +6247,7 @@ x|xyes) # to ld, don't add -lc before -lgcc. AC_CACHE_CHECK([whether -lc should be explicitly linked in], [lt_cv_]_LT_TAGVAR(archive_cmds_need_lc, $1), - [$RM -f conftest* + [$RM conftest* echo "$lt_simple_compile_test_code" > conftest.$ac_ext if AC_TRY_EVAL(ac_compile) 2>conftest.err; then @@ -6371,7 +6274,7 @@ x|xyes) else cat conftest.err 1>&5 fi - $RM -f conftest* + $RM conftest* ]) _LT_TAGVAR(archive_cmds_need_lc, $1)=$lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1) ;; @@ -6653,8 +6556,8 @@ if test yes != "$_lt_caught_CXX_error"; then # Check if GNU C++ uses GNU ld as the underlying linker, since the # archiving commands below assume that GNU ld is being used. 
if test yes = "$with_gnu_ld"; then - _LT_TAGVAR(archive_cmds, $1)='$CC $pic_flag -shared $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib' - _LT_TAGVAR(archive_expsym_cmds, $1)='$CC $pic_flag -shared $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib' + _LT_TAGVAR(archive_expsym_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic' @@ -6665,8 +6568,6 @@ if test yes != "$_lt_caught_CXX_error"; then wlarc='$wl' # ancient GNU ld didn't support --whole-archive et. al. - # TODO: when using FCCpx, need to run command `$CC -Xg -print-prog-name=ld` - # to get the linker, LD. if $LD --help 2>&1 | $GREP 'no-whole-archive' > /dev/null; then _LT_TAGVAR(whole_archive_flag_spec, $1)=$wlarc'--whole-archive$convenience '$wlarc'--no-whole-archive' else @@ -6681,7 +6582,7 @@ if test yes != "$_lt_caught_CXX_error"; then # linker, instead of GNU ld. If possible, this setting should # overridden to take advantage of the native linker features on # the platform it is being used on. - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib' fi # Commands to make compiler produce verbose output that lists @@ -6952,7 +6853,7 @@ if test yes != "$_lt_caught_CXX_error"; then _LT_TAGVAR(file_list_spec, $1)='@' if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' # If the export-symbols file already is a .def file, use it as # is; otherwise, prepend EXPORTS... 
_LT_TAGVAR(archive_expsym_cmds, $1)='if _LT_DLL_DEF_P([$export_symbols]); then @@ -6961,7 +6862,7 @@ if test yes != "$_lt_caught_CXX_error"; then echo EXPORTS > $output_objdir/$soname.def; cat $export_symbols >> $output_objdir/$soname.def; fi~ - $CC -shared $nostdlib_flag -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' + $CC -shared -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' else _LT_TAGVAR(ld_shlibs, $1)=no fi @@ -7068,7 +6969,7 @@ if test yes != "$_lt_caught_CXX_error"; then ;; *) if test yes = "$GXX"; then - _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared $nostdlib_flag -nostdlib $pic_flag $wl+b $wl$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib' + _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared -nostdlib $pic_flag $wl+b $wl$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib' else # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no @@ -7136,13 +7037,13 @@ if test yes != "$_lt_caught_CXX_error"; then if test no = "$with_gnu_ld"; then case $host_cpu in hppa*64*) - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib -fPIC $wl+h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib -fPIC $wl+h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; ia64*) - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib $pic_flag $wl+h $wl$soname $wl+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag $wl+h $wl$soname $wl+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; *) - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib $pic_flag $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; esac fi @@ -7183,9 +7084,9 @@ if test yes != "$_lt_caught_CXX_error"; then *) if test yes = "$GXX"; then if test no = "$with_gnu_ld"; then - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' else - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects 
$compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` -o $lib' fi fi _LT_TAGVAR(link_all_deplibs, $1)=yes @@ -7459,10 +7360,10 @@ if test yes != "$_lt_caught_CXX_error"; then _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-expect_unresolved $wl\*' case $host in osf3*) - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' ;; *) - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $nostdlib_flag -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-msym $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-msym $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' ;; esac @@ -7552,9 +7453,9 @@ if test yes != "$_lt_caught_CXX_error"; then if test yes,no = "$GXX,$with_gnu_ld"; then _LT_TAGVAR(no_undefined_flag, $1)=' $wl-z ${wl}defs' if $CC --version | $GREP -v '^2\.7' > /dev/null; then - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~ - $CC -shared $pic_flag $nostdlib_flag -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' + $CC -shared $pic_flag -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when @@ -7563,9 +7464,9 @@ if test yes != "$_lt_caught_CXX_error"; then else # g++ 2.7 appears to require '-G' NOT '-shared' on this # platform. 
- _LT_TAGVAR(archive_cmds, $1)='$CC -G $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -G -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~ - $CC -G $nostdlib_flag -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' + $CC -G -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when @@ -7843,7 +7744,7 @@ if AC_TRY_EVAL(ac_compile); then # The "-l" case would never come before the object being # linked, so don't bother handling this case. esac - elif test "x$p" != x ; then # skip if $p is empty + else if test -z "$_LT_TAGVAR(postdeps, $1)"; then _LT_TAGVAR(postdeps, $1)=$prev$p else @@ -7922,14 +7823,6 @@ _LT_TAGDECL([], [postdeps], [1]) _LT_TAGDECL([], [compiler_lib_search_path], [1], [The library search path used internally by the compiler when linking a shared library]) - -ac_nostdlib_flag= -# Fujitsu compilers -if test "$cc_basename" == FCCpx || test "$cc_basename" == fccpx || test "$cc_basename" == frtpx ; then - ac_nostdlib_flag=-Xg -fi -_LT_TAGVAR(nostdlib_flag, $1)=$ac_nostdlib_flag -_LT_TAGDECL([], [nostdlib_flag], [1]) ])# _LT_SYS_HIDDEN_LIBDEPS diff --git a/m4/list_len.m4 b/m4/list_len.m4 new file mode 100644 index 000000000..7e6590eed --- /dev/null +++ b/m4/list_len.m4 @@ -0,0 +1,6 @@ +divert(`-1') +# list_len((item_1, item_2, ..., item_n)) +# parenthesized list, simple version +define(`list_len', `_list_len($@, 0)')`'dnl +define(`_list_len',`ifelse(`$1', `()', `$2', `$0((shift$1), incr(`$2'))')')`'dnl +divert`'dnl \ No newline at end of file diff --git a/scripts/ltmain.sh b/scripts/ltmain.sh index acd0c1343..3e6a3db3a 100644 --- a/scripts/ltmain.sh +++ b/scripts/ltmain.sh @@ -8325,13 +8325,6 @@ func_mode_link () # Convert "-framework foo" to "foo.ltframework" if test -n "$inherited_linker_flags"; then tmp_inherited_linker_flags=`$ECHO "$inherited_linker_flags" | $SED 's/-framework \([^ $]*\)/\1.ltframework/g'` - - # Additionally convert " -pthread" to " -Wl,-pthread" for nagfor - func_cc_basename $CC - case $func_cc_basename_result in - nagfor*) tmp_inherited_linker_flags=`$ECHO "$tmp_inherited_linker_flags" | $SED 's/ -pthread/ -Wl,-pthread/g'` ;; - esac - for tmp_inherited_linker_flag in $tmp_inherited_linker_flags; do case " $new_inherited_linker_flags " in *" $tmp_inherited_linker_flag "*) ;; @@ -9367,8 +9360,7 @@ func_mode_link () xlcverstring="$wl-compatibility_version $wl$minor_current $wl-current_version $wl$minor_current.$revision" verstring="-compatibility_version $minor_current -current_version $minor_current.$revision" # On Darwin other compilers - func_cc_basename $CC - case $func_cc_basename_result in + case $CC in nagfor*) verstring="$wl-compatibility_version $wl$minor_current $wl-current_version $wl$minor_current.$revision" ;; @@ -9875,13 +9867,6 @@ func_mode_link () ;; esac - # Time to revert the changes made for nagfor. 
- func_cc_basename $CC - case $func_cc_basename_result in - nagfor*) - new_inherited_linker_flags=`$ECHO " $new_inherited_linker_flags" | $SED 's% -Wl,-pthread% -pthread%g'` ;; - esac - # move library search paths that coincide with paths to not yet # installed libraries to the beginning of the library search list new_libs= diff --git a/src/dispatchers/file.c b/src/dispatchers/file.c index e54684835..68b3239b4 100644 --- a/src/dispatchers/file.c +++ b/src/dispatchers/file.c @@ -501,6 +501,9 @@ ncmpi_create(MPI_Comm comm, #ifdef ENABLE_BURST_BUFFER int enable_bb_driver=0; #endif +#ifdef ENABLE_CHUNKING + int enable_chk_driver=0; +#endif MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); @@ -584,6 +587,18 @@ ncmpi_create(MPI_Comm comm, enable_bb_driver = 1; } #endif +#ifdef ENABLE_CHUNKING + if (combined_info != MPI_INFO_NULL) { + char value[MPI_MAX_INFO_VAL]; + int flag; + + /* check if nc_chunking is enabled */ + MPI_Info_get(combined_info, "nc_chunking", MPI_MAX_INFO_VAL-1, + value, &flag); + if (flag && strcasecmp(value, "enable") == 0) + enable_chk_driver = 1; + } +#endif /* Use environment variable and cmode to tell the file format * which is later used to select the right driver. @@ -664,6 +679,11 @@ ncmpi_create(MPI_Comm comm, if (enable_bb_driver) driver = ncbbio_inq_driver(); else +#endif +#ifdef ENABLE_CHUNKING + if (enable_chk_driver) + driver = ncchkio_inq_driver(); + else #endif /* default is the driver built on top of MPI-IO */ driver = ncmpio_inq_driver(); @@ -766,6 +786,9 @@ ncmpi_open(MPI_Comm comm, #ifdef ENABLE_BURST_BUFFER int enable_bb_driver=0; #endif +#ifdef ENABLE_CHUNKING + int enable_chk_driver=0; +#endif MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); @@ -882,6 +905,18 @@ ncmpi_open(MPI_Comm comm, enable_bb_driver = 1; } #endif +#ifdef ENABLE_CHUNKING + if (combined_info != MPI_INFO_NULL) { + char value[MPI_MAX_INFO_VAL]; + int flag; + + /* check if nc_chunking is enabled */ + MPI_Info_get(combined_info, "nc_chunking", MPI_MAX_INFO_VAL-1, + value, &flag); + if (flag && strcasecmp(value, "enable") == 0) + enable_chk_driver = 1; + } +#endif #ifdef ENABLE_NETCDF4 if (format == NC_FORMAT_NETCDF4_CLASSIC || format == NC_FORMAT_NETCDF4) { @@ -910,6 +945,11 @@ ncmpi_open(MPI_Comm comm, if (enable_bb_driver) driver = ncbbio_inq_driver(); else +#endif +#ifdef ENABLE_CHUNKING + if (enable_chk_driver) + driver = ncchkio_inq_driver(); + else #endif { /* ncmpio driver */ diff --git a/src/dispatchers/variable.c b/src/dispatchers/variable.c index ba8c92e16..4d7ca9f0e 100644 --- a/src/dispatchers/variable.c +++ b/src/dispatchers/variable.c @@ -235,6 +235,76 @@ ncmpi_def_var(int ncid, /* IN: file ID */ return NC_NOERR; } +#ifdef ENABLE_COMPRESSION +/*----< ncmpi_var_set_chunk() >----------------------------------------------------*/ +/* This is a collective subroutine. */ +int ncmpi_var_set_chunk (int ncid, /* IN: file ID */ + int varid, + int *chunk_dim) +{ + int err; + int ndim; + + err = ncmpi_inq_varndims(ncid,varid, &ndim); + if (err != NC_NOERR) return err; + + return ncmpi_put_att_int(ncid, varid, "_chunkdim", NC_INT, ndim, chunk_dim); +} +/*----< ncmpi_var_get_chunk() >----------------------------------------------------*/ +int ncmpi_var_get_chunk (int ncid, /* IN: file ID */ + int varid, + int *chunk_dim) +{ + return ncmpi_get_att_int(ncid, varid, "_chunkdim", chunk_dim); +} +/*----< ncmpi_var_set_filter() >----------------------------------------------------*/ +/* This is a collective subroutine. 
*/ +int ncmpi_var_set_filter (int ncid, /* IN: file ID */ + int varid, + int filter) +{ + return ncmpi_put_att_int(ncid, varid, "_filter", NC_INT, 1, &filter); +} +/*----< ncmpi_var_get_filter() >----------------------------------------------------*/ +int ncmpi_var_get_filter (int ncid, /* IN: file ID */ + int varid, + int *filter) +{ + return ncmpi_get_att_int(ncid, varid, "_filter", filter); +} +#else +/*----< ncmpi_var_set_chunk() >----------------------------------------------------*/ +/* This is a collective subroutine. */ +int ncmpi_var_set_chunk (int ncid, /* IN: file ID */ + int varid, + int *chunk_dim) +{ + return NC_ENOTBUILT; +} +/*----< ncmpi_var_get_chunk() >----------------------------------------------------*/ +int ncmpi_var_get_chunk (int ncid, /* IN: file ID */ + int varid, + int *chunk_dim) +{ + return NC_ENOTBUILT; +} +/*----< ncmpi_var_set_filter() >----------------------------------------------------*/ +/* This is a collective subroutine. */ +int ncmpi_var_set_filter (int ncid, /* IN: file ID */ + int varid, + int filter) +{ + return NC_ENOTBUILT; +} +/*----< ncmpi_var_get_filter() >----------------------------------------------------*/ +int ncmpi_var_get_filter (int ncid, /* IN: file ID */ + int varid, + int *filter) +{ + return NC_ENOTBUILT; +} +#endif + /*----< ncmpi_def_var_fill() >-----------------------------------------------*/ /* this API is collective, and must be called in define mode */ int diff --git a/src/drivers/Makefile.am b/src/drivers/Makefile.am index de1a6a092..8829a41ec 100644 --- a/src/drivers/Makefile.am +++ b/src/drivers/Makefile.am @@ -20,11 +20,15 @@ if ENABLE_BURST_BUFFER SUBDIRS += ncbbio endif +if ENABLE_CHUNKING + SUBDIRS += ncchunkio +endif + if ENABLE_ADIOS SUBDIRS += ncadios endif -DIST_SUBDIRS = include common ncmpio ncfoo ncbbio nc4io ncadios pncio +DIST_SUBDIRS = include common ncmpio ncfoo ncbbio nc4io ncadios pncio ncchunkio # For VPATH build (parallel build), try delete all sub-directories distclean-local: diff --git a/src/drivers/ncchunkio/DEVELOPER_NOTES.md b/src/drivers/ncchunkio/DEVELOPER_NOTES.md new file mode 100644 index 000000000..99ee038f6 --- /dev/null +++ b/src/drivers/ncchunkio/DEVELOPER_NOTES.md @@ -0,0 +1,74 @@ +# Note for Developers + +### Table of contents +- [Future Work] +- [Internal global attributes] +- [Anchor variable (one per variable with chunking enabled)] +- [Reference table] +- [Chunks] +- [Requirement for compressed variables] + +--- + +## Internal global attributes: + * Number of chunked variables + +## Anchor variable (one per variable with chunking enabled): + * A scalar variable + * Data type is the same as user defined + * Internal attributes + + Dimension IDs are saved as an attribute of an array of integer type + + Number of dimensions is saved as an internal attribute + + An attribute to tell whether it is a fixed-size or record variable + + An attribute offset pointer to reference table + * For fixed-size variable, it is a scalar + * For record variable, it is an array of 8-type integers, one for each record + * This array can be allocated in multiple of 16 for example + * Need an integer for allocated size, e.g. 
multiple of 16
+    * Need an integer for size (true number of records written)
+    + An attribute for chunk sizes, an integer array
+    + An attribute for the compression algorithm
+    + An attribute for the compression level
+  * If a variable is missing these internal attributes, it is a traditional variable
+
+## Reference table:
+  * An array storing the offsets of individual chunks
+  * Not a NetCDF variable, but we use the CDF-5 format specification to define it
+    + TODO: give it a formal spec in BNF grammar
+  * For a fixed-size variable, it is a 1D array of size equal to the number of chunks
+  * This table is loaded into memory when calling ncmpi_inq_varid
+  * For the blocking APIs, it is synced and written to the file by the root rank
+    + TODO: in the future, it can be written by multiple ranks in parallel
+  * For the nonblocking APIs, multiple tables are written by multiple ranks in parallel
+
+## Chunks:
+  * Chunks are not NetCDF variables
+    + TODO: give them a formal spec in BNF grammar?
+  * Chunks are stored in the space between NetCDF variables, i.e., the padding areas in the file
+  * Data is type-converted and byte-swapped before compression
+  * In principle, chunks should be stored contiguously in the file, for all
+    variables, but they are not required to be stored contiguously.
+  * Chunks are stored in row-major order
+
+## Requirement for compressed variables:
+  * Collective I/O only (the same restriction as in HDF5)
+  * Must be chunked (same as HDF5)
+
+
+## Future Work
+* Reuse metadata across variables
+  - Variables from the same simulation space may have the same access pattern.
+  - Instead of generating variable metadata and index tables separately, we can
+    share information across variables.
+  - Chunk size and chunk ownership info can be reused.
+* Data sieving
+  - When rewriting a chunk, we don't need to read the background data if the
+    chunk is fully overwritten.
+  - Need an efficient way to determine whether a chunk is fully rewritten.
+  - It may be infeasible due to communication and computation cost.
+  - HDF5 approximates this by checking whether the owner fully rewrites the chunk.
+* Reuse metadata across records
+  - I/O patterns across time steps are likely the same.
+  - If we detect the same I/O pattern as the previous record, we can skip sending the metadata.
+  - The MPI datatype created for the previous time step can also be reused.
+---
diff --git a/src/drivers/ncchunkio/Makefile.am b/src/drivers/ncchunkio/Makefile.am
new file mode 100644
index 000000000..34ce50d76
--- /dev/null
+++ b/src/drivers/ncchunkio/Makefile.am
@@ -0,0 +1,95 @@
+#
+# Copyright (C) 2012, Northwestern University and Argonne National Laboratory
+# See COPYRIGHT notice in top-level directory.
+# +# $Id: Makefile.am 3283 2017-07-30 21:10:11Z wkliao $ +# +# @configure_input@ + +SUFFIXES = .a .o .c .m4 .h + +AM_CPPFLAGS = -I${top_srcdir}/src/include +AM_CPPFLAGS += -I${top_builddir}/src/include +AM_CPPFLAGS += -I${top_srcdir}/src/drivers/include +AM_CPPFLAGS += -I${top_builddir}/src/drivers/include +AM_CPPFLAGS += -I${top_srcdir}/src/drivers/ncmpio +AM_CPPFLAGS += -I${top_srcdir}/src/drivers/pncio + +if PNETCDF_DEBUG + AM_CPPFLAGS += -DPNETCDF_DEBUG +endif + +noinst_LTLIBRARIES = libncchkio.la + +M4FLAGS += -I${top_srcdir}/m4 +M4FLAGS += -I${top_srcdir}/src/drivers/ncchunkio +if ENABLE_ERANGE_FILL +M4FLAGS += -DERANGE_FILL +endif + +M4_SRCS = ncchkioi_profile.m4 \ + ncchkioi_convert.m4 + +M4H_SRCS = ncchkioi_profile.m4h + +H_SRCS = ncchkio_driver.h + +C_SRCS = ncchkio_attr.c \ + ncchkio_dim.c \ + ncchkio_driver.c \ + ncchkio_file.c \ + ncchkio_var.c \ + ncchkio_internal.c \ + ncchkioi_util.c \ + ncchkioi_put_var.c \ + ncchkioi_get_var.c \ + ncchkioi_put_varn.c \ + ncchkioi_get_varn.c \ + ncchkioi_iput_cb.c \ + ncchkioi_iget_cb.c \ + ncchkioi_iput.c \ + ncchkioi_iget.c \ + ncchkioi_nonblocking.c \ + ncchkioi_cache.c \ + ncchkioi_chunk.c \ + ncchkioi_chunk_size.c \ + ncchkioi_chunk_owner.c \ + ncchkioi_var_init.c \ + ncchkioi_var_resize.c \ + ncchkioi_var_wr.c \ + ncchkioi_var_rd.c \ + ncchkioi_lists.c \ + ncchkioi_wait.c \ + ncchk_filter_dummy.c + +if ENABLE_ZLIB + C_SRCS += ncchk_filter_zlib.c +endif + +if ENABLE_SZ + C_SRCS += ncchk_filter_sz.c +endif + +$(M4_SRCS:.m4=.c): Makefile +$(M4H_SRCS:.m4h=.h): Makefile + +.m4.c: + $(M4) $(AM_M4FLAGS) $(M4FLAGS) $< >$@ + +.m4h.h: + $(M4) $(AM_M4FLAGS) $(M4FLAGS) $< >$@ + +libncchkio_la_SOURCES = $(C_SRCS) $(H_SRCS) +nodist_libncchkio_la_SOURCES = $(M4_SRCS:.m4=.c) $(M4H_SRCS:.m4h=.h) + +# automake says "... BUILT_SOURCES is honored only by 'make all', 'make check', +# and 'make install'. This means you cannot build a specific target (e.g., +# 'make target') in a clean tree if it depends on a built source." 
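+# Listing the M4-generated .c and .h files in BUILT_SOURCES ensures they are
+# generated before the objects that depend on them are compiled.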
+BUILT_SOURCES = $(M4_SRCS:.m4=.c) $(M4H_SRCS:.m4h=.h) + +CLEANFILES = $(M4_SRCS:.m4=.c) $(M4H_SRCS:.m4h=.h) core core.* *.gcda *.gcno *.gcov gmon.out + +EXTRA_DIST = $(M4_HFILES) $(M4_SRCS) $(M4H_SRCS) ncchkioi_profile_timers.m4 + +tests-local: all + diff --git a/src/drivers/ncchunkio/ncchk_filter_driver.h b/src/drivers/ncchunkio/ncchk_filter_driver.h new file mode 100644 index 000000000..b406587fc --- /dev/null +++ b/src/drivers/ncchunkio/ncchk_filter_driver.h @@ -0,0 +1,29 @@ +#ifndef NCCHK_FILTER_DRIVER_H +#define NCCHK_FILTER_DRIVER_H + +#include + +struct NCCHK_filter { + int (*init)(MPI_Info); + int (*finalize)(); + int (*inq_cpsize)(void*, int, int*, int, int*, MPI_Datatype); + int (*compress)(void*, int, void*, int*, int, int*, MPI_Datatype); + int (*compress_alloc)(void*, int, void**, int*, int, int*, MPI_Datatype); + int (*inq_dcsize)(void*, int, int*, int, int*, MPI_Datatype); + int (*decompress)(void*, int, void*, int*, int, int*, MPI_Datatype); + int (*decompress_alloc)(void*, int, void**, int*, int, int*, MPI_Datatype); +}; + +typedef struct NCCHK_filter NCCHK_filter; + +extern NCCHK_filter* ncchk_dummy_inq_driver(void); + +#if ENABLE_ZLIB +extern NCCHK_filter* ncchk_zlib_inq_driver(void); +#endif + +#if ENABLE_SZ +extern NCCHK_filter* ncchk_sz_inq_driver(void); +#endif + +#endif \ No newline at end of file diff --git a/src/drivers/ncchunkio/ncchk_filter_dummy.c b/src/drivers/ncchunkio/ncchk_filter_dummy.c new file mode 100644 index 000000000..59e7b8762 --- /dev/null +++ b/src/drivers/ncchunkio/ncchk_filter_dummy.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include + +int ncchk_dummy_init(MPI_Info info) { + return NC_NOERR; +} + +int ncchk_dummy_finalize() { + return NC_NOERR; +} + +/* Return an estimated compressed data size + * Actual compressed size should not exceed the estimation + */ +int ncchk_dummy_inq_cpsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + *out_len = in_len; + return NC_NOERR; +} + +/* If out_len is large enough, compress the data at in and save it to out. out_len is set to actual compressed data size + * If out_len is NULL, we assume out is large enough for compressed data + */ +int ncchk_dummy_compress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + if (out_len != NULL){ + // Check output buffer size + if ((*out_len) < in_len){ + DEBUG_RETURN_ERROR(NC_ENOMEM); + } + + // Overwrite output buffer size with actual size + *out_len = in_len; + } + + // Copy data directly as dummy comrpession + memcpy(out, in, in_len); + + return NC_NOERR; +} + +/* Compress the data at in and save it to a newly allocated buffer at out. 
out_len is set to actual compressed data size + * The caller is responsible to free the buffer + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_dummy_compress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + // Allocate output buffer + *out = (void*)malloc(in_len); + + // Buffer size + if (out_len != NULL) { + *out_len = in_len; + } + + // Copy data directly as dummy comrpession + memcpy(*out, in, in_len); + + return NC_NOERR; +} + +/* Return an estimated decompressed data size + * Actual decompressed size should not exceed the estimation + */ +int ncchk_dummy_inq_dcsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + *out_len = in_len; + return NC_NOERR; +} + +/* If out_len is large enough, decompress the data at in and save it to out. out_len is set to actual decompressed size + * If out_len is NULL, we assume out is large enough for decompressed data + */ +int ncchk_dummy_decompress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + if (out_len != NULL){ + // Check output buffer size + if ((*out_len) < in_len){ + DEBUG_RETURN_ERROR(NC_ENOMEM); + } + + // Overwrite output buffer size with actual size + *out_len = in_len; + } + + // Copy data directly as dummy comrpession + memcpy(out, in, in_len); + + return NC_NOERR; +} + +/* Decompress the data at in and save it to a newly allocated buffer at out. out_len is set to actual decompressed data size + * The caller is responsible to free the buffer + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_dummy_decompress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + // Allocate output buffer + *out = (void*)malloc(in_len); + + // Buffer size + if (out_len != NULL) { + *out_len = in_len; + } + + // Copy data directly as dummy comrpession + memcpy(*out, in, in_len); + + return NC_NOERR; +} + +static NCCHK_filter ncchkio_driver = { + ncchk_dummy_init, + ncchk_dummy_finalize, + ncchk_dummy_inq_cpsize, + ncchk_dummy_compress, + ncchk_dummy_compress_alloc, + ncchk_dummy_inq_dcsize, + ncchk_dummy_decompress, + ncchk_dummy_decompress_alloc +}; + +NCCHK_filter* ncchk_dummy_inq_driver(void) { + return &ncchkio_driver; +} + diff --git a/src/drivers/ncchunkio/ncchk_filter_sz.c b/src/drivers/ncchunkio/ncchk_filter_sz.c new file mode 100644 index 000000000..dc5ac9aa1 --- /dev/null +++ b/src/drivers/ncchunkio/ncchk_filter_sz.c @@ -0,0 +1,312 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +static int mpi_to_sz_type(MPI_Datatype dtype){ + if (dtype == MPI_FLOAT){ + return SZ_FLOAT; + } + else if (dtype == MPI_DOUBLE){ + return SZ_DOUBLE; + } + else if (dtype == MPI_BYTE){ + return SZ_UINT8; + } + else if (dtype == MPI_CHAR){ + return SZ_INT8; + } + else if (dtype == MPI_SHORT){ + return SZ_INT16; + } + else if (dtype == MPI_UNSIGNED_SHORT){ + return SZ_UINT16; + } + else if (dtype == MPI_INT){ + return SZ_INT32; + } + else if (dtype == MPI_UNSIGNED){ + return SZ_UINT32; + } + else if (dtype == MPI_LONG_LONG){ + return SZ_INT64; + } + else if (dtype == MPI_UNSIGNED_LONG_LONG){ + return SZ_UINT64; + } + + return -1; +} + +int ncchk_sz_init(MPI_Info info) { + sz_params sz; + + memset(&sz, 0, sizeof(sz_params)); + sz.sol_ID = SZ; + sz.sampleDistance = 50; + sz.quantization_intervals = 0; + sz.max_quant_intervals = 65536; + sz.predThreshold = 0.98; + sz.szMode = SZ_BEST_COMPRESSION; + sz.losslessCompressor = ZSTD_COMPRESSOR; + sz.gzipMode = 1; + sz.errorBoundMode = ABS; + sz.absErrBound = 1E-3; + sz.relBoundRatio = 1E-5; + SZ_Init_Params(&sz); + + return NC_NOERR; +} + +int ncchk_sz_finalize() { + SZ_Finalize(); + + return NC_NOERR; +} + +/* Return an estimated compressed data size + * Actual compressed size should not exceed the estimation + */ +int ncchk_sz_inq_cpsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + return NC_ENOTSUPPORT; // sz has no size estimation +} + +/* If out_len is large enough, compress the data at in and save it to out. out_len is set to actual compressed data size + * If out_len is NULL, we assume out is large enough for compressed data + */ +int ncchk_sz_compress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int i; + int szdtype; + size_t r[4]; + size_t outsize; + void *buf = NULL; + + szdtype = mpi_to_sz_type(dtype); + if (szdtype < 0){ + DEBUG_ASSIGN_ERROR(err, NC_EINVAL) + goto out; + } + + for(i = 0; i < 4; i++){ + if (i < ndim){ + r[i] = dims[i]; + } + else{ + r[i] = 0; + } + } + for(i = 4; i < ndim; i++){ + r[3] *= dims[i]; + } + + buf = SZ_compress(szdtype, in, &outsize, 0, r[3], r[2], r[1], r[0]); + + if (out_len != NULL){ + // If buffer not large enough + if (*out_len < outsize){ + DEBUG_ASSIGN_ERROR(err, NC_ENOMEM) + goto out; + } + + // Size of comrpessed data + *out_len = outsize; + } + + memcpy(out, buf, outsize); + +out: + if (buf != NULL){ + free(buf); + } + + return err; +} + +/* Compress the data at in and save it to a newly allocated buffer at out. 
out_len is set to actual compressed data size + * The caller is responsible to free the buffer + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_sz_compress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int i; + int szdtype; + size_t r[4]; + size_t outsize; + void *buf = NULL; + + szdtype = mpi_to_sz_type(dtype); + if (szdtype < 0){ + DEBUG_ASSIGN_ERROR(err, NC_EINVAL) + goto out; + } + + for(i = 0; i < 4; i++){ + if (i < ndim){ + r[i] = dims[i]; + } + else{ + r[i] = 0; + } + } + for(i = 4; i < ndim; i++){ + r[3] *= dims[i]; + } + + *out = SZ_compress(szdtype, in, &outsize, 0, r[3], r[2], r[1], r[0]); + + if (out_len != NULL){ + // Size of comrpessed data + *out_len = outsize; + } + +out: + if (buf != NULL){ + free(buf); + } + + return err; +} + +/* Return an estimated decompressed data size + * Actual decompressed size should not exceed the estimation + */ +int ncchk_sz_inq_dcsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + return NC_ENOTSUPPORT; // sz has no size estimation +} + +/* If out_len is large enough, decompress the data at in and save it to out. out_len is set to actual decompressed size + * If out_len is NULL, we assume out is large enough for decompressed data + */ +int ncchk_sz_decompress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int i; + size_t r[4]; + int szdtype; + int outsize; + void *buf = NULL; + + szdtype = mpi_to_sz_type(dtype); + if (szdtype < 0){ + DEBUG_ASSIGN_ERROR(err, NC_EINVAL) + goto out; + } + + MPI_Type_size(dtype, &outsize); + for(i = 0; i < 4; i++){ + if (i < ndim){ + r[i] = dims[i]; + outsize *= dims[i]; + } + else{ + r[i] = 0; + } + } + for(i = 4; i < ndim; i++){ + r[3] *= dims[i]; + outsize *= dims[i]; + } + + buf = SZ_decompress(szdtype, in, (size_t)in_len, 0, r[3], r[2], r[1], r[0]); + + if (out_len != NULL){ + // If buffer not large enough + if (*out_len < outsize){ + DEBUG_ASSIGN_ERROR(err, NC_ENOMEM) + goto out; + } + + // Size of comrpessed data + *out_len = outsize; + } + + memcpy(out, buf, outsize); + +out: + if (buf != NULL){ + free(buf); + } + + return err; +} + +/* Decompress the data at in and save it to a newly allocated buffer at out. 
out_len is set to actual decompressed data size + * The caller is responsible to free the buffer + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_sz_decompress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int i; + size_t r[4]; + int szdtype; + int outsize; + void *buf = NULL; + + szdtype = mpi_to_sz_type(dtype); + if (szdtype < 0){ + DEBUG_ASSIGN_ERROR(err, NC_EINVAL) + goto out; + } + + MPI_Type_size(dtype, &outsize); + for(i = 0; i < 4; i++){ + if (i < ndim){ + r[i] = dims[i]; + outsize *= dims[i]; + } + else{ + r[i] = 0; + } + } + for(i = 4; i < ndim; i++){ + r[3] *= dims[i]; + outsize *= dims[i]; + } + + *out = SZ_decompress(szdtype, in, (size_t)in_len, 0, r[3], r[2], r[1], r[0]); + + if (out_len != NULL){ + // Size of comrpessed data + *out_len = outsize; + } + +out: + if (buf != NULL){ + free(buf); + } + + return err; +} + +static NCCHK_filter ncchk_driver_sz = { + ncchk_sz_init, + ncchk_sz_finalize, + ncchk_sz_inq_cpsize, + ncchk_sz_compress, + ncchk_sz_compress_alloc, + ncchk_sz_inq_dcsize, + ncchk_sz_decompress, + ncchk_sz_decompress_alloc +}; + +NCCHK_filter* ncchk_sz_inq_driver(void) { + return &ncchk_driver_sz; +} + diff --git a/src/drivers/ncchunkio/ncchk_filter_zlib.c b/src/drivers/ncchunkio/ncchk_filter_zlib.c new file mode 100644 index 000000000..3db4094b7 --- /dev/null +++ b/src/drivers/ncchunkio/ncchk_filter_zlib.c @@ -0,0 +1,311 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +int ncchk_zlib_init(MPI_Info info) { + return NC_NOERR; +} + +int ncchk_zlib_finalize() { + return NC_NOERR; +} + +/* Return an estimated compressed data size + * Actual compressed size should not exceed the estimation + */ +int ncchk_zlib_inq_cpsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + return NC_ENOTSUPPORT; // Zlib has no size estimation +} + +/* If out_len is large enough, compress the data at in and save it to out. out_len is set to actual compressed data size + * If out_len is NULL, we assume out is large enough for compressed data + */ +int ncchk_zlib_compress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + + // zlib struct + z_stream defstream; + defstream.zalloc = Z_NULL; + defstream.zfree = Z_NULL; + defstream.opaque = Z_NULL; + defstream.avail_in = (uInt)(in_len); // input size + defstream.next_in = (Bytef*)in; // input + if (out_len != NULL){ + defstream.avail_out = (uInt)(*out_len); // output buffer size + } + else{ + defstream.avail_out = (uInt)1000000000; // Assume it is large enough + } + defstream.next_out = (Bytef *)out; // output buffer + + // the actual compression work. 
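+    // One-shot deflate sequence: deflateInit() sets up the stream state,
+    // deflate() with Z_FINISH consumes all of the input in a single call,
+    // and deflateEnd() releases the stream. deflate() returns Z_STREAM_END
+    // only when all input has been compressed and flushed to the output.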
+ err = deflateInit(&defstream, Z_DEFAULT_COMPRESSION); + if (err != Z_OK){ + printf("deflateInit fail: %d: %s\n", err, defstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + err = deflate(&defstream, Z_FINISH); + if (err != Z_STREAM_END){ + printf("deflate fail: %d: %s\n", err, defstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + err = deflateEnd(&defstream); + if (err != Z_OK){ + printf("deflateEnd fail: %d: %s\n", err, defstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // If buffer not large enough + if (defstream.avail_in > 0){ + DEBUG_RETURN_ERROR(NC_ENOMEM) + } + + // Size of comrpessed data + if (out_len != NULL){ + *out_len = defstream.total_out; + } + + return NC_NOERR; +} + +/* Compress the data at in and save it to a newly allocated buffer at out. out_len is set to actual compressed data size + * The caller is responsible to free the buffer + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_zlib_compress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int bsize; // Start by 1/8 of the in_len + char *buf; + + bsize = in_len >> 3; + if (bsize < 6){ + bsize = 6; + } + buf = (char*)malloc(bsize); + + // zlib struct + z_stream defstream; + defstream.zalloc = Z_NULL; + defstream.zfree = Z_NULL; + defstream.opaque = Z_NULL; + defstream.avail_in = (uInt)(in_len); // input size + defstream.next_in = (Bytef*)in; // input + defstream.avail_out = (uInt)(bsize); // output buffer size + defstream.next_out = (Bytef *)buf; // output buffer + + // Initialize deflat stream + err = deflateInit(&defstream, Z_DEFAULT_COMPRESSION); + if (err != Z_OK){ + printf("deflateInit fail: %d: %s\n", err, defstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // The actual compression work + err = Z_OK; + while (err != Z_STREAM_END){ + // Compress data + err = deflate(&defstream, Z_NO_FLUSH | Z_FINISH); + // Check if buffer is lage enough + if (err != Z_STREAM_END){ + // Enlarge buffer + buf = (char*)realloc(buf, bsize << 1); + + // Reset buffer info in stream + defstream.next_out = (Bytef *)(buf + bsize); + defstream.avail_out = bsize; + + // Reocrd new buffer size + bsize <<= 1; + } + } + + // Finalize deflat stream + err = deflateEnd(&defstream); + if (err != Z_OK){ + printf("deflateEnd fail: %d: %s\n", err, defstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // Size of comrpessed data + if (out_len != NULL){ + *out_len = defstream.total_out; + + char *env_str; + if ((env_str = getenv("PNETCDF_COMPRESS_VERBOSE")) != NULL) { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + printf("rank %d (%s at %d) compress data size %d into size %d\n", + rank,__func__,__LINE__,in_len,*out_len); + } + } + + // Compressed data + *out = buf; + + return NC_NOERR; +} + +/* Return an estimated decompressed data size + * Actual decompressed size should not exceed the estimation + */ +int ncchk_zlib_inq_dcsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + return NC_ENOTSUPPORT; // Zlib has no size estimation +} + +/* If out_len is large enough, decompress the data at in and save it to out. 
out_len is set to actual decompressed size + * If out_len is NULL, we assume out is large enough for decompressed data + */ +int ncchk_zlib_decompress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + + // zlib struct + z_stream infstream; + infstream.zalloc = Z_NULL; + infstream.zfree = Z_NULL; + infstream.opaque = Z_NULL; + infstream.avail_in = (unsigned long) in_len; // input size + infstream.next_in = (Bytef *)in; // input + if (out_len != NULL){ + infstream.avail_out = (uInt)(*out_len); // output buffer size + } + else{ + infstream.avail_out = (uInt)1000000000; // Assume it is large enough + } + infstream.next_out = (Bytef *)out; // buffer size + + // the actual decompression work. + err = inflateInit(&infstream); + if (err != Z_OK){ + printf("inflateInit fail: %d: %s\n", err, infstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + err = inflate(&infstream, Z_FINISH); + if (err != Z_STREAM_END){ + printf("inflate fail: %d: %s\n", err, infstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + err = inflateEnd(&infstream); + if (err != Z_OK){ + printf("inflateEnd fail: %d: %s\n", err, infstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // If buffer not large enough + if (infstream.avail_in > 0){ + DEBUG_RETURN_ERROR(NC_ENOMEM) + } + + // Size of decomrpessed data + if (out_len != NULL){ + *out_len = infstream.total_out; + + char *env_str; + if ((env_str = getenv("PNETCDF_COMPRESS_VERBOSE")) != NULL) { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + printf("rank %d (%s at %d) decompress data size %d into size %d\n", + rank,__func__,__LINE__,in_len,*out_len); + } + } + + return NC_NOERR; +} + +/* Decompress the data at in and save it to a newly allocated buffer at out. out_len is set to actual decompressed data size + * The caller is responsible to free the buffer + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_zlib_decompress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int bsize = in_len << 1; // Start by 2 times of the in_len + char *buf; + + buf = (char*)malloc(bsize); + + // zlib struct + z_stream infstream; + infstream.zalloc = Z_NULL; + infstream.zfree = Z_NULL; + infstream.opaque = Z_NULL; + infstream.avail_in = (uInt)(in_len); // input size + infstream.next_in = (Bytef*)in; // input + infstream.avail_out = (uInt)(bsize); // output buffer size + infstream.next_out = (Bytef *)buf; // output buffer + + // Initialize deflat stream + err = inflateInit(&infstream); + if (err != Z_OK){ + printf("inflateInit fail: %d: %s\n", err, infstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // The actual decompression work + err = Z_OK; + while (err != Z_STREAM_END){ + // Compress data + err = inflate(&infstream, Z_NO_FLUSH | Z_FINISH); + // Check if buffer is lage enough + if (err != Z_STREAM_END){ + // Enlarge buffer + buf = (char*)realloc(buf, bsize << 1); + + // Reset buffer info in stream + infstream.next_out = (Bytef *)(buf + bsize); + infstream.avail_out = bsize; + + // Reocrd new buffer size + bsize <<= 1; + } + } + + // Finalize deflat stream + err = inflateEnd(&infstream); + if (err != Z_OK){ + printf("inflateEnd fail: %d: %s\n", err, infstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // Size of comrpessed data + if (out_len != NULL){ + *out_len = infstream.total_out; + } + + // Compressed data + *out = buf; + + return NC_NOERR; +} + +static NCCHK_filter ncchk_driver_zlib = { + ncchk_zlib_init, + ncchk_zlib_finalize, + 
ncchk_zlib_inq_cpsize, + ncchk_zlib_compress, + ncchk_zlib_compress_alloc, + ncchk_zlib_inq_dcsize, + ncchk_zlib_decompress, + ncchk_zlib_decompress_alloc +}; + +NCCHK_filter* ncchk_zlib_inq_driver(void) { + return &ncchk_driver_zlib; +} + diff --git a/src/drivers/ncchunkio/ncchkio_attr.c b/src/drivers/ncchunkio/ncchkio_attr.c new file mode 100644 index 000000000..90cb5196a --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_attr.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_inq_attname() : dispatcher->inq_attname() + * ncmpi_inq_attid() : dispatcher->inq_attid() + * ncmpi_inq_att() : dispatcher->inq_att() + * ncmpi_rename_att() : dispatcher->inq_rename_att() + * ncmpi_copy_att() : dispatcher->inq_copy_att() + * ncmpi_del_att() : dispatcher->inq_del_att() + * ncmpi_get_att() : dispatcher->inq_get_att() + * ncmpi_put_att() : dispatcher->inq_put_arr() + * + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +#include +#include +#include +#include +#include "ncchkio_internal.h" + +int +ncchkio_inq_attname(void *ncdp, + int varid, + int attid, + char *name) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->inq_attname(ncchkp->ncp, varid, attid, name); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_inq_attid(void *ncdp, + int varid, + const char *name, + int *attidp) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->inq_attid(ncchkp->ncp, varid, name, attidp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_inq_att(void *ncdp, + int varid, + const char *name, + nc_type *datatypep, + MPI_Offset *lenp) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->inq_att(ncchkp->ncp, varid, name, datatypep, lenp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_rename_att(void *ncdp, + int varid, + const char *name, + const char *newname) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->rename_att(ncchkp->ncp, varid, name, newname); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + + +int +ncchkio_copy_att(void *ncdp_in, + int varid_in, + const char *name, + void *ncdp_out, + int varid_out) +{ + int err=NC_NOERR; + NC_chk *foo_in = (NC_chk*)ncdp_in; + NC_chk *foo_out = (NC_chk*)ncdp_out; + + err = foo_in->driver->copy_att(foo_in->ncp, varid_in, name, + foo_out->ncp, varid_out); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_del_att(void *ncdp, + int varid, + const char *name) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->del_att(ncchkp->ncp, varid, name); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_get_att(void *ncdp, + int varid, + const char *name, + void *buf, + MPI_Datatype itype) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->get_att(ncchkp->ncp, varid, name, buf, itype); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_put_att(void *ncdp, + int varid, + const char *name, + nc_type xtype, + MPI_Offset nelems, + const void *buf, + MPI_Datatype itype) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->put_att(ncchkp->ncp, varid, name, xtype, nelems, buf, + itype); + if (err != NC_NOERR) 
return err; + + return NC_NOERR; +} diff --git a/src/drivers/ncchunkio/ncchkio_dim.c b/src/drivers/ncchunkio/ncchkio_dim.c new file mode 100644 index 000000000..ef5cee4fb --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_dim.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_def_dim() : dispatcher->def_dim() + * ncmpi_inq_dimid() : dispatcher->inq_dimid() + * ncmpi_inq_dim() : dispatcher->inq_dim() + * ncmpi_rename_dim() : dispatcher->rename_dim() + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +#include + +#include +#include +#include +#include "ncchkio_internal.h" + +int +ncchkio_def_dim(void *ncdp, + const char *name, + MPI_Offset size, + int *dimidp) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->def_dim(ncchkp->ncp, name, size, dimidp); + if (err != NC_NOERR) return err; + + if (size == NC_UNLIMITED){ + ncchkp->recdim = *dimidp; + } + + return NC_NOERR; +} + +int +ncchkio_inq_dimid(void *ncdp, + const char *name, + int *dimid) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->inq_dimid(ncchkp->ncp, name, dimid); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_inq_dim(void *ncdp, + int dimid, + char *name, + MPI_Offset *sizep) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->inq_dim(ncchkp->ncp, dimid, name, sizep); + if (err != NC_NOERR) return err; + + if (dimid == ncchkp->recdim){ // update # records + if (*sizep < ncchkp->recsize){ + *sizep = ncchkp->recsize; + } + } + + return NC_NOERR; +} + +int +ncchkio_rename_dim(void *ncdp, + int dimid, + const char *newname) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->rename_dim(ncchkp->ncp, dimid, newname); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} diff --git a/src/drivers/ncchunkio/ncchkio_driver.c b/src/drivers/ncchunkio/ncchkio_driver.c new file mode 100644 index 000000000..1fda000f5 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_driver.c @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +static PNC_driver ncchkio_driver = { + /* FILE APIs */ + ncchkio_create, + ncchkio_open, + ncchkio_close, + ncchkio_enddef, + ncchkio__enddef, + ncchkio_redef, + ncchkio_sync, + ncchkio_flush, + ncchkio_abort, + ncchkio_set_fill, + ncchkio_inq, + ncchkio_inq_misc, + ncchkio_sync_numrecs, + ncchkio_begin_indep_data, + ncchkio_end_indep_data, + + /* DIMENSION APIs */ + ncchkio_def_dim, + ncchkio_inq_dimid, + ncchkio_inq_dim, + ncchkio_rename_dim, + + /* ATTRIBUTE APIs */ + ncchkio_inq_att, + ncchkio_inq_attid, + ncchkio_inq_attname, + ncchkio_copy_att, + ncchkio_rename_att, + ncchkio_del_att, + ncchkio_get_att, + ncchkio_put_att, + + /* VARIABLE APIs */ + ncchkio_def_var, + ncchkio_def_var_fill, + ncchkio_fill_var_rec, + ncchkio_inq_var, + ncchkio_inq_varid, + ncchkio_rename_var, + ncchkio_get_var, + ncchkio_put_var, + ncchkio_get_varn, + ncchkio_put_varn, + ncchkio_get_vard, + ncchkio_put_vard, + ncchkio_iget_var, + ncchkio_iput_var, + ncchkio_bput_var, + ncchkio_iget_varn, + ncchkio_iput_varn, + ncchkio_bput_varn, + + ncchkio_buffer_attach, + ncchkio_buffer_detach, + ncchkio_wait, + ncchkio_cancel +}; + +PNC_driver* ncchkio_inq_driver(void) { + return &ncchkio_driver; +} + diff --git a/src/drivers/ncchunkio/ncchkio_driver.h b/src/drivers/ncchunkio/ncchkio_driver.h new file mode 100644 index 000000000..94a728d76 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_driver.h @@ -0,0 +1,422 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +#ifndef _ncchkio_DRIVER_H +#define _ncchkio_DRIVER_H + +#include +#include +#include +#include + +#include "ncchkioi_profile.h" + +#define NC_CHK_VAR_RAW 0 +#define NC_CHK_VAR_COMPRESSED 1 +#define NC_CHK_VAR_DATA 2 +#define NC_CHK_VAR_META 3 + +#define NC_CHK_MAPPING_STATIC 0 +#define NC_CHK_MAPPING_DYNAMIC 01 + +#define NC_CHK_COMM_CHUNK 0 +#define NC_CHK_COMM_PROC 1 + +#define NC_CHK_ 1 + +/* Chunk cache structure */ +typedef struct NC_chk_cache { + char *buf; // Buffer + size_t bsize; // Size in byte + int serial; // batch number to detect swap out of cache allocated in the same batch + struct NC_chk_cache **ref; // Ref to clr when it is swap out + struct NC_chk_cache *prev; + struct NC_chk_cache *next; +} NC_chk_cache; + +/* Get_req structure */ +typedef struct NC_chk_req { + int varid; + int nreq; + MPI_Offset *start; + MPI_Offset **starts; + MPI_Offset *count; + MPI_Offset **counts; + MPI_Offset *stride; + MPI_Offset bufcount; + MPI_Datatype buftype; + char *buf; + char *xbuf; + char **xbufs; +} NC_chk_req; + +/* Get_req list structure */ +typedef struct NC_chk_req_list { + NC_chk_req *reqs; // Array of request object + int *ids; // Array of request ids + int *pos; // Array of position of request ids in ids + int nalloc; // Size of the pool + int nused; // Number of ids issued +} NC_chk_req_list; + +typedef struct NC_chk_var_chunk { + MPI_Offset *start; + MPI_Offset *xdata_offs; + MPI_Offset *xdata_lens; + int owner; + char *data; + char *xdata; +} NC_chk_var_chunk; + +typedef struct NC_chk_chunk_index_entry { + MPI_Offset off; + int len; +} NC_chk_chunk_index_entry; + +typedef struct NC_chk_var { + int varkind; + int isrec; + int isnew; + + nc_type xtype; + MPI_Datatype etype; + int esize; + + int ndim; + MPI_Offset *dimsize; + int *dimids; + + int varid; + + int nchunk; + int nchunkrec; + int nchunkalloc; + int nrec; + int nrecalloc; + int expanded; + int 
chunksize; + int *nchunks; + int *cidsteps; + int *chunk_owner; + int *chunkdim; + int *dirty; + NC_chk_cache **chunk_cache; + + int nmychunk; + int nmychunkrec; + int *mychunks; + + MPI_Offset metaoff; + NC_chk_chunk_index_entry *chunk_index; + // MPI_Offset *data_offs; + // int *data_lens; + + NCCHK_filter *filter_driver; /* Compression driver */ + int filter; + + int chunk_map_method; +} NC_chk_var; + +typedef struct NC_chk_var_list { + NC_chk_var *data; + int cnt; + int nalloc; +} NC_chk_var_list; + +typedef struct NC_chk NC_chk; /* forward reference */ +struct NC_chk { + int mode; /* file _open/_create mode */ + int flag; /* define/data/collective/indep mode */ + int rank; + int np; + char *path; /* path name */ + MPI_Comm comm; /* MPI communicator */ + void *ncp; /* pointer to driver's internal object */ + struct PNC_driver *driver; + int blockmapping; + MPI_Offset recsize; /* record dim size */ + MPI_Offset recnalloc; /* record dim allocated */ + MPI_Offset default_recnalloc; + int recdim; /* record dim id */ + NC_chk_var_list vars; + NC_chk_req_list putlist, getlist; + int comm_unit; + int delay_init; + int exact_cown; + int max_ndim; + int max_chunk_size; + MPI_Offset nmychunks; // Sum of nmychunk in everyvar + int default_filter; + int nwrite; + MPI_Offset getsize; + MPI_Offset putsize; + size_t cache_limit; + size_t cache_limit_hint; + size_t cache_used; + int cache_serial; + NC_chk_cache *cache_head; + NC_chk_cache *cache_tail; + int ndim; // Number of dim in file + int *chunkdim; // Default chunk dim for each dimension + MPI_Offset cown_size; // Size of all chunks owned + MPI_Datatype overlaptype; + MPI_Op max_cown_op; + MPI_Offset assigned_chunks; + double cown_ratio; + size_t hdr_reserve; // Additional reserve space in the file header +#ifdef PNETCDF_PROFILING + NC_chk_timers profile; + MPI_Offset sendsize; + MPI_Offset recvsize; + MPI_Offset var_size_sum; + MPI_Offset var_zsize_sum; + int nsend; + int nrecv; + int nremote; + int nreq; + int nlocal; +#endif +}; + +extern int ncchkio_create ( + MPI_Comm comm, const char *path, int cmode, int ncid, int env_mode, MPI_Info info, PNCIO_node_ids node_ids, void **ncdp); + +extern int ncchkio_open ( + MPI_Comm comm, const char *path, int omode, int ncid, int env_mode, MPI_Info info, PNCIO_node_ids node_ids, void **ncdp); + +extern int ncchkio_close (void *ncdp); + +extern int ncchkio_enddef (void *ncdp); + +extern int ncchkio__enddef ( + void *ncdp, MPI_Offset h_minfree, MPI_Offset v_align, MPI_Offset v_minfree, MPI_Offset r_align); + +extern int ncchkio_redef (void *ncdp); + +extern int ncchkio_sync (void *ncdp); + +extern int ncchkio_flush (void *ncdp); + +extern int ncchkio_abort (void *ncdp); + +extern int ncchkio_set_fill (void *ncdp, int fill_mode, int *old_fill_mode); + +extern int ncchkio_fill_var_rec (void *ncdp, int varid, MPI_Offset recno); + +extern int ncchkio_inq (void *ncdp, int *ndimsp, int *nvarsp, int *nattsp, int *xtendimp); + +extern int ncchkio_inq_misc (void *ncdp, + int *pathlen, + char *path, + int *num_fix_varsp, + int *num_rec_varsp, + int *striping_size, + int *striping_count, + MPI_Offset *header_size, + MPI_Offset *header_extent, + MPI_Offset *recsize, + MPI_Offset *put_size, + MPI_Offset *get_size, + MPI_Info *info_used, + int *nreqs, + MPI_Offset *usage, + MPI_Offset *buf_size); + +extern int ncchkio_sync_numrecs (void *ncdp); + +extern int ncchkio_begin_indep_data (void *ncdp); + +extern int ncchkio_end_indep_data (void *ncdp); + +extern int ncchkio_def_dim (void *ncdp, const char *name, MPI_Offset 
size, int *dimidp); + +extern int ncchkio_inq_dimid (void *ncdp, const char *name, int *dimidp); + +extern int ncchkio_inq_dim (void *ncdp, int dimid, char *name, MPI_Offset *lengthp); + +extern int ncchkio_rename_dim (void *ncdp, int dimid, const char *newname); + +extern int ncchkio_inq_att ( + void *ncdp, int varid, const char *name, nc_type *xtypep, MPI_Offset *lenp); + +extern int ncchkio_inq_attid (void *ncdp, int varid, const char *name, int *idp); + +extern int ncchkio_inq_attname (void *ncdp, int varid, int attnum, char *name); + +extern int ncchkio_copy_att ( + void *ncdp_in, int varid_in, const char *name, void *ncdp_out, int varid_out); + +extern int ncchkio_rename_att (void *ncdp, int varid, const char *name, const char *newname); + +extern int ncchkio_del_att (void *ncdp, int varid, const char *name); + +extern int ncchkio_get_att ( + void *ncdp, int varid, const char *name, void *value, MPI_Datatype itype); + +extern int ncchkio_put_att (void *ncdp, + int varid, + const char *name, + nc_type xtype, + MPI_Offset nelems, + const void *value, + MPI_Datatype itype); + +extern int ncchkio_def_var ( + void *ncdp, const char *name, nc_type type, int ndims, const int *dimids, int *varidp); + +extern int ncchkio_def_var_fill (void *ncdp, int varid, int nofill, const void *fill_value); + +extern int ncchkio_inq_var (void *ncdp, + int varid, + char *name, + nc_type *xtypep, + int *ndimsp, + int *dimids, + int *nattsp, + MPI_Offset *offsetp, + int *no_fill, + void *fill_value); + +extern int ncchkio_inq_varid (void *ncdp, const char *name, int *varid); + +extern int ncchkio_rename_var (void *ncdp, int varid, const char *newname); + +extern int ncchkio_get_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_put_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_get_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_put_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_get_vard (void *ncdp, + int varid, + MPI_Datatype filetype, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_put_vard (void *ncdp, + int varid, + MPI_Datatype filetype, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_iget_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *req, + int reqMode); + +extern int ncchkio_iput_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *req, + int reqMode); + +extern int ncchkio_bput_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset 
*imap, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *req, + int reqMode); + +extern int ncchkio_iget_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode); + +extern int ncchkio_iput_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode); + +extern int ncchkio_bput_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode); + +extern int ncchkio_buffer_attach (void *ncdp, MPI_Offset bufsize); + +extern int ncchkio_buffer_detach (void *ncdp); + +extern int ncchkio_wait (void *ncdp, int num_reqs, int *req_ids, int *statuses, int reqMode); + +extern int ncchkio_cancel (void *ncdp, int num_reqs, int *req_ids, int *statuses); + +#endif diff --git a/src/drivers/ncchunkio/ncchkio_file.c b/src/drivers/ncchunkio/ncchkio_file.c new file mode 100644 index 000000000..604d3fad2 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_file.c @@ -0,0 +1,841 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs + * + * ncmpi_create() : dispatcher->create() + * ncmpi_open() : dispatcher->open() + * ncmpi_close() : dispatcher->close() + * ncmpi_enddef() : dispatcher->enddef() + * ncmpi__enddef() : dispatcher->_enddef() + * ncmpi_redef() : dispatcher->redef() + * ncmpi_begin_indep_data() : dispatcher->begin_indep_data() + * ncmpi_end_indep_data() : dispatcher->end_indep_data() + * ncmpi_abort() : dispatcher->abort() + * ncmpi_inq() : dispatcher->inq() + * ncmpi_inq_misc() : dispatcher->inq_misc() + * ncmpi_wait() : dispatcher->wait() + * ncmpi_wait_all() : dispatcher->wait() + * ncmpi_cancel() : dispatcher->cancel() + * + * ncmpi_set_fill() : dispatcher->set_fill() + * ncmpi_fill_var_rec() : dispatcher->fill_rec() + * ncmpi_def_var_fill() : dispatcher->def_var_fill() + * ncmpi_inq_var_fill() : dispatcher->inq() + * + * ncmpi_sync() : dispatcher->sync() + * ncmpi_flush() : dispatcher->flush() + * ncmpi_sync_numrecs() : dispatcher->sync_numrecs() + * + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include /* strlen() */ + +#include "../ncmpio/ncmpio_NC.h" +#include "ncchkio_internal.h" + +int ncchkio_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ +{ + int err=NC_NOERR; + int one = 1; + void *ncp = NULL; + NC_chk *ncchkp; + PNC_driver *driver = NULL; +#ifdef PNETCDF_PROFILING + double t0; + t0 = MPI_Wtime (); +#endif + + /* TODO: use cmode to determine the true driver */ + driver = ncmpio_inq_driver (); + if (driver == NULL) return NC_ENOTNC; + + err = driver->create(comm, path, cmode | NC_64BIT_DATA, ncid, env_mode, info, node_ids, &ncp); + if (err != NC_NOERR) return err; + + /* Create a NC_chk object and save its driver pointer */ + ncchkp = (NC_chk *)NCI_Malloc (sizeof (NC_chk)); + if (ncchkp == NULL) DEBUG_RETURN_ERROR (NC_ENOMEM) + + ncchkp->path = (char *)NCI_Malloc (strlen (path) + 1); + if (ncchkp->path == NULL) { + NCI_Free 
(ncchkp); + DEBUG_RETURN_ERROR (NC_ENOMEM) + } + strcpy (ncchkp->path, path); + ncchkp->mode = cmode | NC_WRITE; + ncchkp->driver = driver; + ncchkp->flag = 0; + ncchkp->ncp = ncp; + ncchkp->comm = comm; + MPI_Comm_rank (comm, &(ncchkp->rank)); + MPI_Comm_size (comm, &(ncchkp->np)); + + ncchkioi_init (ncchkp, 1); + + err = ncchkioi_extract_hint (ncchkp, info); + if (err != NC_NOERR) return err; + + err = driver->put_att (ncchkp->ncp, NC_GLOBAL, "_comressed", NC_INT, 1, &one, + MPI_INT); // Mark this file as compressed + if (err != NC_NOERR) return err; + + *ncpp = ncchkp; + + // Timer array is not avaiable until init, can't use NC_CHK_TIMER_START +#ifdef PNETCDF_PROFILING + t0 = MPI_Wtime () - t0; + ncchkp->profile.tt[NC_CHK_TIMER_VAR_INIT] += t0; + ncchkp->profile.tt[NC_CHK_TIMER_TOTAL] += t0; +#endif + + return NC_NOERR; +} + +int ncchkio_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ +{ + int err=NC_NOERR; + int one = 0; + void *ncp = NULL; + NC_chk *ncchkp = NULL; + PNC_driver *driver = NULL; +#ifdef PNETCDF_PROFILING + double t0; + + t0 = MPI_Wtime (); +#endif + + /* TODO: use omode to determine the true driver */ + driver = ncmpio_inq_driver (); + if (driver == NULL) { + DEBUG_ASSIGN_ERROR (err, NC_ENOTNC) + goto errout; + } + + err = driver->open(comm, path, omode, ncid, env_mode, info, node_ids, &ncp); + if (err != NC_NOERR) goto errout; + + /* Create a NC_chk object and save its driver pointer */ + ncchkp = (NC_chk *)NCI_Malloc (sizeof (NC_chk)); + if (ncchkp == NULL) { + DEBUG_ASSIGN_ERROR (err, NC_ENOMEM) + goto errout; + } + + ncchkp->path = (char *)NCI_Malloc (strlen (path) + 1); + if (ncchkp->path == NULL) { + NCI_Free (ncchkp); + DEBUG_ASSIGN_ERROR (err, NC_ENOMEM) + goto errout; + } + strcpy (ncchkp->path, path); + ncchkp->mode = omode; + ncchkp->driver = driver; + if (ncchkp->mode & NC_WRITE) { + ncchkp->flag = 0; + } else { + ncchkp->flag |= NC_MODE_RDONLY; + } + ncchkp->ncp = ncp; + ncchkp->comm = comm; + MPI_Comm_rank (comm, &(ncchkp->rank)); + MPI_Comm_size (comm, &(ncchkp->np)); + + ncchkioi_init (ncchkp, 0); + + err = ncchkioi_extract_hint (ncchkp, info); + if (err != NC_NOERR) goto errout; + + err = driver->get_att (ncchkp->ncp, NC_GLOBAL, "_comressed", &one, + MPI_INT); // Mark this file as compressed + if (err != NC_NOERR) { + if (err == NC_ENOTATT) { err = NC_EINVAL; } + goto errout; + } + + // Not compressed file + if (one != 1) { + NCI_Free (ncchkp->path); + NCI_Free (ncchkp); + DEBUG_RETURN_ERROR (NC_EINVAL) + } + + err = ncchkioi_get_default_chunk_dim (ncchkp); + if (err != NC_NOERR) return err; + + ncchkioi_parse_var_info (ncchkp); + + *ncpp = ncchkp; + + // Timer array is not avaiable until init, can't use NC_CHK_TIMER_START +#ifdef PNETCDF_PROFILING + t0 = MPI_Wtime () - t0; + ncchkp->profile.tt[NC_CHK_TIMER_VAR_INIT] += t0; + ncchkp->profile.tt[NC_CHK_TIMER_TOTAL] += t0; +#endif + + return NC_NOERR; + +errout: + if (ncp != NULL) { driver->close (ncchkp->ncp); } + if (ncchkp != NULL) { + if (ncchkp->path != NULL) { NCI_Free (ncchkp->path); } + NCI_Free (ncchkp); + } + + return err; +} + +int ncchkio_close (void *ncdp) { + int err=NC_NOERR; +#ifdef PNETCDF_PROFILING + MPI_Offset put_size, get_size; + char *_env_str = getenv ("PNETCDF_SHOW_PERFORMANCE_INFO"); +#endif + NC_chk *ncchkp = (NC_chk *)ncdp; + +#ifdef PNETCDF_PROFILING + if (_env_str != NULL && *_env_str != '0') { ncchkioi_update_statistics (ncchkp); } +#endif + + 
NC_CHK_TIMER_START (NC_CHK_TIMER_FINALIZE) + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + + if (ncchkp == NULL) DEBUG_RETURN_ERROR (NC_EBADID) + + if (!(ncchkp->flag & NC_MODE_RDONLY)) { + int i; + + NC_CHK_TIMER_START (NC_CHK_TIMER_FINALIZE_META) + + err = ncchkp->driver->redef (ncchkp->ncp); + if (err != NC_NOERR) { return err; } + + // record chunk dim + for (i = 0; i < ncchkp->vars.cnt; i++) { + if (ncchkp->vars.data[i].isnew) { + err = ncchkp->driver->put_att (ncchkp->ncp, ncchkp->vars.data[i].varid, "_chunkdim", + NC_INT, ncchkp->vars.data[i].ndim, + ncchkp->vars.data[i].chunkdim, MPI_INT); + if (err != NC_NOERR) { return err; } + err = + ncchkp->driver->put_att (ncchkp->ncp, ncchkp->vars.data[i].varid, "_filter", + NC_INT, 1, &(ncchkp->vars.data[i].filter), MPI_INT); + if (err != NC_NOERR) { return err; } + } + } + + // Record recsize + err = ncchkp->driver->put_att (ncchkp->ncp, NC_GLOBAL, "_recsize", NC_INT64, 1, + &(ncchkp->recsize), + MPI_LONG_LONG); // Mark this file as compressed + if (err != NC_NOERR) return err; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_FINALIZE_META) + } + +#ifdef PNETCDF_PROFILING + err = ncchkp->driver->inq_misc (ncchkp->ncp, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, &put_size, &get_size, NULL, NULL, NULL, NULL); + CHK_ERR + ncchkp->putsize += put_size; + ncchkp->getsize += get_size; +#endif + err = ncchkp->driver->close (ncchkp->ncp); + CHK_ERR + + ncchkioi_cache_free (ncchkp); + + err = ncchkioi_var_list_free (&(ncchkp->vars)); + CHK_ERR + + err = ncchkioi_req_list_free (&(ncchkp->putlist)); + CHK_ERR + err = ncchkioi_req_list_free (&(ncchkp->getlist)); + CHK_ERR + + NCI_Free (ncchkp->chunkdim); + + if (ncchkp->overlaptype != MPI_DATATYPE_NULL) { MPI_Type_free (&(ncchkp->overlaptype)); } + if (ncchkp->max_cown_op != MPI_OP_NULL) { MPI_Op_free (&(ncchkp->max_cown_op)); } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_FINALIZE) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + +#ifdef PNETCDF_PROFILING + if (_env_str != NULL && *_env_str != '0') { + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_PUT_SIZE, + (double)ncchkp->putsize / 1048576.0f); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_GET_SIZE, + (double)ncchkp->getsize / 1048576.0f); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_SEND_SIZE, + (double)ncchkp->sendsize / 1048576.0f); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_RECV_SIZE, + (double)ncchkp->recvsize / 1048576.0f); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NSEND, (double)ncchkp->nsend); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NRECV, (double)ncchkp->nrecv); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NREMOTE, (double)ncchkp->nremote); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NREQ, (double)ncchkp->nreq); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NLOCAL, (double)ncchkp->nlocal); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NCHUNK, (double)ncchkp->nmychunks); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_VAR_SIZE, + (double)ncchkp->var_size_sum / 1048576.0f); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_VAR_ZSIZE, + (double)ncchkp->var_zsize_sum / 1048576.0f); + + ncchkioi_print_profile (ncchkp); + } +#endif + +err_out:; + + NCI_Free (ncchkp->path); + + NCI_Free (ncchkp); + + return err; +} + +int ncchkio_enddef (void *ncdp) { + int err=NC_NOERR, ret; + int i; + MPI_Offset logrecnalloc, drecnalloc; + MPI_Offset rsize; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT) + + drecnalloc = 1; + 
+    logrecnalloc = 0;
+    while (drecnalloc < ncchkp->default_recnalloc) {
+        logrecnalloc++;
+        drecnalloc <<= 1;
+    }
+
+    // Reserve header space
+    rsize = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        varp = ncchkp->vars.data + i;
+        if (varp->varkind == NC_CHK_VAR_COMPRESSED) {
+            if (varp->isrec) {
+                rsize +=
+                    ((8 + 16) + 4 + 8 + 4) * 8 + ((8 + 16) + 4 + 8 + 4 * varp->ndim) * 2; // Atts
+                rsize += ((8 + 32) + 8) * (ncchkp->default_recnalloc + logrecnalloc + 1); // dims
+                rsize += ((8 + 32) + 8 + 8 + (8 + 8 + (8 + 12 + 4 + 8 + 4)) + 4 + 8 + 8) *
+                         (ncchkp->default_recnalloc + 2 * logrecnalloc); // vars
+            } else {
+                rsize +=
+                    ((8 + 16) + 4 + 8 + 4) * 8 + ((8 + 16) + 4 + 8 + 4 * varp->ndim) * 2; // Atts
+                rsize += ((8 + 32) + 8) * 3; // dims
+                rsize +=
+                    ((8 + 32) + 8 + 8 + (8 + 8 + (8 + 12 + 4 + 8 + 4)) + 4 + 8 + 8) * 3; // vars
+            }
+        } else {
+            rsize += ((8 + 16) + 4 + 8 + 4); // Atts
+        }
+    }
+    //rsize *= 2; // 2 times for future expansion
+    // Add additional reserve size
+    rsize += ncchkp->hdr_reserve;
+
+    err = ncchkp->driver->_enddef (ncchkp->ncp, rsize, 0, 0, 0);
+    if (err != NC_NOERR) return err;
+
+    err = ncchkioi_get_default_chunk_dim (ncchkp);
+    if (err != NC_NOERR) return err;
+
+    if (!(ncchkp->delay_init)) {
+        int nread;
+        int *lens;
+        MPI_Aint *fdisps, *mdisps;
+        MPI_Datatype ftype, mtype;
+        MPI_Status status;
+        NC_chk_var *varp;
+
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META)
+
+        lens   = NCI_Malloc (sizeof (int) * ncchkp->vars.cnt);
+        fdisps = NCI_Malloc (sizeof (MPI_Aint) * ncchkp->vars.cnt * 2);
+        mdisps = fdisps + ncchkp->vars.cnt;
+
+        nread = 0;
+        for (i = 0; i < ncchkp->vars.cnt; i++) {
+            varp = ncchkp->vars.data + i;
+
+            err = ncchkioi_var_init (ncchkp, varp, 0, NULL, NULL);
+            CHK_ERR
+
+            if (!(varp->isnew)) {
+                ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset",
+                                               &(varp->metaoff), MPI_LONG_LONG);
+                if (ret == NC_NOERR) {
+                    lens[nread]     = sizeof (NC_chk_chunk_index_entry) * (varp->nchunk);
+                    fdisps[nread]   = varp->metaoff;
+                    mdisps[nread++] = (MPI_Aint) (varp->chunk_index);
+                } else {
+                    varp->metaoff = -1;
+                    memset (varp->chunk_index, 0,
+                            sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1));
+                }
+            }
+        }
+
+        if (nread) {
+            ncchkioi_sort_file_offset (nread, fdisps, mdisps, lens);
+
+            MPI_Type_create_hindexed (nread, lens, fdisps, MPI_BYTE, &ftype);
+            CHK_ERR_TYPE_COMMIT (&ftype);
+
+            MPI_Type_create_hindexed (nread, lens, mdisps, MPI_BYTE, &mtype);
+            CHK_ERR_TYPE_COMMIT (&mtype);
+
+            // Set file view
+            CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh,
+                              ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, ftype, "native",
+                              MPI_INFO_NULL);
+
+            // Read data
+            CHK_ERR_READ_AT_ALL (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BOTTOM, 1, mtype,
+                                 &status);
+
+            // Restore file view
+            CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native",
+                              MPI_INFO_NULL);
+
+#ifdef WORDS_BIGENDIAN // Switch back to big endian
+            ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk + 1);
+#endif
+            MPI_Type_free (&ftype);
+            MPI_Type_free (&mtype);
+        }
+
+        NCI_Free (lens);
+        NCI_Free (fdisps);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META)
+    }
+
+err_out:;
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT)
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    return err;
+}
+
+int ncchkio__enddef (void *ncdp,
+                     MPI_Offset h_minfree,
+                     MPI_Offset v_align,
+                     MPI_Offset v_minfree,
+                     MPI_Offset r_align) {
+    int err=NC_NOERR, ret;
+    int i;
+    MPI_Offset logrecnalloc, drecnalloc;
+    MPI_Offset rsize;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
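+    /* Same header-space estimation as in ncchkio_enddef() above, except that
+     * the caller-supplied h_minfree/v_align/v_minfree/r_align hints are
+     * forwarded to the underlying driver, with the estimated chunk-metadata
+     * header size added on top of h_minfree. */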
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT)
+
+    drecnalloc   = 1;
+    logrecnalloc = 0;
+    while (drecnalloc < ncchkp->default_recnalloc) {
+        logrecnalloc++;
+        drecnalloc <<= 1;
+    }
+
+    // Reserve header space
+    rsize = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        varp = ncchkp->vars.data + i;
+        if (varp->varkind == NC_CHK_VAR_COMPRESSED) {
+            if (varp->isrec) {
+                rsize +=
+                    ((8 + 16) + 4 + 8 + 4) * 8 + ((8 + 16) + 4 + 8 + 4 * varp->ndim) * 2; // Atts
+                rsize += ((8 + 32) + 8) * (ncchkp->default_recnalloc + logrecnalloc + 1); // dims
+                rsize += ((8 + 32) + 8 + 8 + (8 + 8 + (8 + 12 + 4 + 8 + 4)) + 4 + 8 + 8) *
+                         (ncchkp->default_recnalloc + 2 * logrecnalloc); // vars
+            } else {
+                rsize +=
+                    ((8 + 16) + 4 + 8 + 4) * 8 + ((8 + 16) + 4 + 8 + 4 * varp->ndim) * 2; // Atts
+                rsize += ((8 + 32) + 8) * 3; // dims
+                rsize +=
+                    ((8 + 32) + 8 + 8 + (8 + 8 + (8 + 12 + 4 + 8 + 4)) + 4 + 8 + 8) * 3; // vars
+            }
+        } else {
+            rsize += ((8 + 16) + 4 + 8 + 4); // Atts
+        }
+    }
+    rsize *= 2; // 2 times for future expansion
+
+    err = ncchkp->driver->_enddef (ncchkp->ncp, h_minfree + rsize, v_align, v_minfree, r_align);
+    if (err != NC_NOERR) return err;
+
+    err = ncchkioi_get_default_chunk_dim (ncchkp);
+    if (err != NC_NOERR) return err;
+
+    if (!(ncchkp->delay_init)) {
+        int nread;
+        int *lens;
+        MPI_Aint *fdisps, *mdisps;
+        MPI_Datatype ftype, mtype;
+        MPI_Status status;
+        NC_chk_var *varp;
+
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META)
+
+        lens   = NCI_Malloc (sizeof (int) * ncchkp->vars.cnt);
+        fdisps = NCI_Malloc (sizeof (MPI_Aint) * ncchkp->vars.cnt * 2);
+        mdisps = fdisps + ncchkp->vars.cnt;
+
+        nread = 0;
+        for (i = 0; i < ncchkp->vars.cnt; i++) {
+            varp = ncchkp->vars.data + i;
+
+            err = ncchkioi_var_init (ncchkp, varp, 0, NULL, NULL);
+            CHK_ERR
+
+            if (!(varp->isnew)) {
+                ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset",
+                                               &(varp->metaoff), MPI_LONG_LONG);
+                if (ret == NC_NOERR) {
+                    lens[nread]     = sizeof (NC_chk_chunk_index_entry) * (varp->nchunk);
+                    fdisps[nread]   = varp->metaoff;
+                    mdisps[nread++] = (MPI_Aint) (varp->chunk_index);
+                } else {
+                    varp->metaoff = -1;
+                    memset (varp->chunk_index, 0,
+                            sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1));
+                }
+            }
+        }
+
+        if (nread) {
+            ncchkioi_sort_file_offset (nread, fdisps, mdisps, lens);
+
+            MPI_Type_create_hindexed (nread, lens, fdisps, MPI_BYTE, &ftype);
+            CHK_ERR_TYPE_COMMIT (&ftype);
+
+            MPI_Type_create_hindexed (nread, lens, mdisps, MPI_BYTE, &mtype);
+            CHK_ERR_TYPE_COMMIT (&mtype);
+
+            // Set file view
+            CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh,
+                              ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, ftype, "native",
+                              MPI_INFO_NULL);
+
+            // Read data
+            CHK_ERR_READ_AT_ALL (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BOTTOM, 1, mtype,
+                                 &status);
+
+            // Restore file view
+            CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native",
+                              MPI_INFO_NULL);
+
+#ifdef WORDS_BIGENDIAN // Switch back to big endian
+            ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk + 1);
+#endif
+
+            MPI_Type_free (&ftype);
+            MPI_Type_free (&mtype);
+        }
+
+        NCI_Free (lens);
+        NCI_Free (fdisps);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META)
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT)
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+err_out:;
+    return err;
+}
+
+int ncchkio_redef (void *ncdp) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    err = ncchkp->driver->redef (ncchkp->ncp);
+    if (err != NC_NOERR) return err;
+
+    return NC_NOERR;
+}
+
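+/* Nonblocking request IDs handed back to callers carry a tag in the lowest
+ * bit so that ncchkio_wait() can route them: requests served by the
+ * underlying ncmpio driver (raw variables) are returned as id*2+1, requests
+ * served by the chunking layer as id*2, and the tag is stripped with id>>1
+ * before dispatch. A minimal sketch of the decoding performed in
+ * ncchkio_wait() below:
+ *
+ *     if (req_ids[i] & 1) rawreqs[nraw++] = req_ids[i] >> 1;  // driver request
+ *     else                comreqs[ncom++] = req_ids[i] >> 1;  // chunked request
+ */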
+int ncchkio_begin_indep_data (void *ncdp) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    err = ncchkp->driver->begin_indep_data (ncchkp->ncp);
+    if (err != NC_NOERR) return err;
+
+    return NC_NOERR;
+}
+
+int ncchkio_end_indep_data (void *ncdp) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    err = ncchkp->driver->end_indep_data (ncchkp->ncp);
+    if (err != NC_NOERR) return err;
+
+    return NC_NOERR;
+}
+
+int ncchkio_abort (void *ncdp) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    if (ncchkp == NULL) DEBUG_RETURN_ERROR (NC_EBADID)
+
+    err = ncchkp->driver->abort (ncchkp->ncp);
+
+    NCI_Free (ncchkp->path);
+    NCI_Free (ncchkp);
+
+    return err;
+}
+
+int ncchkio_inq (void *ncdp, int *ndimsp, int *nvarsp, int *nattsp, int *xtendimp) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    err = ncchkp->driver->inq (ncchkp->ncp, ndimsp, NULL, nattsp, xtendimp);
+    if (err != NC_NOERR) return err;
+
+    if (nvarsp != NULL) { *nvarsp = ncchkp->vars.cnt; }
+
+    return NC_NOERR;
+}
+
+int ncchkio_inq_misc (void *ncdp,
+                      int *pathlen,
+                      char *path,
+                      int *num_fix_varsp,
+                      int *num_rec_varsp,
+                      int *striping_size,
+                      int *striping_count,
+                      MPI_Offset *header_size,
+                      MPI_Offset *header_extent,
+                      MPI_Offset *recsize,
+                      MPI_Offset *put_size,
+                      MPI_Offset *get_size,
+                      MPI_Info *info_used,
+                      int *nreqs,
+                      MPI_Offset *usage,
+                      MPI_Offset *buf_size) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    err = ncchkp->driver->inq_misc (ncchkp->ncp, pathlen, path, num_fix_varsp, num_rec_varsp,
+                                    striping_size, striping_count, header_size, header_extent,
+                                    recsize, put_size, get_size, info_used, nreqs, usage, buf_size);
+    if (err != NC_NOERR) return err;
+
+    if (num_fix_varsp != NULL) { *num_fix_varsp = ncchkp->vars.cnt; }
+
+    if (nreqs != NULL) { *nreqs = ncchkp->putlist.nused + ncchkp->getlist.nused; }
+
+    if (put_size != NULL) { *put_size += ncchkp->putsize; }
+
+    if (get_size != NULL) { *get_size += ncchkp->getsize; }
+
+    return NC_NOERR;
+}
+
+int ncchkio_cancel (void *ncdp, int num_req, int *req_ids, int *statuses) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    err = ncchkp->driver->cancel (ncchkp->ncp, num_req, req_ids, statuses);
+    if (err != NC_NOERR) return err;
+
+    return NC_NOERR;
+}
+
+int ncchkio_wait (void *ncdp, int num_reqs, int *req_ids, int *statuses, int reqMode) {
+    int err = NC_NOERR, status = NC_NOERR;
+    int i;
+    int ncom = 0, nraw = 0;
+    int *rawreqs = NULL, *comreqs = NULL;
+    int *rawstats = NULL, *comstats = NULL;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT)
+
+    if (num_reqs < 0) { // NC_REQ_ALL || nreqs == NC_PUT_REQ_ALL || nreqs == NC_GET_REQ_ALL
+        err = ncchkioi_wait (ncchkp, num_reqs, NULL, NULL, reqMode);
+        if (status == NC_NOERR) { status = err; }
+        err = ncchkp->driver->wait (ncchkp->ncp, num_reqs, NULL, NULL, reqMode);
+        if (status == NC_NOERR) { status = err; }
+        goto done;
+    }
+
+    if (num_reqs > 0) {
+        // Count raw (driver) requests; odd IDs are raw, even IDs are chunked
+        for (i = 0; i < num_reqs; i++) {
+            if (req_ids[i] & 1) { nraw++; }
+        }
+
+        // Allocate buffer
+        ncom    = num_reqs - nraw;
+        rawreqs = (int *)NCI_Malloc (sizeof (int) * nraw);
+        CHK_PTR (rawreqs)
+        comreqs = (int *)NCI_Malloc (sizeof (int) * ncom);
+        CHK_PTR (comreqs)
+
+        // Build raw and chunked request lists
+        nraw = ncom = 0;
+        for (i = 0; i < num_reqs; i++) {
+            if (req_ids[i] & 1) {
+                rawreqs[nraw++] = req_ids[i] >> 1;
+            } else {
+                comreqs[ncom++] = req_ids[i] >> 1;
+ } + } + } + + if (statuses != NULL) { + rawstats = (int *)NCI_Malloc (sizeof (int) * nraw); + CHK_PTR (rawstats) + comstats = (int *)NCI_Malloc (sizeof (int) * ncom); + CHK_PTR (comstats) + } else { + rawstats = NULL; + comstats = NULL; + } + + if (nraw > 0 || reqMode == NC_REQ_COLL) { + err = ncchkp->driver->wait (ncchkp->ncp, nraw, rawreqs, rawstats, reqMode); + if (status == NC_NOERR) { status = err; } + } + + if (ncom > 0 || reqMode == NC_REQ_COLL) { + err = ncchkioi_wait (ncchkp, ncom, comreqs, comstats, reqMode); + if (status == NC_NOERR) { status = err; } + } + + // Assign stats + if (statuses != NULL) { + nraw = ncom = 0; + for (i = 0; i < num_reqs; i++) { + if (req_ids[i] & 1) { + statuses[i] = rawstats[nraw++]; + } else { + statuses[i] = comstats[ncom++]; + } + } + + NCI_Free (rawstats); + NCI_Free (comstats); + } + + NCI_Free (rawreqs); + NCI_Free (comreqs); + +err_out:; + if (status == NC_NOERR) status = err; +done:; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_WAIT) + + return status; +} + +int ncchkio_set_fill (void *ncdp, int fill_mode, int *old_fill_mode) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->set_fill (ncchkp->ncp, fill_mode, old_fill_mode); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_fill_var_rec (void *ncdp, int varid, MPI_Offset recno) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->fill_var_rec (ncchkp->ncp, varid, recno); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_def_var_fill (void *ncdp, int varid, int no_fill, const void *fill_value) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->def_var_fill (ncchkp->ncp, varid, no_fill, fill_value); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_sync_numrecs (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->sync_numrecs (ncchkp->ncp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_sync (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->sync (ncchkp->ncp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_flush (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->flush (ncchkp->ncp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} diff --git a/src/drivers/ncchunkio/ncchkio_internal.c b/src/drivers/ncchunkio/ncchkio_internal.c new file mode 100644 index 000000000..9ad02e6e7 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_internal.c @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2018, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <mpi.h>
+#include <pnc_debug.h>
+#include <common.h>
+
+#include "../ncmpio/ncmpio_NC.h"
+#include "ncchkio_internal.h"
+
+int ncchkioi_init (NC_chk *ncchkp, int isnew) {
+    int err=NC_NOERR;
+
+    ncchkp->max_ndim        = 0;
+    ncchkp->max_chunk_size  = 0;
+    ncchkp->getsize         = 0;
+    ncchkp->putsize         = 0;
+    ncchkp->nmychunks       = 0;
+    ncchkp->nwrite          = 0;
+    ncchkp->cache_head      = NULL;
+    ncchkp->cache_tail      = NULL;
+    ncchkp->cache_used      = 0;
+    ncchkp->cache_limit     = 0;
+    ncchkp->cache_serial    = 0;
+    ncchkp->ndim            = 0;
+    ncchkp->chunkdim        = NULL;
+    ncchkp->assigned_chunks = 0;
+    ncchkp->cown_size       = 0;
+    ncchkp->max_cown_op     = MPI_OP_NULL;
+    ncchkp->overlaptype     = MPI_DATATYPE_NULL;
+
+    err = ncchkp->driver->inq (ncchkp->ncp, NULL, NULL, NULL, &(ncchkp->recdim));
+    if (err != NC_NOERR) return err;
+
+    if (isnew) {
+        ncchkp->recsize = 0;
+    } else {
+        err = ncchkp->driver->get_att (ncchkp->ncp, NC_GLOBAL, "_recsize", &(ncchkp->recsize),
+                                       MPI_LONG_LONG);
+        CHK_ERR // Restore the record count saved at close
+    }
+
+    /* Initialize var list */
+    err = ncchkioi_var_list_init (&(ncchkp->vars));
+    if (err != NC_NOERR) return err;
+
+    /* Initialize nonblocking list */
+    err = ncchkioi_req_list_init (&(ncchkp->getlist));
+    if (err != NC_NOERR) return err;
+    err = ncchkioi_req_list_init (&(ncchkp->putlist));
+    if (err != NC_NOERR) return err;
+
+#ifdef PNETCDF_PROFILING
+    memset (&(ncchkp->profile), 0, sizeof (NC_chk_timers));
+    ncchkp->sendsize = 0;
+    ncchkp->recvsize = 0;
+    ncchkp->nsend    = 0;
+    ncchkp->nrecv    = 0;
+    ncchkp->nremote  = 0;
+    ncchkp->nreq     = 0;
+    ncchkp->nlocal   = 0;
+#endif
+
+err_out:;
+    return err;
+}
+
+int ncchkioi_parse_var_info (NC_chk *ncchkp) {
+    int err=NC_NOERR, ret;
+    int vid;
+    int i;
+    int nvar;
+    int varkind;
+    NC_chk_var *varp;
+
+    int nread;
+    int *lens;
+    MPI_Aint *fdisps, *mdisps;
+    MPI_Datatype ftype, mtype;
+    MPI_Status status;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META)
+
+    err = ncchkp->driver->inq (ncchkp->ncp, NULL, &nvar, NULL, &(ncchkp->recdim));
+    CHK_ERR
+
+    if (nvar > 0) {
+        for (vid = 0; vid < nvar; vid++) {
+            err = ncchkp->driver->get_att (ncchkp->ncp, vid, "_varkind", &varkind,
+                                           MPI_INT); // Compressed var?
+            if (err != NC_NOERR) { continue; }
+
+            if (varkind == NC_CHK_VAR_COMPRESSED || varkind == NC_CHK_VAR_RAW) {
+                err = ncchkioi_var_list_add (&(ncchkp->vars));
+                if (err < 0) return err;
+                varp = ncchkp->vars.data + err;
+
+                memset (varp, 0, sizeof (NC_chk_var));
+
+                varp->varid   = vid;
+                varp->varkind = varkind;
+
+                if (varp->varkind == NC_CHK_VAR_COMPRESSED) {
+                    err = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_ndim", &(varp->ndim),
+                                                   MPI_INT); // Original dimensions
+                    if (err != NC_NOERR) return err;
+
+                    varp->dimids = (int *)NCI_Malloc (sizeof (int) * varp->ndim);
+                    CHK_PTR (varp->dimids)
+                    varp->dimsize = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim);
+                    CHK_PTR (varp->dimsize)
+
+                    err = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_dimids",
+                                                   varp->dimids, MPI_INT); // Dimension IDs
+                    if (err != NC_NOERR) return err;
+
+                    for (i = 0; i < varp->ndim; i++) {
+                        ncchkp->driver->inq_dim (ncchkp->ncp, varp->dimids[i], NULL,
+                                                 varp->dimsize + i);
+                    }
+                    if (varp->dimids[0] == ncchkp->recdim) {
+                        varp->isrec = 1;
+                        if (varp->dimsize[0] < ncchkp->recsize) {
+                            varp->dimsize[0] = ncchkp->recsize;
+                        }
+                    } else {
+                        varp->isrec = 0;
+                    }
+
+                    err = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_datatype",
+                                                   &(varp->xtype), MPI_INT); // Original datatype
+                    if (err != NC_NOERR) return err;
+
+                    varp->esize    = NC_Type_size (varp->xtype);
+                    varp->etype    = ncmpii_nc2mpitype (varp->xtype);
+                    varp->chunkdim = NULL;
+                }
+            }
+        }
+
+        // Collective read index table
+        if (!(ncchkp->delay_init)) {
+            lens = NCI_Malloc (sizeof (int) * nvar);
+            CHK_PTR (lens)
+            fdisps = NCI_Malloc (sizeof (MPI_Aint) * nvar * 2);
+            CHK_PTR (fdisps)
+            mdisps = fdisps + nvar;
+
+            nread = 0;
+            for (vid = 0; vid < ncchkp->vars.cnt; vid++) {
+                varp = ncchkp->vars.data + vid;
+
+                if (varp->varkind == NC_CHK_VAR_COMPRESSED) {
+                    // Init var
+                    err = ncchkioi_var_init (ncchkp, varp, 0, NULL, NULL);
+                    CHK_ERR
+
+                    ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset",
+                                                   &(varp->metaoff), MPI_LONG_LONG);
+                    if (ret == NC_NOERR) {
+                        lens[nread]     = sizeof (NC_chk_chunk_index_entry) * (varp->nchunk);
+                        fdisps[nread]   = varp->metaoff;
+                        mdisps[nread++] = (MPI_Aint) (varp->chunk_index);
+                    } else {
+                        varp->metaoff = -1;
+                        memset (varp->chunk_index, 0,
+                                sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1));
+                    }
+                }
+            }
+
+            if (nread) {
+                ncchkioi_sort_file_offset (nread, fdisps, mdisps, lens);
+
+                MPI_Type_create_hindexed (nread, lens, fdisps, MPI_BYTE, &ftype);
+                CHK_ERR_TYPE_COMMIT (&ftype);
+
+                MPI_Type_create_hindexed (nread, lens, mdisps, MPI_BYTE, &mtype);
+                CHK_ERR_TYPE_COMMIT (&mtype);
+
+                // Set file view
+                CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh,
+                                  ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, ftype, "native",
+                                  MPI_INFO_NULL);
+
+                // Read data
+                CHK_ERR_READ_AT_ALL (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BOTTOM, 1, mtype,
+                                     &status);
+
+                // Restore file view
+                CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BYTE, MPI_BYTE,
+                                  "native", MPI_INFO_NULL);
+
+#ifdef WORDS_BIGENDIAN // Switch back to big endian
+                ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk + 1);
+#endif
+
+                MPI_Type_free (&ftype);
+                MPI_Type_free (&mtype);
+            }
+
+            NCI_Free (lens);
+            NCI_Free (fdisps);
+        }
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META)
+
+err_out:;
+    return err;
+}
diff --git a/src/drivers/ncchunkio/ncchkio_internal.h b/src/drivers/ncchunkio/ncchkio_internal.h
new file mode 100644
index 000000000..992670b41
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkio_internal.h
@@ -0,0 +1,390 @@
+#ifndef _ncchkio_INTERNAL_H
+#define _ncchkio_INTERNAL_H
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "ncchkio_driver.h"
+#ifdef PNETCDF_DEBUG
+#include <assert.h>
+#endif
+
+#define NC_CHK_FILTER_NONE 0
+#define NC_CHK_FILTER_DUMMY 1
+#define NC_CHK_FILTER_ZLIB 2
+#define NC_CHK_FILTER_SZ 3
+
+#define NC_CHK_DEFAULT_REC_ALLOC 1024
+#define NC_CHK_REC_MULTIPLIER 2
+
+#ifdef PNETCDF_DEBUG
+#define DEBUG_ABORT \
+    { \
+        char *_env_str = getenv ("PNETCDF_ABORT_ON_ERR"); \
+        if (_env_str != NULL && *_env_str != '0') { abort (); } \
+    }
+#else
+#define DEBUG_ABORT
+#endif
+
+#define RET_ERR(E) \
+    { \
+        err = E; \
+        DEBUG_TRACE_ERROR (err); \
+        DEBUG_ABORT \
+        goto err_out; \
+    }
+#define CHK_ERR \
+    if (err != NC_NOERR) { \
+        DEBUG_ABORT \
+        goto err_out; \
+    }
+
+#define CHK_MPIERR \
+    if (err != MPI_SUCCESS) { \
+        err = ncmpii_error_mpi2nc (err, "MPI"); \
+        DEBUG_TRACE_ERROR (err); \
+        DEBUG_ABORT \
+        goto err_out; \
+    }
+
+#define CHK_PTR(P) \
+    if (!(P)) { \
+        err = NC_ENOMEM; \
+        DEBUG_TRACE_ERROR (err); \
+        DEBUG_ABORT \
+        goto err_out; \
+    }
+
+#define CHK_ERR_WAIT(V0, V1) \
+    err = MPI_Wait (V0, V1); \
+    CHK_MPIERR
+
+#define CHK_ERR_ALLREDUCE(V0, V1, V2, V3, V4, V5) \
+    err = MPI_Allreduce (V0, V1, V2, V3, V4, V5); \
+    CHK_MPIERR
+
+#define CHK_ERR_IALLREDUCE(V0, V1, V2, V3, V4, V5, V6) \
+    err = MPI_Iallreduce (V0, V1, V2, V3, V4, V5, V6); \
+    CHK_MPIERR
+
+#define CHK_ERR_REDUCE(V0, V1, V2, V3, V4, V5, V6) \
+    err = MPI_Reduce (V0, V1, V2, V3, V4, V5, V6); \
+    CHK_MPIERR
+
+#define CHK_ERR_GATHER(V0, V1, V2, V3, V4, V5, V6, V7) \
+    err = MPI_Gather (V0, V1, V2, V3, V4, V5, V6, V7); \
+    CHK_MPIERR
+
+#ifdef PNETCDF_DEBUG
+#define CHK_ERR_PACK(V0, V1, V2, V3, V4, V5, V6) \
+    { \
+        assert ((V0) != NULL); \
+        assert ((V3) != NULL); \
+        err = MPI_Pack (V0, V1, V2, V3, V4, V5, V6); \
+        CHK_MPIERR \
+    }
+#else
+#define CHK_ERR_PACK(V0, V1, V2, V3, V4, V5, V6) \
+    err = MPI_Pack (V0, V1, V2, V3, V4, V5, V6); \
+    CHK_MPIERR
+#endif
+
+#ifdef PNETCDF_DEBUG
+#define CHK_ERR_UNPACK(V0, V1, V2, V3, V4, V5, V6) \
+    { \
+        int esize; \
+        MPI_Type_size (V5, &esize); \
+        if (V1 - *((int *)(V2)) < V4 * esize) { abort (); } \
+        err = MPI_Unpack (V0, V1, V2, V3, V4, V5, V6); \
+        CHK_MPIERR \
+    }
+#else
+#define CHK_ERR_UNPACK(V0, V1, V2, V3, V4, V5, V6) \
+    err = MPI_Unpack (V0, V1, V2, V3, V4, V5, V6); \
+    CHK_MPIERR
+#endif
+
+#define CHK_ERR_TYPE_COMMIT(V0) \
+    err = MPI_Type_commit (V0); \
+    CHK_MPIERR
+
+#ifdef PNETCDF_DEBUG
+#define CHK_ERR_TYPE_CREATE_SUBARRAY(V0, V1, V2, V3, V4, V5, V6) \
+    { \
+        int d; \
+        for (d = 0; d < V0; d++) { \
+            if (V1[d] < V2[d] + V3[d]) { \
+                printf ( \
+                    "Error: Subarray outside array at dim %d. size = %d, ssize = %d, start = " \
+                    "%d\n", \
+                    d, V1[d], V2[d], V3[d]); \
+                abort (); \
+            } \
+            if (V2[d] <= 0) { \
+                printf ("Error: Subarray size <= 0 at dim %d. 
ssize = %d\n", d, V2[d]); \ + abort (); \ + } \ + } \ + err = MPI_Type_create_subarray (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR \ + } +#else +#define CHK_ERR_TYPE_CREATE_SUBARRAY(V0, V1, V2, V3, V4, V5, V6) \ + err = MPI_Type_create_subarray (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#endif + +#define CHK_ERR_WAITALL(V0, V1, V2) \ + err = MPI_Waitall (V0, V1, V2); \ + CHK_MPIERR +#define CHK_ERR_MPROBE(V0, V1, V2, V3, V4) \ + err = MPI_Mprobe (V0, V1, V2, V3, V4); \ + CHK_MPIERR + +#define CHK_ERR_GET_COUNT(V0, V1, V2) \ + err = MPI_Get_count (V0, V1, V2); \ + CHK_MPIERR + +#define CHK_ERR_IMRECV(V0, V1, V2, V3, V4) \ + err = MPI_Imrecv (V0, V1, V2, V3, V4); \ + CHK_MPIERR + +#ifdef PNETCDF_DEBUG +#define CHK_ERR_ISEND(V0, V1, V2, V3, V4, V5, V6) \ + assert (V1 >= 0); \ + err = MPI_Isend (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#else +#define CHK_ERR_ISEND(V0, V1, V2, V3, V4, V5, V6) \ + err = MPI_Isend (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#endif + +#ifdef PNETCDF_DEBUG +#define CHK_ERR_IRECV(V0, V1, V2, V3, V4, V5, V6) \ + assert (V1 >= 0); \ + err = MPI_Irecv (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#else +#define CHK_ERR_IRECV(V0, V1, V2, V3, V4, V5, V6) \ + err = MPI_Irecv (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#endif + +#define CHK_ERR_SET_VIEW(V0, V1, V2, V3, V4, V5) \ + err = MPI_File_set_view (V0, V1, V2, V3, V4, V5); \ + CHK_MPIERR + +#define CHK_ERR_READ_AT_ALL(V0, V1, V2, V3, V4, V5) \ + err = MPI_File_read_at_all (V0, V1, V2, V3, V4, V5); \ + CHK_MPIERR + +#define CHK_ERR_WRITE_AT_ALL(V0, V1, V2, V3, V4, V5) \ + err = MPI_File_write_at_all (V0, V1, V2, V3, V4, V5); \ + CHK_MPIERR + +#define CHK_ALLOC(V0) \ + if (V0 == NULL) { DEBUG_RETURN_ERROR (NC_ENOMEM) } + +typedef struct NC_chk_vector { + int esize; + int size; + int nalloc; + char *data; +} NC_chk_vector; + +// File +extern int ncchkioi_init (NC_chk *, int); +extern int ncchkioi_parse_var_info (NC_chk *); +extern int ncchkioi_var_list_init (NC_chk_var_list *); +extern int ncchkioi_var_list_free (NC_chk_var_list *); +extern int ncchkioi_var_list_add (NC_chk_var_list *); + +// Util +extern int ncchkioi_extract_hint (NC_chk *, MPI_Info); +extern int ncchkioi_export_hint (NC_chk *, MPI_Info); +extern MPI_Offset NC_Type_size (nc_type); +extern void ncchkioi_sort_file_offset (int, MPI_Aint *, MPI_Aint *, int *); +extern int ncchkioi_update_statistics (NC_chk *); +extern int ncchkioi_get_default_chunk_dim (NC_chk *); +extern int ncchkioi_subarray_off_len (int, int *, int *, int *, MPI_Offset *, int *); +extern void ncchkioi_idx_in_swapn (NC_chk_chunk_index_entry *, MPI_Offset); +#ifdef PNETCDF_PROFILING +extern void ncchkioi_print_profile (NC_chk *); +extern void ncchkioi_profile_add_time (NC_chk *ncchkp, int id, double t); +#endif + +// Misc +typedef struct ncchkioi_chunk_overlap_t { + MPI_Offset osize; + int rank; +} ncchkioi_chunk_overlap_t; +extern int ncchkioi_init_nvar_core_reduce (NC_chk *ncchkp, + int nvar, + NC_chk_var **varps, + int *rcnt, + int *roff, + MPI_Offset **starts, + MPI_Offset **counts); +extern int ncchkioi_calc_chunk_overlap (NC_chk *ncchkp, + NC_chk_var *varp, + int nreq, + MPI_Offset **starts, + MPI_Offset **counts, + ncchkioi_chunk_overlap_t *ocnt); +extern void ncchkioi_assign_chunk_owner (NC_chk *ncchkp, + NC_chk_var *varp, + ncchkioi_chunk_overlap_t *ocnt); +extern int ncchkioi_sync_ocnt_reduce (NC_chk *ncchkp, + int nchunk, + ncchkioi_chunk_overlap_t *ocnt, + ncchkioi_chunk_overlap_t *ocnt_all, + MPI_Request *req); +extern void ncchkioi_write_chunk_ocnt (NC_chk 
*ncchkp, + NC_chk_var *varp, + void *ocnt, + size_t ocnt_size); +extern int ncchkioi_calc_chunk_owner (NC_chk *, NC_chk_var *, int, MPI_Offset **, MPI_Offset **); +extern int ncchkioi_calc_chunk_owner_reduce ( + NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts); +extern int ncchkioi_calc_chunk_size (NC_chk *, NC_chk_var *, int, MPI_Offset **, MPI_Offset **); +extern int ncchkioiconvert (void *, void *, MPI_Datatype, MPI_Datatype, int); + +// Var +extern int ncchkioi_var_init (NC_chk *, NC_chk_var *, int, MPI_Offset **, MPI_Offset **); +extern int ncchkioi_load_var (NC_chk *, NC_chk_var *, int, int *); +extern int ncchkioi_load_var_bg (NC_chk *, NC_chk_var *, int, int *); +extern int ncchkioi_load_nvar (NC_chk *, int, int *, int *, int *); +extern int ncchkioi_load_nvar_bg (NC_chk *, int, int *, int *, int *); +extern int ncchkioi_save_var (NC_chk *, NC_chk_var *); +extern int ncchkioi_save_nvar (NC_chk *, int, int *); +extern void ncchkioi_var_free (NC_chk_var *); +extern int ncchkioi_var_resize (NC_chk *, NC_chk_var *); +extern int ncchkioi_init_nvar (NC_chk *, int, int *, int, int *); +extern int ncchkioi_resize_nvar (NC_chk *, int, int *, int, int *); + +// Cache +extern int ncchkioi_cache_alloc (NC_chk *, MPI_Offset, NC_chk_cache **); +extern void ncchkioi_cache_visit (NC_chk *, NC_chk_cache *); +extern void ncchkioi_cache_free (NC_chk *); + +// Chunks +extern int ncchkioi_chunk_itr_init ( + NC_chk_var *, const MPI_Offset *, const MPI_Offset *, MPI_Offset *, int *); +extern int ncchkioi_chunk_itr_next ( + NC_chk_var *, const MPI_Offset *, const MPI_Offset *, MPI_Offset *, int *); +extern MPI_Offset get_chunk_overlap ( + NC_chk_var *, MPI_Offset *, const MPI_Offset *, const MPI_Offset *, MPI_Offset *, MPI_Offset *); +extern int get_chunk_id (NC_chk_var *, MPI_Offset *); +extern int get_chunk_itr (NC_chk_var *, int, MPI_Offset *); +extern int ncchkioi_chunk_itr_init_ex (NC_chk_var *, + const MPI_Offset *, + const MPI_Offset *, + MPI_Offset *, + int *, + MPI_Offset *, + MPI_Offset *); +extern int ncchkioi_chunk_itr_next_ex (NC_chk_var *, + const MPI_Offset *, + const MPI_Offset *, + MPI_Offset *, + int *, + MPI_Offset *, + MPI_Offset *); + +// Get +// extern int ncchkioi_get_var_old(NC_chk*, NC_chk_var*, MPI_Offset*, MPI_Offset*, MPI_Offset*, +// void*); +extern int ncchkioi_get_var_cb_chunk ( + NC_chk *, NC_chk_var *, const MPI_Offset *, const MPI_Offset *, const MPI_Offset *, void *); +extern int ncchkioi_get_var_cb_proc ( + NC_chk *, NC_chk_var *, const MPI_Offset *, const MPI_Offset *, const MPI_Offset *, void *); +extern int ncchkioi_get_varn ( + NC_chk *, NC_chk_var *, int, MPI_Offset *const *, MPI_Offset *const *, const void *); +extern int ncchkioi_get_varn_cb_chunk (NC_chk *, + NC_chk_var *, + int, + MPI_Offset *const *, + MPI_Offset *const *, + MPI_Offset *const *, + void **); +extern int ncchkioi_get_varn_cb_proc ( + NC_chk *, NC_chk_var *, int, MPI_Offset *const *, MPI_Offset *const *, void **); +extern int ncchkioi_iget_var (NC_chk *, + int, + const MPI_Offset *, + const MPI_Offset *, + const MPI_Offset *, + const MPI_Offset *, + void *, + MPI_Offset, + MPI_Datatype, + int *); +extern int ncchkioi_iget_varn (NC_chk *, + int, + int, + MPI_Offset *const *, + MPI_Offset *const *, + void *, + MPI_Offset, + MPI_Datatype, + int *); +extern int ncchkioi_iget_cb_chunk (NC_chk *, int, int *, int *); +extern int ncchkioi_iget_cb_proc (NC_chk *, int, int *, int *); + +// Put +// extern int ncchkioi_put_var_old(NC_chk*, NC_chk_var*, const MPI_Offset*, 
const MPI_Offset*, const +// MPI_Offset*, void*); +extern int ncchkioi_put_var ( + NC_chk *, NC_chk_var *, const MPI_Offset *, const MPI_Offset *, const MPI_Offset *, void *); +extern int ncchkioi_put_var_cb_chunk ( + NC_chk *, NC_chk_var *, const MPI_Offset *, const MPI_Offset *, const MPI_Offset *, void *); +extern int ncchkioi_put_var_cb_proc ( + NC_chk *, NC_chk_var *, const MPI_Offset *, const MPI_Offset *, const MPI_Offset *, void *); +extern int ncchkioi_put_varn ( + NC_chk *, NC_chk_var *, int, MPI_Offset *const *, MPI_Offset *const *, const void *); +extern int ncchkioi_put_varn_cb_chunk (NC_chk *, + NC_chk_var *, + int, + MPI_Offset *const *, + MPI_Offset *const *, + MPI_Offset *const *, + void **); +extern int ncchkioi_put_varn_cb_proc ( + NC_chk *, NC_chk_var *, int, MPI_Offset *const *, MPI_Offset *const *, void **); +extern int ncchkioi_iput_var (NC_chk *, + int, + const MPI_Offset *, + const MPI_Offset *, + const MPI_Offset *, + const void *, + const void *, + int *); +extern int ncchkioi_iput_varn (NC_chk *, + int, + int, + MPI_Offset *const *, + MPI_Offset *const *, + const void *, + const void *, + int *); +extern int ncchkioi_iput_cb_chunk (NC_chk *, int, int *, int *); +extern int ncchkioi_iput_cb_proc (NC_chk *, int, int *, int *); + +// Nonblocking +extern int ncchkioi_req_list_init (NC_chk_req_list *); +extern int ncchkioi_req_list_free (NC_chk_req_list *); +extern int ncchkioi_req_list_add (NC_chk_req_list *, int *); +extern int ncchkioi_req_list_remove (NC_chk_req_list *, int); +extern int ncchkioi_wait_put_reqs (NC_chk *, int, int *, int *); +extern int ncchkioi_wait_get_reqs (NC_chk *, int, int *, int *); +extern int ncchkioi_wait (NC_chk *, int, int *, int *, int); + +// Vector +extern int ncchkioi_vector_init (NC_chk_vector *, int); +extern int ncchkioi_vector_init_ex (NC_chk_vector *, int, int); +extern void ncchkioi_vector_free (NC_chk_vector *); +extern int ncchkioi_vector_append (NC_chk_vector *, void *); +#endif diff --git a/src/drivers/ncchunkio/ncchkio_var.c b/src/drivers/ncchunkio/ncchkio_var.c new file mode 100644 index 000000000..1e65d6d5a --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_var.c @@ -0,0 +1,1125 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ *
+ * ncmpi_def_var()              : dispatcher->def_var()
+ * ncmpi_inq_varid()            : dispatcher->inq_varid()
+ * ncmpi_inq_var()              : dispatcher->inq_var()
+ * ncmpi_rename_var()           : dispatcher->rename_var()
+ *
+ * ncmpi_get_var()              : dispatcher->get_var()
+ * ncmpi_put_var()              : dispatcher->put_var()
+ * ncmpi_get_var_<type>()       : dispatcher->get_var()
+ * ncmpi_put_var_<type>()       : dispatcher->put_var()
+ * ncmpi_get_var_all()          : dispatcher->get_var()
+ * ncmpi_put_var_all()          : dispatcher->put_var()
+ * ncmpi_get_var_<type>_all()   : dispatcher->get_var()
+ * ncmpi_put_var_<type>_all()   : dispatcher->put_var()
+ *
+ * ncmpi_iget_var()             : dispatcher->iget_var()
+ * ncmpi_iput_var()             : dispatcher->iput_var()
+ * ncmpi_iget_var_<type>()      : dispatcher->iget_var()
+ * ncmpi_iput_var_<type>()      : dispatcher->iput_var()
+ *
+ * ncmpi_buffer_attach()        : dispatcher->buffer_attach()
+ * ncmpi_buffer_detach()        : dispatcher->buffer_detach()
+ * ncmpi_bput_var_<type>()      : dispatcher->bput_var()
+ *
+ * ncmpi_get_varn_<type>()      : dispatcher->get_varn()
+ * ncmpi_put_varn_<type>()      : dispatcher->put_varn()
+ *
+ * ncmpi_iget_varn_<type>()     : dispatcher->iget_varn()
+ * ncmpi_iput_varn_<type>()     : dispatcher->iput_varn()
+ * ncmpi_bput_varn_<type>()     : dispatcher->bput_varn()
+ *
+ * ncmpi_get_vard()             : dispatcher->get_vard()
+ * ncmpi_put_vard()             : dispatcher->put_vard()
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <mpi.h>
+#include <pnc_debug.h>
+#include <common.h>
+
+#include "ncmpio_NC.h"
+#include "ncchkio_internal.h"
+
+int ncchkio_def_var (
+    void *ncdp, const char *name, nc_type xtype, int ndims, const int *dimids, int *varidp) {
+    int i, err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+    NC_chk_var *varp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT)
+
+    err = ncchkioi_var_list_add (&(ncchkp->vars));
+    if (err < 0) return err;
+    *varidp = err;
+
+    varp = ncchkp->vars.data + (*varidp);
+
+    varp->ndim        = ndims;
+    varp->chunkdim    = NULL;
+    varp->chunk_index = NULL;
+    varp->chunk_owner = NULL;
+    varp->xtype       = xtype;
+    varp->esize       = NC_Type_size (xtype);
+    varp->etype       = ncmpii_nc2mpitype (xtype);
+    varp->isnew       = 1;
+    varp->expanded    = 0;
+
+    if (ndims < 1) { // Do not compress scalars
+        varp->varkind = NC_CHK_VAR_RAW;
+        varp->dimsize = NULL;
+
+        err = ncchkp->driver->def_var (ncchkp->ncp, name, xtype, ndims, dimids, &varp->varid);
+        if (err != NC_NOERR) return err;
+
+        err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_varkind", NC_INT, 1,
+                                       &(varp->varkind), MPI_INT); // Compressed var?
+        if (err != NC_NOERR) return err;
+    } else {
+        err = ncchkp->driver->def_var (ncchkp->ncp, name, xtype, 0, NULL,
+                                       &varp->varid); // Dummy var for attrs
+        if (err != NC_NOERR) return err;
+
+        varp->varkind = NC_CHK_VAR_COMPRESSED;
+        varp->dimids  = (int *)NCI_Malloc (sizeof (int) * ndims);
+        memcpy (varp->dimids, dimids, sizeof (int) * ndims);
+        varp->dimsize = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ndims);
+        for (i = 0; i < ndims; i++) {
+            ncchkp->driver->inq_dim (ncchkp->ncp, dimids[i], NULL, varp->dimsize + i);
+        }
+        if (varp->dimids[0] == ncchkp->recdim) {
+            varp->isrec = 1;
+        } else {
+            varp->isrec = 0;
+        }
+
+        err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_ndim", NC_INT, 1, &ndims,
+                                       MPI_INT); // Original dimensions
+        if (err != NC_NOERR) return err;
+        err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_dimids", NC_INT, ndims, dimids,
+                                       MPI_INT); // Dimension IDs
+        if (err != NC_NOERR) return err;
+        err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_datatype", NC_INT, 1, &xtype,
+                                       MPI_INT); // Original datatype
+        if (err != NC_NOERR) return err;
+        err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_varkind", NC_INT, 1,
+                                       &(varp->varkind), MPI_INT); // Compressed var?
+        if (err != NC_NOERR) return err;
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT)
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    return NC_NOERR;
+}
+
+int ncchkio_inq_varid (void *ncdp, const char *name, int *varid) {
+    int i, vid, err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    err = ncchkp->driver->inq_varid (ncchkp->ncp, name, &vid);
+    if (err != NC_NOERR) return err;
+
+    if (varid != NULL) {
+        for (i = 0; i < ncchkp->vars.cnt; i++) {
+            if (ncchkp->vars.data[i].varid == vid) {
+                *varid = i;
+                break;
+            }
+        }
+        if (i >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_ENOTVAR) }
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    return NC_NOERR;
+}
+
+int ncchkio_inq_var (void *ncdp,
+                     int varid,
+                     char *name,
+                     nc_type *xtypep,
+                     int *ndimsp,
+                     int *dimids,
+                     int *nattsp,
+                     MPI_Offset *offsetp,
+                     int *no_fillp,
+                     void *fill_valuep) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+    NC_chk_var *varp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+
+    varp = ncchkp->vars.data + varid;
+
+    err = ncchkp->driver->inq_var (ncchkp->ncp, varp->varid, name, xtypep, NULL, NULL, nattsp,
+                                   offsetp, no_fillp, fill_valuep);
+    if (err != NC_NOERR) return err;
+
+    if (ndimsp != NULL) { *ndimsp = varp->ndim; }
+
+    if (dimids != NULL) { memcpy (dimids, varp->dimids, sizeof (int) * varp->ndim); }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    return NC_NOERR;
+}
+
+int ncchkio_rename_var (void *ncdp, int varid, const char *newname) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+    NC_chk_var *varp;
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    err = ncchkp->driver->rename_var (ncchkp->ncp, varp->varid, newname);
+    if (err != NC_NOERR) return err;
+
+    return NC_NOERR;
+}
+
+int ncchkio_get_var (void *ncdp,
+                     int varid,
+                     const MPI_Offset *start,
+                     const MPI_Offset *count,
+                     const MPI_Offset *stride,
+                     const MPI_Offset *imap,
+                     void *buf,
+                     MPI_Offset bufcount,
+                     MPI_Datatype buftype,
+                     int reqMode) {
+    int err=NC_NOERR, status = NC_NOERR, ret;
+    void *cbuf = (void *)buf;
+    void *xbuf = (void *)buf;
+    MPI_Offset nelem;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
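+    /* Read path: raw variables are forwarded directly to the underlying
+     * driver. For chunked variables, a delayed initialization may first be
+     * performed and the chunk index table loaded from the "_metaoffset"
+     * attribute; the request is then served by a chunk-level or process-level
+     * collective callback, with element-type conversion if buftype differs. */
+    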
NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET) + + if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); } + varp = ncchkp->vars.data + varid; + + if (varp->varkind == NC_CHK_VAR_RAW) { + return ncchkp->driver->get_var (ncchkp->ncp, varp->varid, start, count, stride, imap, buf, + bufcount, buftype, reqMode); + } + + if (ncchkp->delay_init && (varp->chunkdim == NULL)) { + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + err = ncchkioi_var_init (ncchkp, varp, 1, (MPI_Offset **)&start, (MPI_Offset **)&count); + CHK_ERR + + if (!(varp->isnew)) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset", + &(varp->metaoff), MPI_LONG_LONG); + if (ret == NC_NOERR) { // Read index table + MPI_Status status; + + // Set file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, + ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, MPI_BYTE, "native", + MPI_INFO_NULL); + // Read data + CHK_ERR_READ_AT_ALL ( + ((NC *)(ncchkp->ncp))->collective_fh, varp->metaoff, varp->chunk_index, + sizeof (NC_chk_chunk_index_entry) * varp->nchunk, MPI_BYTE, &status); + } else { + varp->metaoff = -1; + memset (varp->chunk_index, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET) + } + + if (varp->isrec && (varp->dimsize[0] < ncchkp->recsize) && + (start[0] + count[0] >= varp->dimsize[0])) { + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_RESIZE) + + err = ncchkioi_var_resize (ncchkp, varp); + CHK_ERR + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_RESIZE) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET) + } + + if (buftype != varp->etype) { + int i; + + nelem = 1; + for (i = 0; i < varp->ndim; i++) { nelem *= count[i]; } + + xbuf = (char *)NCI_Malloc (nelem * varp->esize); + CHK_PTR (xbuf) + } else { + xbuf = cbuf; + } + + // Collective buffer + switch (ncchkp->comm_unit) { + case NC_CHK_COMM_CHUNK: + err = ncchkioi_get_var_cb_chunk (ncchkp, varp, start, count, stride, xbuf); + break; + case NC_CHK_COMM_PROC: + err = ncchkioi_get_var_cb_proc (ncchkp, varp, start, count, stride, xbuf); + break; + } + CHK_ERR + + if (buftype != varp->etype) { + err = ncchkioiconvert (xbuf, cbuf, varp->etype, buftype, nelem); + if (err != NC_NOERR) return err; + } + + if (xbuf != cbuf) NCI_Free (xbuf); + if (cbuf != buf) NCI_Free (cbuf); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + +err_out:; + if (status == NC_NOERR) status = err; + return status; /* first error encountered */ +} + +int ncchkio_put_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode) { + int err=NC_NOERR, ret; + void *cbuf = (void *)buf; + void *xbuf = (void *)buf; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT) + + if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); } + + if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); } + varp = ncchkp->vars.data + varid; + + if (varp->varkind == NC_CHK_VAR_RAW) { + return ncchkp->driver->put_var (ncchkp->ncp, varp->varid, start, count, stride, imap, buf, + bufcount, buftype, 
reqMode); + } + + if (ncchkp->delay_init && (varp->chunkdim == NULL)) { + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + err = ncchkioi_var_init (ncchkp, varp, 1, (MPI_Offset **)&start, (MPI_Offset **)&count); + CHK_ERR + + if (!(varp->isnew)) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset", + &(varp->metaoff), MPI_LONG_LONG); + if (ret == NC_NOERR) { // Read index table + MPI_Status status; + + // Set file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, + ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, MPI_BYTE, "native", + MPI_INFO_NULL); + // Read data + CHK_ERR_READ_AT_ALL ( + ((NC *)(ncchkp->ncp))->collective_fh, varp->metaoff, varp->chunk_index, + sizeof (NC_chk_chunk_index_entry) * varp->nchunk, MPI_BYTE, &status); + } else { + varp->metaoff = -1; + memset (varp->chunk_index, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT) + } + + if (imap != NULL || bufcount != -1) { + /* pack buf to cbuf -------------------------------------------------*/ + /* If called from a true varm API or a flexible API, ncmpii_pack() + * packs user buf into a contiguous cbuf (need to be freed later). + * Otherwise, cbuf is simply set to buf. ncmpii_pack() also returns + * etype (MPI primitive datatype in buftype), and nelems (number of + * etypes in buftype * bufcount) + */ + int ndims; + MPI_Offset nelems; + MPI_Datatype etype; + + err = ncchkp->driver->inq_var (ncchkp->ncp, varid, NULL, NULL, &ndims, NULL, NULL, NULL, + NULL, NULL); + if (err != NC_NOERR) goto err_check; + + err = ncmpii_pack (ndims, count, imap, (void *)buf, bufcount, buftype, &nelems, &etype, + &cbuf); + if (err != NC_NOERR) goto err_check; + + imap = NULL; + bufcount = (nelems == 0) ? 
0 : -1; /* make it a high-level API */
+        buftype  = etype;              /* an MPI primitive type */
+    }
+
+err_check:
+    if (err != NC_NOERR) {
+        if (reqMode & NC_REQ_INDEP) return err;
+        reqMode |= NC_REQ_ZERO; /* participate collective call */
+    }
+
+    if (buftype != varp->etype) {
+        int i;
+        MPI_Offset nelem;
+
+        nelem = 1;
+        for (i = 0; i < varp->ndim; i++) { nelem *= count[i]; }
+
+        xbuf = (char *)NCI_Malloc (nelem * varp->esize);
+        CHK_PTR (xbuf)
+        err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem);
+        if (err != NC_NOERR) return err;
+    } else {
+        xbuf = cbuf;
+    }
+
+    err = ncchkioi_put_var (ncchkp, varp, start, count, stride, xbuf);
+    CHK_ERR
+
+    if (xbuf != cbuf) NCI_Free (xbuf);
+    if (cbuf != buf) NCI_Free (cbuf);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT)
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+err_out:;
+    return err; /* first error encountered */
+}
+
+int ncchkio_iget_var (void *ncdp,
+                      int varid,
+                      const MPI_Offset *start,
+                      const MPI_Offset *count,
+                      const MPI_Offset *stride,
+                      const MPI_Offset *imap,
+                      void *buf,
+                      MPI_Offset bufcount,
+                      MPI_Datatype buftype,
+                      int *reqid,
+                      int reqMode) {
+    int err=NC_NOERR;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_IGET)
+
+    if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); }
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        err = ncchkp->driver->iget_var (ncchkp->ncp, varp->varid, start, count, stride, imap, buf,
+                                        bufcount, buftype, reqid, reqMode);
+        if (err != NC_NOERR) { return err; }
+        if (reqid != NULL) { *reqid = *reqid * 2 + 1; }
+        return NC_NOERR;
+    }
+
+    err = ncchkioi_iget_var (ncchkp, varid, start, count, stride, imap, buf, bufcount, buftype,
+                             reqid);
+    if (err != NC_NOERR) { return err; }
+    if (reqid != NULL) { (*reqid) *= 2; }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_IGET)
+
+    return NC_NOERR;
+}
+
+int ncchkio_iput_var (void *ncdp,
+                      int varid,
+                      const MPI_Offset *start,
+                      const MPI_Offset *count,
+                      const MPI_Offset *stride,
+                      const MPI_Offset *imap,
+                      const void *buf,
+                      MPI_Offset bufcount,
+                      MPI_Datatype buftype,
+                      int *reqid,
+                      int reqMode) {
+    int err=NC_NOERR;
+    void *cbuf = (void *)buf;
+    void *xbuf = (void *)buf;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_IPUT)
+
+    if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); }
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        err = ncchkp->driver->iput_var (ncchkp->ncp, varp->varid, start, count, stride, imap, buf,
+                                        bufcount, buftype, reqid, reqMode);
+        if (err != NC_NOERR) { return err; }
+        if (reqid != NULL) { *reqid = *reqid * 2 + 1; }
+        return NC_NOERR;
+    }
+
+    if (varp->isrec) {
+        if (ncchkp->recsize < start[0] + count[0]) { ncchkp->recsize = start[0] + count[0]; }
+    }
+
+    if (imap != NULL || bufcount != -1) {
+        /* pack buf to cbuf -------------------------------------------------*/
+        /* If called from a true varm API or a flexible API, ncmpii_pack()
+         * packs user buf into a contiguous cbuf (need to be freed later).
+         * Otherwise, cbuf is simply set to buf. 
ncmpii_pack() also returns + * etype (MPI primitive datatype in buftype), and nelems (number of + * etypes in buftype * bufcount) + */ + int ndims; + MPI_Offset nelems; + MPI_Datatype etype; + + err = ncchkp->driver->inq_var (ncchkp->ncp, varid, NULL, NULL, &ndims, NULL, NULL, NULL, + NULL, NULL); + if (err != NC_NOERR) goto err_check; + + err = ncmpii_pack (ndims, count, imap, (void *)buf, bufcount, buftype, &nelems, &etype, + &cbuf); + if (err != NC_NOERR) goto err_check; + + imap = NULL; + bufcount = (nelems == 0) ? 0 : -1; /* make it a high-level API */ + buftype = etype; /* an MPI primitive type */ + } + +err_check: + if (err != NC_NOERR) { + if (reqMode & NC_REQ_INDEP) return err; + reqMode |= NC_REQ_ZERO; /* participate collective call */ + } + + if (buftype != varp->etype) { + int i; + MPI_Offset nelem; + + nelem = 1; + for (i = 0; i < varp->ndim; i++) { nelem *= count[i]; } + + xbuf = (char *)NCI_Malloc (nelem * varp->esize); + CHK_PTR (xbuf) + err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem); + if (err != NC_NOERR) return err; + } else { + xbuf = cbuf; + } + + err = ncchkioi_iput_var (ncchkp, varid, start, count, stride, xbuf, buf, reqid); + if (reqid != NULL) { (*reqid) *= 2; } + + if (cbuf != buf && cbuf != xbuf) NCI_Free (cbuf); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_IPUT) + +err_out:; + return err; +} + +int ncchkio_buffer_attach (void *ncdp, MPI_Offset bufsize) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->buffer_attach (ncchkp->ncp, bufsize); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_buffer_detach (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->buffer_detach (ncchkp->ncp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_bput_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode) { + int err=NC_NOERR; + int i; + void *cbuf = (void *)buf; + void *xbuf; + MPI_Offset nelem; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_START (NC_CHK_TIMER_IPUT) + + if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); } + + if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); } + varp = ncchkp->vars.data + varid; + + if (varp->varkind == NC_CHK_VAR_RAW) { + err = ncchkp->driver->bput_var (ncchkp->ncp, varp->varid, start, count, stride, imap, buf, + bufcount, buftype, reqid, reqMode); + if (err != NC_NOERR) { return err; } + if (reqid != NULL) { *reqid = *reqid * 2 + 1; } + return NC_NOERR; + } + + if (varp->isrec) { + if (ncchkp->recsize < start[0] + count[0]) { ncchkp->recsize = start[0] + count[0]; } + } + + if (imap != NULL || bufcount != -1) { + /* pack buf to cbuf -------------------------------------------------*/ + /* If called from a true varm API or a flexible API, ncmpii_pack() + * packs user buf into a contiguous cbuf (need to be freed later). + * Otherwise, cbuf is simply set to buf. 
ncmpii_pack() also returns + * etype (MPI primitive datatype in buftype), and nelems (number of + * etypes in buftype * bufcount) + */ + int ndims; + MPI_Offset nelems; + MPI_Datatype etype; + + err = ncchkp->driver->inq_var (ncchkp->ncp, varid, NULL, NULL, &ndims, NULL, NULL, NULL, + NULL, NULL); + if (err != NC_NOERR) goto err_check; + + err = ncmpii_pack (ndims, count, imap, (void *)buf, bufcount, buftype, &nelems, &etype, + &cbuf); + if (err != NC_NOERR) goto err_check; + + imap = NULL; + bufcount = (nelems == 0) ? 0 : -1; /* make it a high-level API */ + buftype = etype; /* an MPI primitive type */ + } + +err_check: + if (err != NC_NOERR) { + if (reqMode & NC_REQ_INDEP) return err; + reqMode |= NC_REQ_ZERO; /* participate collective call */ + } + + nelem = 1; + for (i = 0; i < varp->ndim; i++) { nelem *= count[i]; } + + xbuf = (char *)NCI_Malloc (nelem * varp->esize); + CHK_PTR (xbuf) + + if (buftype != varp->etype) { + err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem); + if (err != NC_NOERR) return err; + } else { + memcpy (xbuf, cbuf, varp->esize * nelem); + } + + err = ncchkioi_iput_var (ncchkp, varid, start, count, stride, xbuf, buf, reqid); + CHK_ERR + if (reqid != NULL) { (*reqid) *= 2; } + + if (cbuf != buf) NCI_Free (cbuf); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_IPUT) + +err_out:; + return err; +} +int ncchkio_get_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode) { + int err=NC_NOERR, ret; + int i; + void *cbuf = (void *)buf; + void *xbuf = (void *)buf; + MPI_Offset nelem; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET) + + if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); } + + if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); } + varp = ncchkp->vars.data + varid; + + if (varp->varkind == NC_CHK_VAR_RAW) { + return ncchkp->driver->get_varn (ncchkp->ncp, varp->varid, num, starts, counts, buf, + bufcount, buftype, reqMode); + } + + if (ncchkp->delay_init && (varp->chunkdim == NULL)) { + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + err = ncchkioi_var_init (ncchkp, varp, num, (MPI_Offset **)starts, (MPI_Offset **)counts); + CHK_ERR + + if (!(varp->isnew)) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset", + &(varp->metaoff), MPI_LONG_LONG); + if (ret == NC_NOERR) { // Read index table + MPI_Status status; + + // Set file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, + ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, MPI_BYTE, "native", + MPI_INFO_NULL); + // Read data + CHK_ERR_READ_AT_ALL ( + ((NC *)(ncchkp->ncp))->collective_fh, varp->metaoff, varp->chunk_index, + sizeof (NC_chk_chunk_index_entry) * varp->nchunk, MPI_BYTE, &status); + } else { + varp->metaoff = -1; + memset (varp->chunk_index, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET) + } + + if (varp->isrec && (varp->dimsize[0] < ncchkp->recsize)) { + for (i = 0; i < num; i++) { + if (starts[i][0] + counts[i][0] >= varp->dimsize[0]) { + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_RESIZE) + + err = 
ncchkioi_var_resize (ncchkp, varp); + CHK_ERR + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_RESIZE) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET) + + break; + } + } + } + + if (buftype != varp->etype) { + int j; + MPI_Offset tmp; + + nelem = 0; + for (i = 0; i < num; i++) { + tmp = 1; + for (j = 0; j < varp->ndim; j++) { tmp *= counts[i][j]; } + nelem += tmp; + } + + xbuf = (char *)NCI_Malloc (nelem * varp->esize); + CHK_PTR (xbuf) + } else { + xbuf = cbuf; + } + + err = ncchkioi_get_varn (ncchkp, varp, num, starts, counts, xbuf); + if (err != NC_NOERR) return err; + + if (buftype != varp->etype) { + err = ncchkioiconvert (xbuf, cbuf, varp->etype, buftype, nelem); + if (err != NC_NOERR) return err; + } + + if (xbuf != cbuf) NCI_Free (xbuf); + if (cbuf != buf) NCI_Free (cbuf); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + +err_out:; + return err; +} + +int ncchkio_put_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode) { + int err=NC_NOERR, ret; + void *cbuf = (void *)buf; + void *xbuf = (void *)buf; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT) + + if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); } + + if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); } + varp = ncchkp->vars.data + varid; + + if (varp->varkind == NC_CHK_VAR_RAW) { + return ncchkp->driver->put_varn (ncchkp->ncp, varp->varid, num, starts, counts, buf, + bufcount, buftype, reqMode); + } + + if (ncchkp->delay_init && (varp->chunkdim == NULL)) { + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + err = ncchkioi_var_init (ncchkp, varp, num, (MPI_Offset **)starts, (MPI_Offset **)counts); + CHK_ERR + if (!(varp->isnew)) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset", + &(varp->metaoff), MPI_LONG_LONG); + if (ret == NC_NOERR) { // Read index table + MPI_Status status; + + // Set file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, + ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, MPI_BYTE, "native", + MPI_INFO_NULL); + // Read data + CHK_ERR_READ_AT_ALL ( + ((NC *)(ncchkp->ncp))->collective_fh, varp->metaoff, varp->chunk_index, + sizeof (NC_chk_chunk_index_entry) * varp->nchunk, MPI_BYTE, &status); + } else { + varp->metaoff = -1; + memset (varp->chunk_index, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT) + } + + if (buftype != varp->etype) { + int i, j; + MPI_Offset nelem, tmp; + + nelem = 0; + for (i = 0; i < num; i++) { + tmp = 1; + for (j = 0; j < varp->ndim; j++) { tmp *= counts[i][j]; } + nelem += tmp; + } + + xbuf = (char *)NCI_Malloc (nelem * varp->esize); + CHK_PTR (xbuf) + err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem); + if (err != NC_NOERR) return err; + } else { + xbuf = cbuf; + } + + err = ncchkioi_put_varn (ncchkp, varp, num, starts, counts, xbuf); + if (err != NC_NOERR) return err; + +err_out:; + if (xbuf != cbuf) NCI_Free (xbuf); + if (cbuf != buf) NCI_Free (cbuf); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + + return err; +} + +int ncchkio_iget_varn (void *ncdp, + int varid, + int num, + 
MPI_Offset *const *starts, + MPI_Offset *const *counts, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode) { + int err=NC_NOERR; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_START (NC_CHK_TIMER_IGET) + + if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); } + + if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); } + varp = ncchkp->vars.data + varid; + + if (varp->varkind == NC_CHK_VAR_RAW) { + err = ncchkp->driver->iget_varn (ncchkp->ncp, varp->varid, num, starts, counts, buf, + bufcount, buftype, reqid, reqMode); + if (err != NC_NOERR) { return err; } + if (reqid != NULL) { *reqid = *reqid * 2 + 1; } + return NC_NOERR; + } + + err = ncchkioi_iget_varn (ncchkp, varid, num, starts, counts, buf, bufcount, buftype, reqid); + if (err != NC_NOERR) return err; + if (reqid != NULL) { (*reqid) *= 2; } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_IGET) + + return NC_NOERR; +} + +int ncchkio_iput_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode) { + int err=NC_NOERR; + int i; + void *cbuf = (void *)buf; + void *xbuf = (void *)buf; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_START (NC_CHK_TIMER_IPUT) + + if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); } + + if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); } + varp = ncchkp->vars.data + varid; + + if (varp->isrec) { + for (i = 0; i < num; i++) { + if (ncchkp->recsize < starts[i][0] + counts[i][0]) { + ncchkp->recsize = starts[i][0] + counts[i][0]; + } + } + } + + if (varp->varkind == NC_CHK_VAR_RAW) { + err = ncchkp->driver->iput_varn (ncchkp->ncp, varp->varid, num, starts, counts, buf, + bufcount, buftype, reqid, reqMode); + if (err != NC_NOERR) { return err; } + if (reqid != NULL) { *reqid = *reqid * 2 + 1; } + return NC_NOERR; + } + + if (buftype != varp->etype) { + int j; + MPI_Offset nelem, tmp; + + nelem = 0; + for (i = 0; i < num; i++) { + tmp = 1; + for (j = 0; j < varp->ndim; j++) { tmp *= counts[i][j]; } + nelem += tmp; + } + + xbuf = (char *)NCI_Malloc (nelem * varp->esize); + err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem); + if (err != NC_NOERR) return err; + } else { + xbuf = cbuf; + } + + err = ncchkioi_iput_varn (ncchkp, varid, num, starts, counts, xbuf, buf, reqid); + if (err != NC_NOERR) return err; + if (reqid != NULL) { (*reqid) *= 2; } + + if (cbuf != buf && cbuf != xbuf) NCI_Free (cbuf); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_IPUT) + + return NC_NOERR; +} + +int ncchkio_bput_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode) { + int err=NC_NOERR; + int i, j; + void *cbuf = (void *)buf; + void *xbuf = (void *)buf; + MPI_Offset nelem, tmp; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_START (NC_CHK_TIMER_IPUT) + + if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); } + + if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); } + varp = ncchkp->vars.data + varid; + + if (varp->isrec) { + 
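+        // Record-variable writes may extend the record dimension; grow the shared
+        // record count eagerly so a later flush sizes the variable correctly. E.g.,
+        // a request with starts[i][0] = 10 and counts[i][0] = 5 advances recsize to 15.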
+        for (i = 0; i < num; i++) {
+            if (ncchkp->recsize < starts[i][0] + counts[i][0]) {
+                ncchkp->recsize = starts[i][0] + counts[i][0];
+            }
+        }
+    }
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        err = ncchkp->driver->bput_varn (ncchkp->ncp, varp->varid, num, starts, counts, buf,
+                                         bufcount, buftype, reqid, reqMode);
+        if (err != NC_NOERR) { return err; }
+        if (reqid != NULL) { *reqid = *reqid * 2 + 1; }
+        return NC_NOERR;
+    }
+
+    nelem = 0;
+    for (i = 0; i < num; i++) {
+        tmp = 1;
+        for (j = 0; j < varp->ndim; j++) { tmp *= counts[i][j]; }
+        nelem += tmp;
+    }
+    xbuf = (char *)NCI_Malloc (nelem * varp->esize);
+    if (xbuf == NULL) { DEBUG_RETURN_ERROR (NC_ENOMEM) }
+
+    if (buftype != varp->etype) {
+        err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem);
+        if (err != NC_NOERR) return err;
+    } else {
+        memcpy (xbuf, cbuf, nelem * varp->esize);
+    }
+
+    err = ncchkioi_iput_varn (ncchkp, varid, num, starts, counts, xbuf, buf, reqid);
+    if (err != NC_NOERR) return err;
+    if (reqid != NULL) { (*reqid) *= 2; }
+
+    if (cbuf != buf && cbuf != xbuf) NCI_Free (cbuf);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_IPUT)
+
+    return NC_NOERR;
+}
+
+int ncchkio_get_vard (void *ncdp,
+                      int varid,
+                      MPI_Datatype filetype,
+                      void *buf,
+                      MPI_Offset bufcount,
+                      MPI_Datatype buftype,
+                      int reqMode) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    // vard APIs are not supported on chunked files; the pass-through below is
+    // unreachable and kept only as a placeholder for future support
+    DEBUG_RETURN_ERROR (NC_ENOTSUPPORT);
+
+    err = ncchkp->driver->get_vard (ncchkp->ncp, varid, filetype, buf, bufcount, buftype, reqMode);
+    if (err != NC_NOERR) return err;
+
+    return NC_NOERR;
+}
+
+int ncchkio_put_vard (void *ncdp,
+                      int varid,
+                      MPI_Datatype filetype,
+                      const void *buf,
+                      MPI_Offset bufcount,
+                      MPI_Datatype buftype,
+                      int reqMode) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    // Same as ncchkio_get_vard: not supported for chunked files
+    DEBUG_RETURN_ERROR (NC_ENOTSUPPORT);
+
+    err = ncchkp->driver->put_vard (ncchkp->ncp, varid, filetype, buf, bufcount, buftype, reqMode);
+    if (err != NC_NOERR) return err;
+
+    return NC_NOERR;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_cache.c b/src/drivers/ncchunkio/ncchkioi_cache.c
new file mode 100644
index 000000000..89b9f360d
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_cache.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
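+ *
+ * This file implements a per-process cache of chunk buffers, kept in a doubly
+ * linked list ordered by last access. ncchkioi_cache_alloc() appends new
+ * entries at the tail and evicts from the head when the budget (cache_limit)
+ * would be exceeded; ncchkioi_cache_visit() moves an entry back to the tail
+ * on each access, so the head is always the least recently used entry.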
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mpi.h>
+
+#include <pnc_debug.h>
+#include <common.h>
+
+#include "ncchkio_internal.h"
+
+static int ncchkioi_cache_evict (NC_chk *ncchkp) {
+    int err=NC_NOERR;
+    NC_chk_cache *target;
+
+    target = ncchkp->cache_head;
+
+    // Cannot evict entries created in the current serial; they are still in use
+    if (target == NULL || target->serial >= ncchkp->cache_serial) {
+        printf ("Rank %d: Cache limit exceeded\n", ncchkp->rank);
+        RET_ERR(NC_ENOMEM)
+    }
+
+    // Remove from list head
+    ncchkp->cache_head = target->next;
+    if (ncchkp->cache_head != NULL) { ncchkp->cache_head->prev = NULL; }
+    if (ncchkp->cache_tail == target) { ncchkp->cache_tail = NULL; }
+
+    ncchkp->cache_used -= target->bsize;  // Return budget
+
+    *(target->ref) = NULL;  // Mark as evicted
+    NCI_Free (target->buf);
+    NCI_Free (target);
+
+err_out:;
+    return err;
+}
+
+int ncchkioi_cache_alloc (NC_chk *ncchkp, MPI_Offset size, NC_chk_cache **ref) {
+    int err=NC_NOERR;
+    NC_chk_cache *target;
+
+    // Evict cached data if no space
+    if (ncchkp->cache_limit > 0) {
+        while (ncchkp->cache_used + size > ncchkp->cache_limit) {
+            err = ncchkioi_cache_evict (ncchkp);
+            CHK_ERR
+        }
+    }
+    ncchkp->cache_used += size;
+
+    // Prepare cache entry
+    target = (NC_chk_cache *)NCI_Malloc (sizeof (NC_chk_cache));
+    if (target == NULL) { DEBUG_RETURN_ERROR (NC_ENOMEM) }
+    target->bsize  = size;
+    target->next   = NULL;
+    target->prev   = ncchkp->cache_tail;
+    target->ref    = ref;
+    target->serial = ncchkp->cache_serial;
+    target->buf    = NCI_Malloc (size);
+    if (target->buf == NULL) {
+        NCI_Free (target);
+        DEBUG_RETURN_ERROR (NC_ENOMEM)
+    }
+#ifdef PNETCDF_DEBUG
+    memset (target->buf, 0, size);
+#endif
+
+    // Insert at list tail
+    if (ncchkp->cache_tail != NULL) {
+        ncchkp->cache_tail->next = target;
+    } else {
+        ncchkp->cache_head = target;
+    }
+    ncchkp->cache_tail = target;
+
+    // Assign reference
+    *ref = target;
+
+err_out:;
+    return err;
+}
+
+void ncchkioi_cache_visit (NC_chk *ncchkp, NC_chk_cache *target) {
+    if (target != ncchkp->cache_tail) {
+        // Remove from list
+        if (target->prev != NULL) {
+            target->prev->next = target->next;
+        } else {
+            ncchkp->cache_head = target->next;  // target was the head
+        }
+        if (target->next != NULL) { target->next->prev = target->prev; }
+
+        // Re-insert at list tail
+        target->next             = NULL;
+        target->prev             = ncchkp->cache_tail;
+        ncchkp->cache_tail->next = target;
+        ncchkp->cache_tail       = target;
+    }
+}
+
+void ncchkioi_cache_free (NC_chk *ncchkp) {
+    NC_chk_cache *pre, *cur;
+
+    cur = ncchkp->cache_head;
+    while (cur != NULL) {
+        pre = cur;
+        cur = cur->next;
+        NCI_Free (pre->buf);
+        NCI_Free (pre);
+    }
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_chunk.c b/src/drivers/ncchunkio/ncchkioi_chunk.c
new file mode 100644
index 000000000..10cb45891
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_chunk.c
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+/*
+ * This file implements the following PnetCDF APIs.
+ *
+ * ncmpi_get_var_all()        : dispatcher->get_var()
+ * ncmpi_put_var_all()        : dispatcher->put_var()
+ * ncmpi_get_var<kind>_all()  : dispatcher->get_var()
+ * ncmpi_put_var<kind>_all()  : dispatcher->put_var()
+ *
+ * where <kind> denotes the API flavor (1, a, s, or m)
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mpi.h>
+
+#include <pnc_debug.h>
+#include <common.h>
+#include "ncchkio_internal.h"
+
+
+#define min(a,b) (((a)<(b))?(a):(b))
+#define max(a,b) (((a)>(b))?(a):(b))
+
+MPI_Offset get_chunk_overlap(NC_chk_var *varp, MPI_Offset* cord, const MPI_Offset *start, const MPI_Offset *count, MPI_Offset *ostart, MPI_Offset *ocount){
+    int i;
+    MPI_Offset ret = varp->esize;
+
+    for(i = 0; i < varp->ndim; i++){
+        ostart[i] = max(start[i], cord[i]);
+        ocount[i] = min(start[i] + count[i], cord[i] + varp->chunkdim[i]) - ostart[i];
+        if (ocount[i] <= 0){
+            ocount[i] = 0;
+        }
+        ret *= ocount[i];
+    }
+
+    return ret;
+}
+
+int get_chunk_id(NC_chk_var *varp, MPI_Offset *cord){
+    int i, ret;
+
+    ret = (int)(cord[0]) / varp->chunkdim[0];
+    for(i = 1; i < varp->ndim; i++){
+        ret = ret * varp->nchunks[i] + (int)(cord[i]) / varp->chunkdim[i];
+    }
+
+    return ret;
+}
+
+int get_chunk_itr(NC_chk_var *varp, int idx, MPI_Offset* cord){
+    int i;
+
+    for(i = varp->ndim - 1; i >= 0; i--){
+        cord[i] = (idx % varp->nchunks[i]) * varp->chunkdim[i];
+        idx /= varp->nchunks[i];
+    }
+
+    return 0;
+}
+
+int ncchkioi_chunk_itr_init(NC_chk_var *varp, const MPI_Offset *start, const MPI_Offset *count, MPI_Offset *citr, int *cid){
+    int i;
+
+    *cid = 0;
+    for(i = 0; i < varp->ndim; i++){
+        citr[i] = start[i] - (start[i] % varp->chunkdim[i]);
+        *cid += citr[i] / varp->chunkdim[i] * varp->cidsteps[i];
+    }
+
+    return NC_NOERR;
+}
+
+int ncchkioi_chunk_itr_next(NC_chk_var *varp, const MPI_Offset *start, const MPI_Offset *count, MPI_Offset *citr, int *cid){
+    int i, j;
+
+    i = varp->ndim - 1;
+    citr[i] += varp->chunkdim[i];
+    (*cid)++;
+    for(; i > 0; i--){
+        if (citr[i] >= start[i] + count[i]){
+            citr[i - 1] += varp->chunkdim[i - 1];
+            j = citr[i];
+            citr[i] = start[i] - (start[i] % varp->chunkdim[i]);
+            *cid += varp->cidsteps[i - 1] - varp->cidsteps[i] * (j - citr[i]) / varp->chunkdim[i];
+        }
+        else{
+            break;
+        }
+    }
+
+    if (citr[0] >= start[0] + count[0]){
+        return 0;
+    }
+
+    return 1;
+}
+
+int ncchkioi_chunk_itr_init_ex(NC_chk_var *varp, const MPI_Offset *start, const MPI_Offset *count, MPI_Offset *citr, int *cid, MPI_Offset *ostart, MPI_Offset *ocount){
+    int i;
+
+    *cid = 0;
+    for(i = 0; i < varp->ndim; i++){
+        citr[i] = start[i] - (start[i] % varp->chunkdim[i]);
+        *cid += citr[i] / varp->chunkdim[i] * varp->cidsteps[i];
+        ostart[i] = start[i];
+        ocount[i] = min(count[i], citr[i] + varp->chunkdim[i] - ostart[i]);
+    }
+
+    return NC_NOERR;
+}
+
+int ncchkioi_chunk_itr_next_ex(NC_chk_var *varp, const MPI_Offset *start, const MPI_Offset *count, MPI_Offset *citr, int *cid, MPI_Offset *ostart, MPI_Offset *ocount){
+    int i, j;
+
+    i = varp->ndim - 1;
+    citr[i] += varp->chunkdim[i];
+
+    (*cid)++;
+    for(; i > 0; i--){
+        if (citr[i] >= start[i] + count[i]){
+            citr[i - 1] += varp->chunkdim[i - 1];
+            ostart[i - 1] += ocount[i - 1];
+            ocount[i - 1] = min(varp->chunkdim[i - 1], start[i - 1] + count[i - 1] - ostart[i - 1]);
+            j = citr[i];
+            citr[i] = start[i] - (start[i] % varp->chunkdim[i]);
+            ostart[i] = start[i];
+            ocount[i] = min(count[i], citr[i] + varp->chunkdim[i] - ostart[i]);
+            *cid += varp->cidsteps[i - 1] - varp->cidsteps[i] * (j - citr[i]) / varp->chunkdim[i];
+        }
+        else{
+            break;
+        }
+    }
+
+    if (citr[0] >= start[0] + count[0]){
+        return 0;
+    }
+
+    if (i == varp->ndim - 1){
+        ostart[i] += ocount[i];
+        ocount[i] = min(varp->chunkdim[i], start[i] + count[i] - ostart[i]);
+        for (i++; i < varp->ndim; i++) {
+            ostart[i] = start[i];
+            ocount[i] = min(count[i], citr[i] + varp->chunkdim[i] - ostart[i]);
+        }
+    }
+
+    return 1;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_chunk_owner.c b/src/drivers/ncchunkio/ncchkioi_chunk_owner.c
new file mode 100644
index 000000000..7e0db237f
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_chunk_owner.c
@@ -0,0 +1,621 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mpi.h>
+
+#include <pnc_debug.h>
+#include <common.h>
+
+#include "ncchkio_internal.h"
+
+void ncchkioi_write_chunk_ocnt (NC_chk *ncchkp, NC_chk_var *varp, void *ocnt, size_t ocnt_size) {
+#ifdef PNETCDF_PROFILING
+    {
+        int i, j;
+        char *pprefix = getenv ("PNETCDF_OWNER_PREFIX");
+
+        if (pprefix != NULL) {
+            if (ncchkp->rank == 0) {
+                void *ocnt_in;
+                int *cown;
+                MPI_Status stat;
+                FILE *pfile;
+                char fname[1024], ppath[1024];
+
+                ocnt_in = NCI_Malloc (ocnt_size * varp->nchunkrec);
+                cown    = NCI_Malloc (sizeof (int) * varp->nchunkrec);
+
+                // Derive the dataset name from the file path: strip the suffix
+                // and any leading directories
+                strcpy (fname, ncchkp->path);
+                for (i = strlen (fname); i > 0; i--) {
+                    if (fname[i] == '.') {
+                        fname[i] = '\0';
+                    } else if (fname[i] == '\\' || fname[i] == '/') {
+                        i++;
+                        break;
+                    }
+                }
+                sprintf (ppath, "%s%s_owner.csv", pprefix, fname + i);
+                pfile = fopen (ppath, "a");
+
+                fprintf (pfile, "Var:, %d\n", varp->varid);
+                fprintf (pfile, "Rank\\Chunk, ");
+                for (j = 0; j < varp->nchunkrec; j++) { fprintf (pfile, "%d, ", j); }
+                fprintf (pfile, "\nOwner, ");
+                for (j = 0; j < varp->nchunk; j++) {
+                    fprintf (pfile, "%d, ", varp->chunk_owner[j]);
+                }
+                fprintf (pfile, "\n0, ");
+                if (ocnt_size == sizeof (MPI_Offset)) {
+                    for (j = 0; j < varp->nchunkrec; j++) {
+                        fprintf (pfile, "%lld, ", ((MPI_Offset *)ocnt)[j]);
+                    }
+                } else {
+                    for (j = 0; j < varp->nchunkrec; j++) {
+                        fprintf (pfile, "%lld, ", ((ncchkioi_chunk_overlap_t *)ocnt)[j].osize);
+                    }
+                }
+                fprintf (pfile, "\n");
+                for (i = 1; i < ncchkp->np; i++) {
+                    if (ocnt_size == sizeof (MPI_Offset)) {
+                        MPI_Recv (ocnt_in, varp->nchunkrec, MPI_LONG_LONG, i, 0, ncchkp->comm,
+                                  &stat);
+                        fprintf (pfile, "%d, ", i);
+                        for (j = 0; j < varp->nchunkrec; j++) {
+                            fprintf (pfile, "%lld, ", ((MPI_Offset *)ocnt_in)[j]);
+                        }
+                    } else {
+                        MPI_Recv (ocnt_in, varp->nchunkrec, ncchkp->overlaptype, i, 0, ncchkp->comm,
+                                  &stat);
+                        fprintf (pfile, "%d, ", i);
+                        for (j = 0; j < varp->nchunkrec; j++) {
+                            fprintf (pfile, "%lld, ",
+                                     ((ncchkioi_chunk_overlap_t *)ocnt_in)[j].osize);
+                        }
+                    }
+                    fprintf (pfile, "\n");
+
+                    MPI_Recv (cown, varp->nchunkrec, MPI_INT, i, 0, ncchkp->comm, &stat);
+                    for (j = 0; j < varp->nchunkrec; j++) {
+                        if (cown[j] != varp->chunk_owner[j]) {
+                            printf ("Warning: cown[%d][%d] on rank %d = %d, != %d\n", varp->varid, j,
+                                    i, cown[j], varp->chunk_owner[j]);
+                        }
+                    }
+                }
+
+                fclose (pfile);
+                NCI_Free (ocnt_in);
+                NCI_Free (cown);
+            } else {
+                if (ocnt_size == sizeof (MPI_Offset)) {
+                    MPI_Send (ocnt, varp->nchunkrec, MPI_LONG_LONG, 0, 0, ncchkp->comm);
+                } else {
+                    MPI_Send (ocnt, varp->nchunkrec, ncchkp->overlaptype, 0, 0, ncchkp->comm);
+                }
+                MPI_Send (varp->chunk_owner, varp->nchunkrec, MPI_INT, 0, 0, ncchkp->comm);
+            }
+        }
+    }
+#endif
+}
+
+void max_osize_rank_op (void *inp, void *inoutp, int *len, MPI_Datatype *dptr) {
+    int i;
+    ncchkioi_chunk_overlap_t *in    = (ncchkioi_chunk_overlap_t *)inp;
+    ncchkioi_chunk_overlap_t *inout = (ncchkioi_chunk_overlap_t *)inoutp;
+
+    for (i = 0; i < *len; i++) {
+        // Keep the larger overlap; on ties, keep the lower rank
+        if (in->osize > inout->osize) {
+            inout->osize = in->osize;
+            inout->rank  = in->rank;
+        } else if ((in->osize == inout->osize) && (in->rank < inout->rank)) {
+            inout->rank = in->rank;
+        }
+        in++;
+        inout++;
+    }
+}
+
+int ncchkioi_calc_chunk_owner (
+    NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts) {
+    return ncchkioi_calc_chunk_owner_reduce (ncchkp, varp, nreq, starts, counts);
+}
+
+static inline void ncchkioi_rec_chunk_overlap (MPI_Offset *ostart,
+                                               MPI_Offset *osize,
+                                               MPI_Offset *citr,
+                                               NC_chk_var *varp,
+                                               MPI_Offset *ocnt,
+                                               NC_chk_req *reqp) {
+    int i;
+    int req;
+    int cid;  // Chunk iterator
+    MPI_Offset overlapsize;
+
+    for (req = 0; req < reqp->nreq; req++) {
+        ncchkioi_chunk_itr_init_ex (varp, reqp->starts[req], reqp->counts[req], citr, &cid, ostart,
+                                    osize);  // Initialize chunk iterator
+        do {
+            if (cid < varp->nchunkrec) {  // Count only first record
+                // Count overlap
+                overlapsize = 1;
+                for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; }
+                ocnt[cid] += overlapsize;
+                if (ocnt[cid] > varp->chunksize) { ocnt[cid] = varp->chunksize; }
+            }
+        } while (ncchkioi_chunk_itr_next_ex (varp, reqp->starts[req], reqp->counts[req], citr, &cid,
+                                             ostart, osize));
+    }
+}
+
+int ncchkioi_calc_chunk_overlap (NC_chk *ncchkp,
+                                 NC_chk_var *varp,
+                                 int nreq,
+                                 MPI_Offset **starts,
+                                 MPI_Offset **counts,
+                                 ncchkioi_chunk_overlap_t *ocnt) {
+    int err=NC_NOERR;
+    int i, j, k;
+    int cid;  // Chunk iterator
+    int req;
+    MPI_Offset overlapsize;
+    MPI_Offset *ostart, *osize;
+    MPI_Offset *citr;  // Bounding box for chunks overlapping my own write region
+
+    ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3);
+    CHK_PTR (ostart)
+    osize = ostart + varp->ndim;
+    citr  = osize + varp->ndim;
+
+    memset (ocnt, 0, sizeof (ncchkioi_chunk_overlap_t) * varp->nchunkrec);
+
+    // Count overlapsize of each request
+    if (varp->isrec) {
+        for (req = 0; req < nreq; req++) {
+            ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart,
+                                        osize);  // Initialize chunk iterator
+            do {
+                if (cid < varp->nchunkrec) {  // Count only first record
+                    // Count overlap
+                    overlapsize = 1;
+                    for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; }
+                    ocnt[cid].osize += overlapsize;
+                    if (ocnt[cid].osize > varp->chunksize) {
+                        ocnt[cid].osize = varp->chunksize;
+                    }
+                }
+            } while (ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart,
+                                                 osize));
+        }
+    } else {
+        for (req = 0; req < nreq; req++) {
+            ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart,
+                                        osize);  // Initialize chunk iterator
+            do {
+                // Count overlap
+                overlapsize = 1;
+                for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; }
+                ocnt[cid].osize += overlapsize;
+                if (ocnt[cid].osize > varp->chunksize) { ocnt[cid].osize = varp->chunksize; }
+            } while (ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart,
+                                                 osize));
+        }
+    }
+
+    // Shift the byte count into the upper bits; the low 16 bits carry tie-breaking noise
+    for (i = 0; i < varp->nchunkrec; i++) {
+        ocnt[i].rank = ncchkp->rank;
+        ocnt[i].osize *= varp->esize;
+        ocnt[i].osize <<= 16;
+    }
+
+    // Noise to break ties
+    j = (ncchkp->rank - ncchkp->assigned_chunks) % ncchkp->np;
+    if (j < 0) j += ncchkp->np;
+    if (j > varp->nchunkrec) { j = varp->nchunkrec; }
+    k = ncchkp->np - 1;  // noise from 0 ~ np-1
+    for (i = j; i < varp->nchunkrec; i++) {
+        ocnt[i].osize += k;
+        k--;
+        if (k < 0) { k += ncchkp->np; }
+    }
+    for (i = 0; i < j; i++) {
+        ocnt[i].osize += k;
+        k--;
+        if (k < 0) { k += ncchkp->np; }
+    }
+    ncchkp->assigned_chunks += varp->nchunk;
+
+err_out:;
+    NCI_Free (ostart);
+    return err;
+}
+
+void ncchkioi_assign_chunk_owner (NC_chk *ncchkp,
+                                  NC_chk_var *varp,
+                                  ncchkioi_chunk_overlap_t *ocnt) {
+    int i, j;
+    for (i = 0; i < varp->nchunkrec; i++) { varp->chunk_owner[i] = ocnt[i].rank; }
+    if (varp->isrec) {
+        for (i = varp->nchunkrec; i < varp->nchunk; i += varp->nchunkrec) {
+            memcpy (varp->chunk_owner + i, varp->chunk_owner, sizeof (int) * varp->nchunkrec);
+        }
+    }
+
+    // Build the list of chunks owned by this rank
+    if (varp->nchunk > 0) {
+        varp->nmychunkrec = 0;
+        for (j = 0; j < varp->nchunkrec; j++) {
+            if (varp->chunk_owner[j] == ncchkp->rank) { varp->nmychunkrec++; }
+        }
+        varp->nmychunk = varp->nmychunkrec * varp->nrec;
+        varp->mychunks = (int *)NCI_Realloc (varp->mychunks, sizeof (int) * varp->nmychunkrec * varp->nrecalloc);
+        varp->nmychunk = 0;
+        for (j = 0; j < varp->nchunk; j++) {
+            if (varp->chunk_owner[j] == ncchkp->rank) {
+                varp->mychunks[varp->nmychunk++] = j;
+                if (varp->isnew) {  // Only apply to new var, old var will be read when it is
+                                    // needed
+                    // varp->chunk_cache[j] = (void*)NCI_Malloc(varp->chunksize); // Allocate
+                    // buffer for blocks we own
+                    // memset(varp->chunk_cache[j], 0 , varp->chunksize);
+                }
+            }
+        }
+    } else {
+        varp->nmychunk = varp->nmychunkrec = 0;
+        varp->mychunks = NULL;
+    }
+
+    // Update global chunk count
+    ncchkp->nmychunks += (MPI_Offset) (varp->nmychunk);
+    ncchkp->cown_size +=
+        (MPI_Offset) ((double)((MPI_Offset) (varp->nmychunk) * (MPI_Offset) (varp->chunksize)) *
+                      ncchkp->cown_ratio);
+}
+
+int ncchkioi_sync_ocnt_reduce (NC_chk *ncchkp,
+                               int nchunk,
+                               ncchkioi_chunk_overlap_t *ocnt,
+                               ncchkioi_chunk_overlap_t *ocnt_all,
+                               MPI_Request *req) {
+    int err=NC_NOERR;
+    int i;
+
+    // Construct MPI type for overlap if not already constructed
+    if (ncchkp->overlaptype == MPI_DATATYPE_NULL) {
+        err = MPI_Type_contiguous (sizeof (ncchkioi_chunk_overlap_t), MPI_BYTE,
+                                   &(ncchkp->overlaptype));
+        CHK_MPIERR
+        err = MPI_Type_commit (&(ncchkp->overlaptype));
+        CHK_MPIERR
+    }
+
+    if (ncchkp->max_cown_op == MPI_OP_NULL) {
+        err = MPI_Op_create (max_osize_rank_op, 1, &(ncchkp->max_cown_op));
+        CHK_MPIERR
+    }
+
+    // Apply owner penalty
+    for (i = 0; i < nchunk; i++) {
+        ocnt[i].osize -= ncchkp->cown_size << 16;  // Penalty for load balance, set at 1/16
+    }
+
+    if (req) {
+        CHK_ERR_IALLREDUCE (ocnt, ocnt_all, nchunk, ncchkp->overlaptype, ncchkp->max_cown_op,
+                            ncchkp->comm, req);
+    } else {
+        CHK_ERR_ALLREDUCE (ocnt, ocnt_all, nchunk, ncchkp->overlaptype, ncchkp->max_cown_op,
+                           ncchkp->comm);
+    }
+
+err_out:;
+    return err;
+}
+
+int ncchkioi_sync_ocnt_gather (NC_chk *ncchkp,
+                               int nchunk,
+                               ncchkioi_chunk_overlap_t *ocnt,
+                               MPI_Offset **ocnt_all,
+                               MPI_Request *req) {
+    int err=NC_NOERR;
+
+    // Construct MPI type for overlap if not already constructed
+    if (ncchkp->overlaptype == MPI_DATATYPE_NULL) {
+        MPI_Datatype tmptype;
+
+        err = MPI_Type_contiguous (sizeof (MPI_Offset), MPI_BYTE, &tmptype);
+        CHK_MPIERR
+        err = MPI_Type_commit (&tmptype);
+        CHK_MPIERR
+        err = MPI_Type_create_resized (tmptype, 0, sizeof (ncchkioi_chunk_overlap_t),
+                                       &(ncchkp->overlaptype));
+        CHK_MPIERR
+        err = MPI_Type_commit (&(ncchkp->overlaptype));
+        CHK_MPIERR
+        err = MPI_Type_free (&tmptype);
+        CHK_MPIERR
+    }
+
+    if (req) {
+        err = MPI_Igather (ocnt, nchunk, ncchkp->overlaptype, ocnt_all[0], nchunk, MPI_LONG_LONG,
+                           0, ncchkp->comm, req);
+    } else {
+        err = MPI_Gather (ocnt, nchunk, ncchkp->overlaptype, ocnt_all[0], nchunk, MPI_LONG_LONG,
+                          0, ncchkp->comm);
+    }
+    CHK_MPIERR
+
+err_out:;
+    return err;
+}
+
+int ncchkioi_sync_ocnt_gather_bcast (NC_chk *ncchkp,
+                                     NC_chk_var *varp,
+                                     MPI_Offset **ocnt_in,
+                                     ncchkioi_chunk_overlap_t *ocnt_all,
+                                     MPI_Request *req) {
+    int err=NC_NOERR;
+    int i, j, k;
+    MPI_Offset *cown_size;
+
+    if (ncchkp->rank == 0) {
+        // Running total of the data size assigned to each rank
+        cown_size = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ncchkp->np);
+        memset (cown_size, 0, sizeof (MPI_Offset) * ncchkp->np);
+        for (i = 0; i < varp->nchunkrec; i++) {
+            // Pick the rank with the largest overlap after the load-balance penalty
+            k = 0;
+            for (j = 1; j < ncchkp->np; j++) {
+                if (ocnt_in[j][i] - cown_size[j] > ocnt_in[k][i] - cown_size[k]) { k = j; }
+            }
+            cown_size[k] +=
+                (MPI_Offset) ((double)(varp->chunksize) * ncchkp->cown_ratio) * varp->nrec;
+            ocnt_all[i].rank  = k;
+            ocnt_all[i].osize = ocnt_in[k][i];
+        }
+        NCI_Free (cown_size);
+    }
+
+    if (req) {
+        err = MPI_Ibcast (ocnt_all, varp->nchunkrec, ncchkp->overlaptype, 0, ncchkp->comm, req);
+    } else {
+        err = MPI_Bcast (ocnt_all, varp->nchunkrec, ncchkp->overlaptype, 0, ncchkp->comm);
+    }
+    CHK_MPIERR
+
+err_out:;
+    return err;
+}
+
+int ncchkioi_calc_chunk_owner_reduce (
+    NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts) {
+    int err=NC_NOERR;
+    ncchkioi_chunk_overlap_t *ocnt, *ocnt_all;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_COWN)
+
+    ocnt = (ncchkioi_chunk_overlap_t *)NCI_Malloc (sizeof (ncchkioi_chunk_overlap_t) *
+                                                   varp->nchunkrec * 2);
+    CHK_PTR (ocnt)
+    ocnt_all = ocnt + varp->nchunkrec;
+
+    err = ncchkioi_calc_chunk_overlap (ncchkp, varp, nreq, starts, counts, ocnt);
+    CHK_ERR
+
+    if (ncchkp->exact_cown) {
+        // err = ncchkioi_sync_ocnt_gather (ncchkp, varp->nchunkrec, ocnt, ocnt_all, NULL);
+        // CHK_ERR
+        RET_ERR (NC_ENOTSUPPORT)
+    } else {
+        err = ncchkioi_sync_ocnt_reduce (ncchkp, varp->nchunkrec, ocnt, ocnt_all, NULL);
+        CHK_ERR
+    }
+
+    ncchkioi_assign_chunk_owner (ncchkp, varp, ocnt_all);
+
+    ncchkioi_write_chunk_ocnt (ncchkp, varp, ocnt, sizeof (ncchkioi_chunk_overlap_t));
+
+    NCI_Free (ocnt);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_COWN)
+
+err_out:;
+    return err;
+}
+
+static inline int ncchkioi_reduce_max_csize_n (
+    NC_chk *ncchkp, int nvar, NC_chk_var **varps, MPI_Offset **ocnts, int **cowns) {
+    int err=NC_NOERR;
+    int i, j, k, v;
+    int nchunk;
+    MPI_Offset **ocnts_all[2];
+    MPI_Offset *cown_size;
+    MPI_Offset *ocnt, **ocnt_all;
+    int *cown;
+    NC_chk_var *varp;
+    MPI_Request req;
+    MPI_Request *bcast_reqs;
+    MPI_Status stat;
+
+    bcast_reqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nvar);
+    CHK_PTR (bcast_reqs)
+
+    if (ncchkp->rank == 0) {
+        // Size owned by each process
+        cown_size = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ncchkp->np);
+        CHK_PTR (cown_size)
+        memset (cown_size, 0, sizeof (MPI_Offset) * ncchkp->np);
+
+        // Max #chunks across vars
+        nchunk = 0;
+        for (v = 0; v < nvar; v++) {
+            varp = varps[v];
+            if (varp->nchunkrec > nchunk) { nchunk = varp->nchunkrec; }
+        }
+        // Allocate 2 sets of ocnts_all (double buffering: one being gathered, one in use)
+        ocnts_all[0] = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * ncchkp->np * 2);
+        CHK_PTR (ocnts_all[0])
+        ocnts_all[1] = ocnts_all[0] + ncchkp->np;
+
+        ocnts_all[0][0] = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * nchunk * ncchkp->np * 2);
+        CHK_PTR (ocnts_all[0][0])
+        ocnts_all[1][0] = ocnts_all[0][0] + nchunk * ncchkp->np;
+        for (i = 1; i < ncchkp->np; i++) {
+            ocnts_all[0][i] = ocnts_all[0][i - 1] + nchunk;
+            ocnts_all[1][i] = ocnts_all[1][i - 1] + nchunk;
+        }
+
+        if (nvar > 0) {
+            varp     = varps[0];
+            ocnt     = ocnts[0];
+            ocnt_all = ocnts_all[0];
+            err = MPI_Igather (ocnt, varp->nchunkrec, MPI_LONG_LONG, ocnt_all[0], varp->nchunkrec,
+                               MPI_LONG_LONG, 0, ncchkp->comm, &req);
+            CHK_ERR
+        }
+
+        for (v = 0; v < nvar; v++) {
+            cown     = cowns[v];
+            varp     = varps[v];
+            ocnt     = ocnts[v];
+            ocnt_all = ocnts_all[v & 1];
+
+            // Wait for comm
+            err = MPI_Wait (&req, &stat);
+            CHK_ERR
+
+            // Post comm for next var
+            if (v < nvar - 1) {
+                err = MPI_Igather (ocnts[v + 1], varps[v + 1]->nchunkrec, MPI_LONG_LONG,
+                                   ocnts_all[(v + 1) & 1][0], varps[v + 1]->nchunkrec,
+                                   MPI_LONG_LONG, 0, ncchkp->comm, &req);
+                CHK_ERR
+            }
+
+            // Compute max rank for this var
+            memset (cown, 0, sizeof (int) * varp->nchunkrec);
+            for (i = 0; i < varp->nchunkrec; i++) {
+                k = 0;
+                for (j = 1; j < ncchkp->np; j++) {
+                    if (ocnt_all[j][i] - cown_size[j] > ocnt_all[k][i] - cown_size[k]) { k = j; }
+                }
+                cown_size[k] +=
+                    (MPI_Offset) ((double)(varp->chunksize) * ncchkp->cown_ratio) * varp->nrec;
+                cown[i] = k;
+            }
+
+            // Bcast result
+            err = MPI_Ibcast (cown, varp->nchunkrec, MPI_INT, 0, ncchkp->comm, bcast_reqs + v);
+            CHK_ERR
+        }
+    } else {
+        for (v = 0; v < nvar; v++) {
+            // Send to rank 0
+            err = MPI_Gather (ocnts[v], varps[v]->nchunkrec, MPI_LONG_LONG, NULL,
+                              varps[v]->nchunkrec, MPI_LONG_LONG, 0, ncchkp->comm);
+            CHK_ERR
+            // Recv result
+            err = MPI_Ibcast (cowns[v], varps[v]->nchunkrec, MPI_INT, 0, ncchkp->comm, bcast_reqs + v);
+            CHK_ERR
+        }
+    }
+
+    err = MPI_Waitall (nvar, bcast_reqs, MPI_STATUSES_IGNORE);
+    CHK_ERR
+
+    if (ncchkp->rank == 0) {
+        NCI_Free (cown_size);
+        NCI_Free (ocnts_all[0][0]);
+        NCI_Free (ocnts_all[0]);
+    }
+    NCI_Free (bcast_reqs);
+
+err_out:;
+    return err;
+}
+
+int ncchkioi_calc_chunk_owner_gather (
+    NC_chk *ncchkp, int nvar, NC_chk_var **varps, int nput, int *putreqs, int nget, int *getreqs) {
+    int err=NC_NOERR;
+    int i, j;
+    int nchunks;
+    MPI_Offset *ostart, *osize;
+    MPI_Offset *citr;  // Bounding box for chunks overlapping my own write region
+    MPI_Offset **ocnts;
+    int **cowns;
+    NC_chk_var *varp;
+    int *idmap;
+    NC_chk_req *reqp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_COWN)
+
+    // Allocate buffer for the overlapping structure
+    // Box of single overlap
+    ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ncchkp->max_ndim * 3);
+    osize  = ostart + ncchkp->max_ndim;
+    citr   = osize + ncchkp->max_ndim;
+    // Calculate total number of chunks to assign
+    idmap   = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt);
+    nchunks = 0;
+    for (i = 0; i < nvar; i++) {
+        idmap[varps[i]->varid] = i;
+        nchunks += varps[i]->nchunkrec;
+    }
+    // Overlap count struct
+    ocnts    = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * nvar);
+    ocnts[0] = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * nchunks);
+    cowns    = (int **)NCI_Malloc (sizeof (int *) * nvar);
+    cowns[0] = varps[0]->chunk_owner;
+    for (i = 1; i < nvar; i++) {
+        ocnts[i] = ocnts[i - 1] + varps[i - 1]->nchunkrec;
+        cowns[i] = varps[i]->chunk_owner;
+    }
+
+    // Count overlapsize for each request
+    memset (ocnts[0], 0, sizeof (MPI_Offset) * nchunks);
+    for (i = 0; i < nput; i++) {
+        reqp = ncchkp->putlist.reqs + putreqs[i];
+        ncchkioi_rec_chunk_overlap (ostart, osize, citr, ncchkp->vars.data + reqp->varid,
+                                    ocnts[idmap[reqp->varid]], reqp);
+    }
+    for (i = 0; i < nget; i++) {
+        reqp = ncchkp->getlist.reqs + getreqs[i];
+        ncchkioi_rec_chunk_overlap (ostart, osize, citr, ncchkp->vars.data + reqp->varid,
+                                    ocnts[idmap[reqp->varid]], reqp);
+    }
+
+    // Calculate the max rank
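+    // Greedy assignment with load feedback: each chunk goes to the rank with the
+    // largest remaining overlap after subtracting what that rank already owns
+    // (cown_size). E.g., with 2 ranks and equal overlap on every chunk, rank 0
+    // wins chunk 0, which raises cown_size[0], so rank 1 wins chunk 1, and so on.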
+    err = ncchkioi_reduce_max_csize_n (ncchkp, nvar, varps, ocnts, cowns);
+
+    // Copy owner to other records
+    for (i = 0; i < nvar; i++) {
+        varp = varps[i];
+        if (varp->isrec) {
+            for (j = varp->nchunkrec; j < varp->nchunk; j += varp->nchunkrec) {
+                memcpy (varp->chunk_owner + j, varp->chunk_owner, sizeof (int) * varp->nchunkrec);
+            }
+        }
+        ncchkioi_write_chunk_ocnt (ncchkp, varp, ocnts[i], sizeof (MPI_Offset));
+    }
+
+    NCI_Free (ostart);
+    NCI_Free (ocnts[0]);
+    NCI_Free (ocnts);
+    NCI_Free (cowns);
+    NCI_Free (idmap);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_COWN)
+
+    return err;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_chunk_size.c b/src/drivers/ncchunkio/ncchkioi_chunk_size.c
new file mode 100644
index 000000000..03b610f5d
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_chunk_size.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mpi.h>
+
+#include <pnc_debug.h>
+#include <common.h>
+
+#include "ncchkio_internal.h"
+
+MPI_Offset gcd (MPI_Offset a, MPI_Offset b) {
+    if (b) {
+        while ((a %= b) && (b %= a))
+            ;
+    }
+    return a + b;
+}
+
+void gcd_reduce (long long *in, long long *inout, int *len, MPI_Datatype *dptr) {
+    int i;
+
+    for (i = 0; i < *len; i++) {
+        if (*inout)
+            while (((*in) %= (*inout)) && ((*inout) %= (*in)))
+                ;
+        (*inout) = (*inout) + (*in);
+        in++;
+        inout++;
+    }
+}
+
+// Comparator for qsort: sorts in descending order without risking integer
+// truncation from casting an MPI_Offset difference to int
+int smaller (const void *a, const void *b) {
+    MPI_Offset va = *(const MPI_Offset *)a;
+    MPI_Offset vb = *(const MPI_Offset *)b;
+    return (vb > va) - (vb < va);
+}
+
+int ncchkioi_calc_chunk_size (
+    NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts) {
+    int err=NC_NOERR;
+    int r, i, j;
+    int primes[] = {2,  3,  5,  7,  11, 13, 17, 19, 23, 29, 31, 37, 41,
+                    43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97};
+    MPI_Offset *chunkdim;
+    MPI_Offset **candidates;
+    MPI_Offset chunksize;
+    MPI_Offset ub, lb;
+    MPI_Op gcd_op;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_CSIZE)
+
+    // Upper and lower bound of reasonable chunk size
+    ub = (MPI_Offset)INT_MAX;  // Max chunk size supported
+    lb = 1;
+    for (i = 0; i < varp->ndim; i++) { lb *= varp->dimsize[i]; }
+    lb /= (MPI_Offset)INT_MAX;  // Max # chunks supported
+    if (lb < varp->ndim * 3) {  // Metadata should not exceed data
+        lb = varp->ndim * 3;
+    }
+    if (lb < 1024) {  // At least 1 KiB for efficiency
+        lb = 1024;
+    }
+
+    /* Infer chunk size from the requests.
+     * Assume the application is doing a blocked decomposition.
+     * If we set the chunk dim to the gcd of all access boundaries, no
+     * communication is required.
+     * If the pattern is completely randomized, the result will likely be 1.
+     */
+    chunkdim = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim);
+    if (nreq > 0) {
+        candidates    = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * varp->ndim);
+        candidates[0] = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * nreq);
+        for (i = 1; i < varp->ndim; i++) { candidates[i] = candidates[i - 1] + nreq; }
+        for (r = 0; r < nreq; r++) {
+            for (i = 0; i < varp->ndim; i++) {
+                candidates[i][r] = gcd (starts[r][i], counts[r][i]);
+            }
+        }
+        for (i = 0; i < varp->ndim; i++) {
+            qsort (candidates[i], nreq, sizeof (MPI_Offset), smaller);
+            chunkdim[i] = candidates[i][0];
+            for (r = 1; r < nreq / 2; r++) {  // Take the top 50% to drop out fragmented writes
+                chunkdim[i] = gcd (chunkdim[i], candidates[i][r]);
+            }
+        }
+    } else {
+        for (i = 0; i < varp->ndim; i++) {
+            chunkdim[i] = 0;  // We have no clue, listen to other processes
+        }
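+        // Example of the gcd heuristic above: if rank 0 writes start=0,count=64
+        // and rank 1 writes start=64,count=64 along a dimension, every access
+        // boundary is a multiple of 64, so the per-rank gcd and the global gcd
+        // below both yield a chunk dim of 64 and each chunk is written by exactly
+        // one rank; random offsets degrade the gcd toward 1, which the lower
+        // bound below then rounds back up.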
+    }
+
+    // Global gcd
+    MPI_Op_create ((MPI_User_function *)gcd_reduce, 1, &gcd_op);
+    CHK_ERR_ALLREDUCE (MPI_IN_PLACE, chunkdim, varp->ndim, MPI_LONG_LONG, gcd_op, ncchkp->comm);
+    MPI_Op_free (&gcd_op);
+
+    // If we have no clue across processes, set chunk to max
+    for (i = 0; i < varp->ndim; i++) {
+        if (chunkdim[i] == 0) { chunkdim[i] = varp->dimsize[i]; }
+    }
+
+    // At least 1 for rec dim
+    if (varp->isrec) {
+        if (chunkdim[0] == 0) { chunkdim[0] = 1; }
+    }
+
+    // Check if chunk size is reasonable (not too large or too small)
+    chunksize = 1;
+    for (i = 0; i < varp->ndim; i++) { chunksize *= chunkdim[i]; }
+
+    // We only support chunk sizes up to INT_MAX
+    if (chunksize > ub) {
+        // Can we find a perfect split using small prime numbers?
+        j = 0;
+        while ((j < 25) && (chunksize > ub)) {
+            r = 1;
+            for (i = 0; i < varp->ndim; i++) {  // Splitting chunks along dims
+                if (chunkdim[i] % primes[j] == 0) {
+                    chunkdim[i] /= primes[j];
+                    chunksize /= primes[j];
+                    r = 0;
+                }
+            }
+            if (r) {  // No fit, try next prime
+                j++;
+            }
+        }
+        if (j >= 25) {  // If not, we still need to split even if it introduces communication
+                        // overhead
+            for (i = 0; chunksize > ub; i++) {  // Halving chunks one dim at a time
+                chunkdim[i % varp->ndim] /= 2;
+                chunksize /= 2;
+            }
+        }
+    } else if (chunksize < lb) {  // Data smaller than metadata
+        int tmp;
+        int *heap;
+        int hsize;
+
+        // Build heap of smallest chunk dim
+        heap = (int *)NCI_Malloc (sizeof (int) * varp->ndim);
+        for (i = 0; i < varp->ndim; i++) {
+            heap[i] = i;
+            j       = i;
+            r       = (j - 1) / 2;
+            while (j > 0 && chunkdim[heap[j]] < chunkdim[heap[r]]) {
+                tmp     = heap[j];
+                heap[j] = heap[r];
+                heap[r] = tmp;
+                j       = r;
+                r       = (j - 1) / 2;
+            }
+        }
+
+        hsize = varp->ndim;
+        while (chunksize < lb && hsize > 0) {
+            j = heap[0];
+            if (chunkdim[j] * 2 <= varp->dimsize[j]) {  // Merge chunk along smallest dim
+                chunkdim[j] *= 2;
+                chunksize *= 2;
+            } else {  // Already reached var dim, remove from consideration
+                heap[0] = heap[--hsize];
+            }
+            // Heapify
+            r = 0;
+            i = r * 2 + 1;
+            j = r * 2 + 2;
+            while (i < hsize) {
+                if ((j >= hsize) || (chunkdim[heap[i]] < chunkdim[heap[j]])) {
+                    if (chunkdim[heap[i]] < chunkdim[heap[r]]) {
+                        tmp     = heap[r];
+                        heap[r] = heap[i];
+                        heap[i] = tmp;
+                        r       = i;
+                    } else {
+                        break;
+                    }
+                } else {
+                    if (chunkdim[heap[j]] < chunkdim[heap[r]]) {
+                        tmp     = heap[r];
+                        heap[r] = heap[j];
+                        heap[j] = tmp;
+                        r       = j;
+                    } else {
+                        break;
+                    }
+                }
+                i = r * 2 + 1;
+                j = r * 2 + 2;
+            }
+        }
+        NCI_Free (heap);
+
+        // Still not enough after doing everything, just set to entire var
+        if (chunksize < lb) {
+            memcpy (chunkdim, varp->dimsize, sizeof (MPI_Offset) * varp->ndim);
+
+            // At least 1 for rec dim
+            if (varp->isrec) {
+                if (chunkdim[0] == 0) { chunkdim[0] = 1; }
+            }
+        }
+    }
+
+    for (i = 0; i < varp->ndim; i++) { varp->chunkdim[i] = (int)chunkdim[i]; }
+
+err_out:;
+
+    NCI_Free (chunkdim);
+    if (nreq > 0) {
+        NCI_Free (candidates[0]);
+        NCI_Free (candidates);
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_CSIZE)
+
+    return err;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_convert.c b/src/drivers/ncchunkio/ncchkioi_convert.c
new file mode 100644
index 000000000..d7bb304bb
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_convert.c
@@ -0,0 +1,939 @@
+/* Do not edit this file. It is produced from the corresponding .m4 source */
+/*
+ * Copyright (C) 2018, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
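+ *
+ * ncchkioiconvert() casts an array of N elements between two MPI primitive
+ * types, element by element; the pairwise type cases below are expanded from
+ * the companion ncchkioi_convert.m4 source.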
+ */ +/* $Id$ */ + + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +#include +#include +#include +#include + +int ncchkioiconvert(void *inbuf, void *outbuf, MPI_Datatype intype, MPI_Datatype outtype, int N) { + int i; + + if (intype == MPI_BYTE){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_CHAR){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { 
+ for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_SIGNED_CHAR){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_UNSIGNED_CHAR){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; 
i++){ + ((float*)outbuf)[i] = (float)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_SHORT){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_UNSIGNED_SHORT){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((unsigned short*)inbuf)[i]); + } + return 
NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_INT){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_UNSIGNED){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned 
short)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_FLOAT){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_DOUBLE){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + 
((short*)outbuf)[i] = (short)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_LONG_LONG_INT){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_UNSIGNED_LONG_LONG){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((unsigned long long*)inbuf)[i]); + } + return 
NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + + return NC_NOERR; +} \ No newline at end of file diff --git a/src/drivers/ncchunkio/ncchkioi_convert.m4 b/src/drivers/ncchunkio/ncchkioi_convert.m4 new file mode 100644 index 000000000..8f2fd9bad --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_convert.m4 @@ -0,0 +1,79 @@ +dnl Process this m4 file to produce 'C' language file. +dnl +dnl If you see this line, you can ignore the next one. +/* Do not edit this file. It is produced from the corresponding .m4 source */ +dnl +/* + * Copyright (C) 2018, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */
+/* $Id$ */
+dnl
+include(`foreach.m4')dnl
+include(`utils.m4')dnl
+dnl
+define(`upcase', `translit(`$*', `a-z', `A-Z')')dnl
+dnl
+define(`SWOUT',dnl
+`dnl
+    if (outtype == $1) {
+        for(i = 0; i < N; i++){
+            (($2*)outbuf)[i] = ($2)((($3*)inbuf)[i]);
+        }
+        return NC_NOERR;
+    }
+')dnl
+dnl
+define(`SWIN',dnl
+`dnl
+    if (intype == $1){
+
+foreach(`dt', (`(`MPI_BYTE', `char')', dnl
+               `(`MPI_CHAR', `char')', dnl
+               `(`MPI_SIGNED_CHAR', `signed char')', dnl
+               `(`MPI_UNSIGNED_CHAR', `unsigned char')', dnl
+               `(`MPI_SHORT', `short')', dnl
+               `(`MPI_UNSIGNED_SHORT', `unsigned short')', dnl
+               `(`MPI_INT', `int')', dnl
+               `(`MPI_UNSIGNED', `unsigned int')', dnl
+               `(`MPI_FLOAT', `float')', dnl
+               `(`MPI_DOUBLE', `double')', dnl
+               `(`MPI_LONG_LONG_INT', `long long')', dnl
+               `(`MPI_UNSIGNED_LONG_LONG', `unsigned long long')', dnl
+               ), `SWOUT(translit(dt, `()'), $2)')dnl
+    DEBUG_RETURN_ERROR(NC_EBADTYPE);
+    }
+')dnl
+dnl
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+int ncchkioiconvert(void *inbuf, void *outbuf, MPI_Datatype intype, MPI_Datatype outtype, int N) {
+    int i;
+
+foreach(`dt', (`(`MPI_BYTE', `char')', dnl
+               `(`MPI_CHAR', `char')', dnl
+               `(`MPI_SIGNED_CHAR', `signed char')', dnl
+               `(`MPI_UNSIGNED_CHAR', `unsigned char')', dnl
+               `(`MPI_SHORT', `short')', dnl
+               `(`MPI_UNSIGNED_SHORT', `unsigned short')', dnl
+               `(`MPI_INT', `int')', dnl
+               `(`MPI_UNSIGNED', `unsigned int')', dnl
+               `(`MPI_FLOAT', `float')', dnl
+               `(`MPI_DOUBLE', `double')', dnl
+               `(`MPI_LONG_LONG_INT', `long long')', dnl
+               `(`MPI_UNSIGNED_LONG_LONG', `unsigned long long')', dnl
+               ), `SWIN(translit(dt, `()'))')dnl
+    DEBUG_RETURN_ERROR(NC_EBADTYPE);
+
+    return NC_NOERR;
+}
\ No newline at end of file
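For orientation, the nested `foreach` in this template expands into the full 12-by-12 cast matrix shown in the generated file above. A minimal, hypothetical caller of the resulting `ncchkioiconvert` might look like this; the prototype is taken from the m4 source, and `NC_NOERR` is assumed to be 0 per the NetCDF convention:

```
#include <stdio.h>
#include <mpi.h>

/* Prototype as emitted by the m4 template above */
int ncchkioiconvert(void *inbuf, void *outbuf, MPI_Datatype intype,
                    MPI_Datatype outtype, int N);

int main(int argc, char **argv) {
    float  in[4] = {1.5f, 2.5f, 3.5f, 4.5f};
    double out[4];
    int i, err;

    MPI_Init(&argc, &argv);
    /* Widen float -> double, element by element */
    err = ncchkioiconvert(in, out, MPI_FLOAT, MPI_DOUBLE, 4);
    if (err == 0) { /* NC_NOERR */
        for (i = 0; i < 4; i++) printf("%.1f\n", out[i]);
    }
    MPI_Finalize();
    return 0;
}
```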
diff --git a/src/drivers/ncchunkio/ncchkioi_get_var.c b/src/drivers/ncchunkio/ncchkioi_get_var.c
new file mode 100644
index 000000000..2b4db288b
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_get_var.c
@@ -0,0 +1,878 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+/*
+ * This file implements the following PnetCDF APIs.
+ *
+ * ncmpi_get_var_all()        : dispatcher->get_var()
+ * ncmpi_put_var_all()        : dispatcher->put_var()
+ * ncmpi_get_var<kind>_all()  : dispatcher->get_var()
+ * ncmpi_put_var<kind>_all()  : dispatcher->put_var()
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ncchkio_internal.h"
+
+int ncchkioi_get_var_cb_chunk (NC_chk *ncchkp,
+                               NC_chk_var *varp,
+                               const MPI_Offset *start,
+                               const MPI_Offset *count,
+                               const MPI_Offset *stride,
+                               void *buf) {
+    int err=NC_NOERR;
+    int i, j, k;
+    int cid;  // Chunk iterator
+
+    MPI_Offset *ostart = NULL, *osize;
+    int *tsize = NULL, *tssize, *tstart, *tsizep, *tssizep, *tstartp;  // Size for sub-array type
+    MPI_Offset *citr;  // Chunk iterator
+
+    int *rcnt_local = NULL, *rcnt_all = NULL;  // Number of processes that write to each chunk
+
+    int overlapsize;    // Size of the overlapping region of request and chunk
+    char *cbuf = NULL;  // Intermediate contiguous buffer
+
+    int packoff;         // Pack offset
+    MPI_Datatype ptype;  // Pack datatype
+
+    int nread;         // # chunks to read from file
+    int *rids = NULL;  // Ids of chunks to read from file
+
+    int nsend, nrecv;  // Number of sends and receives
+    MPI_Request *sreqs = NULL, *rreqs = NULL;   // Send and recv requests
+    MPI_Status *sstats = NULL, *rstats = NULL;  // Send and recv statuses
+    char **sbufs = NULL, **rbufs = NULL;        // Send and recv buffers
+    int *rsizes = NULL;  // Recv size of each message
+    MPI_Message rmsg;    // Receive message
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT)
+
+    // Allocate buffering for write count
+    rcnt_local = (int *)NCI_Malloc (sizeof (int) * varp->nchunk * 2);
+    rcnt_all   = rcnt_local + varp->nchunk;
+
+    // Allocate buffering for overlapping index
+    tsize  = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3);
+    tssize = tsize + varp->ndim;
+    tstart = tssize + varp->ndim;
+    ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3);
+    osize  = ostart + varp->ndim;
+
+    // Chunk iterator
+    citr = osize + varp->ndim;
+
+    // We need to calculate the message size for each chunk
+    // This is just for allocating the send buffers
+    // We do so by iterating through all requests and all chunks they cover
+    // If we are not the owner of a chunk, we need to send a message
+    memset (rcnt_local, 0, sizeof (int) * varp->nchunk);
+    nsend = 0;
+
+    // Iterate through chunks
+    ncchkioi_chunk_itr_init (varp, start, count, citr, &cid);
+    do {
+        rcnt_local[cid] = 1;
+
+        if (varp->chunk_owner[cid] != ncchkp->rank) {
+            // Count the number of messages we need to send
+            nsend++;
+        }
+    } while (ncchkioi_chunk_itr_next (varp, start, count, citr, &cid));
+
+    NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB_INIT)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SYNC)
+
+    // Sync the number of messages of each chunk
+    CHK_ERR_ALLREDUCE (rcnt_local, rcnt_all, varp->nchunk, MPI_INT, MPI_SUM, ncchkp->comm);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SYNC)
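The count-then-allreduce handshake above is the crux of the collective protocol: only the owner of a chunk must service requests for it, so every rank marks the chunks it touches and a single `MPI_SUM` all-reduce tells each owner how many messages to expect. A self-contained sketch of the same pattern (illustrative only, not the driver code; plain `malloc` instead of `NCI_Malloc`):

```
#include <stdlib.h>
#include <mpi.h>

/* touched[c] = 1 if this rank accesses chunk c, else 0.
 * owner[c] is the rank owning chunk c.
 * Returns the number of request messages this rank must expect. */
static int count_incoming_requests(int nchunk, const int *touched,
                                   const int *owner, MPI_Comm comm)
{
    int c, rank, nrecv = 0;
    int *total = (int *)malloc(sizeof(int) * nchunk);

    MPI_Comm_rank(comm, &rank);
    /* Element-wise sum: total[c] = number of ranks touching chunk c */
    MPI_Allreduce(touched, total, nchunk, MPI_INT, MPI_SUM, comm);

    for (c = 0; c < nchunk; c++)
        if (owner[c] == rank)               /* chunks this rank owns */
            nrecv += total[c] - touched[c]; /* exclude our own access */

    free(total);
    return nrecv;
}
```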
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT)
+
+    // We need to prepare the chunks in the chunk cache
+    // For chunks not yet allocated, we need to read them from file collectively
+    // We collect the chunk ids of those chunks
+    // Calculate the number of recv requests
+    // This is for all the chunks
+    rids  = (int *)NCI_Malloc (sizeof (int) * varp->nmychunk);
+    nread = 0;
+    nrecv = 0;
+    for (i = 0; i < varp->nmychunk; i++) {
+        cid = varp->mychunks[i];
+        // We don't need a message for our own data
+        nrecv += rcnt_all[cid] - rcnt_local[cid];
+        // Count the number of chunks we need to prepare
+        // We read only chunks that are required
+
+        if (rcnt_all[cid] || rcnt_local[cid]) {
+            if (varp->chunk_cache[cid] == NULL) {
+                // err = ncchkioi_cache_alloc(ncchkp, varp->chunksize, varp->chunk_cache + cid);
+                if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; }
+            } else {
+                // ncchkioi_cache_visit(ncchkp, varp->chunk_cache[cid]);
+            }
+        }
+    }
+
+    NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB)  // I/O time counted separately
+
+#ifdef PNETCDF_PROFILING
+    MPI_Barrier (ncchkp->comm);
+#endif
+    // Decompress chunks into the chunk cache
+    err = ncchkioi_load_var (ncchkp, varp, nread, rids);
+    CHK_ERR
+    // Increase the batch number to indicate that allocated chunk buffers can be freed for future allocations
+    (ncchkp->cache_serial)++;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB)
+
+    // Allocate buffers for send and recv
+    // We need to accept nrecv requests and receive nsend replies
+    rreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nrecv + nsend));
+    CHK_PTR (rreqs)
+    rstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nrecv + nsend));
+    CHK_PTR (rstats)
+    rbufs = (char **)NCI_Malloc (sizeof (char *) * (nrecv + nsend));
+    CHK_PTR (rbufs)
+    rsizes = (int *)NCI_Malloc (sizeof (int) * (nrecv + nsend));
+    CHK_PTR (rsizes)
+    // We need to send nsend requests and reply to nrecv requests
+    sbufs = (char **)NCI_Malloc (sizeof (char *) * (nrecv + nsend));
+    CHK_PTR (sbufs)
+    sreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nrecv + nsend));
+    CHK_PTR (sreqs)
+    sstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nrecv + nsend));
+    CHK_PTR (sstats)
+
+    // Post sends
+    k = 0;
+    // Initialize chunk iterator
+    ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, osize);
+    // Iterate through chunks
+    do {
+        // We have something to send if we are not the owner
+        if (varp->chunk_owner[cid] != ncchkp->rank) {
+            NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REQ)
+
+            // Calculate the chunk overlap
+            overlapsize = varp->esize;
+            for (j = 0; j < varp->ndim; j++) { overlapsize *= osize[j]; }
+
+            // Allocate buffers
+            sbufs[k] = (char *)NCI_Malloc (sizeof (int) * varp->ndim * 2);  // For request
+            CHK_PTR (sbufs[k])
+            rbufs[k + nrecv] =
+                (char *)NCI_Malloc (overlapsize);  // For reply, first nrecv are for requests
+            CHK_PTR (rbufs[k + nrecv])
+
+            // Metadata
+            tstartp = (int *)sbufs[k];
+            packoff = varp->ndim * sizeof (int);
+            tsizep  = (int *)(sbufs[k] + packoff);
+            packoff += varp->ndim * sizeof (int);
+            for (j = 0; j < varp->ndim; j++) {
+                tstartp[j] = (int)(ostart[j] - citr[j]);
+                tsizep[j]  = (int)osize[j];
+            }
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REQ)
+            NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ)
+
+            // Send request
+            CHK_ERR_ISEND (sbufs[k], packoff, MPI_BYTE, varp->chunk_owner[cid], cid, ncchkp->comm,
+                           sreqs + k);
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ)
+            NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP)
+
+            // Post recv for the reply
+            CHK_ERR_IRECV (rbufs[k + nrecv], overlapsize, MPI_BYTE, varp->chunk_owner[cid],
+                           cid + 1024, ncchkp->comm, rreqs + nrecv + k);
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP)
+
+            k++;
+        }
+    } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize));
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ)
+
+    // Post recvs
+    k = 0;
+    for (i = 0; i < varp->nmychunk; i++) {
+        cid = varp->mychunks[i];
+        // We are the owner of the chunk
+        // Receive data from other processes
+        for (j = 0; j < rcnt_all[cid] - rcnt_local[cid]; j++) {
+            // Get the message size, including metadata
+            CHK_ERR_MPROBE (MPI_ANY_SOURCE, cid, ncchkp->comm,
&rmsg, rstats); + CHK_ERR_GET_COUNT (rstats, MPI_BYTE, rsizes + k); + + // Allocate buffer + rbufs[k] = (char *)NCI_Malloc (rsizes[k]); + CHK_PTR (rbufs[k]) + + // Post irecv + CHK_ERR_IMRECV (rbufs[k], rsizes[k], MPI_BYTE, &rmsg, rreqs + k); + k++; + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Allocate intermediate buffer + cbuf = (char *)NCI_Malloc (varp->chunksize); + CHK_PTR (cbuf) + + // For each chunk we own, we need to receive incoming data + k = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SELF) + + // Handle our own data first if we have any + if (rcnt_local[cid] > 0) { + // Convert chunk id to iterator + get_chunk_itr (varp, cid, citr); + + // Calculate overlapping region + get_chunk_overlap (varp, citr, start, count, ostart, osize); + + // Pack type from chunk buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, cbuf, varp->chunksize, &packoff, + ncchkp->comm); + overlapsize = packoff; + MPI_Type_free (&ptype); + + // Pack type from (contiguous) intermediate buffer to user buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into user buffer + packoff = 0; + CHK_ERR_UNPACK (cbuf, overlapsize, &packoff, buf, 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SELF) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Wait for all send requests related to this chunk + // We remove the impact of -1 mark in rcnt_local[cid] + // printf("Rank: %d, CHK_ERR_WAITALL_recv(%d, %d)\n", ncchkp->rank, rcnt_all[cid] - + // rcnt_local[cid], k); fflush(stdout); + CHK_ERR_WAITALL (rcnt_all[cid] - rcnt_local[cid], rreqs + k, rstats + k); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Now, it is time to process data from other processes + for (j = 0; j < varp->ndim; j++) { tsize[j] = varp->chunkdim[j]; } + + // Process data received + // printf("nrecv = %d, rcnt_all = %d, rcnt_local = %d\n", nrecv, rcnt_all[cid], + // rcnt_local[cid]); fflush(stdout); + for (j = k; j < k + rcnt_all[cid] - rcnt_local[cid]; j++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Metadata + tstartp = (int *)rbufs[j]; + packoff = varp->ndim * sizeof (int); + tssizep = (int *)(rbufs[j] + packoff); + packoff += varp->ndim * sizeof (int); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REP) + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Allocate buffer + MPI_Type_size (ptype, &overlapsize); + sbufs[j + nsend] = (char *)NCI_Malloc (overlapsize); // For reply + + // Data + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, sbufs[j + nsend], overlapsize, + &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REP) + NC_CHK_TIMER_START 
(NC_CHK_TIMER_GET_CB_SEND_REP) + + // Send reply + CHK_ERR_ISEND (sbufs[j + nsend], packoff, MPI_BYTE, rstats[j].MPI_SOURCE, cid + 1024, + ncchkp->comm, sreqs + j + nsend); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + } + k += rcnt_all[cid] - rcnt_local[cid]; + + // princbuf(ncchkp->rank, varp->chunk_cache[cid], varp->chunksize); + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Wait for all send request + // printf("Rank: %d, CHK_ERR_WAITALL_send(%d, %d)\n", ncchkp->rank, nsend, 0); fflush(stdout); + CHK_ERR_WAITALL (nsend, sreqs, sstats); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Receive replies from the owners and update the user buffer + k = 0; + // Initialize chunk iterator + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, osize); + // Iterate through chunks + do { + // We got something to recv if we are not owner + if (varp->chunk_owner[cid] != ncchkp->rank) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + // Pack type from recv buffer to user buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + tssize[j] = (int)osize[j]; + } + // printf("Rank: %d, ostart=[%lld, %lld], osize=[%lld, %lld]\n", ncchkp->rank, + // ostart[0], ostart[1], osize[0], osize[1]); fflush(stdout); printf("Rank: %d, + // CHK_ERR_TYPE_CREATE_SUBARRAY4([%d, %d], [%d, %d], [%d, %d]\n", ncchkp->rank, + // tsize[0], tsize[1], tssize[0], tssize[1], tstart[0], tstart[1]); fflush(stdout); + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + // printf("Rank: %d, commit\n", ncchkp->rank); fflush(stdout); + CHK_ERR_TYPE_COMMIT (&ptype); + MPI_Type_size (ptype, &overlapsize); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // printf("Rank: %d, wait recv, nrecv = %d, k = %d, nsend = %d\n", ncchkp->rank, nrecv, + // k, nsend); fflush(stdout); + // Wait for reply + // printf("Rank: %d, MPI_Wait_recv(%d)\n", ncchkp->rank, nrecv + k); fflush(stdout); + MPI_Wait (rreqs + nrecv + k, rstats + nrecv + k); + + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_GET_CB_RECV_REP, NC_CHK_TIMER_GET_CB_UNPACK_REP) + + // Pack data + // printf("Rank: %d, pack\n", ncchkp->rank); fflush(stdout); + packoff = 0; + CHK_ERR_UNPACK (rbufs[nrecv + k], overlapsize, &packoff, buf, 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + k++; + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // printf("Rank: %d, wait_final\n", ncchkp->rank); fflush(stdout); + // Wait for all send replies + // printf("Rank: %d, CHK_ERR_WAITALL_send(%d, %d)\n", ncchkp->rank, nrecv, nsend); + // fflush(stdout); + CHK_ERR_WAITALL (nrecv, sreqs + nsend, sstats + nsend); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + + // printf("Rank: %d, exiting\n", ncchkp->rank); fflush(stdout); + +err_out:; + + // Free buffers + NCI_Free (rcnt_local); + + NCI_Free (rids); + + NCI_Free (tsize); + + NCI_Free (ostart); + + for (i = 0; i < nsend + nrecv; i++) { + NCI_Free (sbufs[i]); + NCI_Free (rbufs[i]); + } + NCI_Free (sreqs); + NCI_Free (sstats); + NCI_Free (sbufs); + NCI_Free (rreqs); + NCI_Free (rstats); + NCI_Free (rbufs); + NCI_Free (rsizes); + + if (cbuf != NULL) { NCI_Free (cbuf); } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + + return err; +} + +int ncchkioi_get_var_cb_proc (NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset 
*start,
+                               const MPI_Offset *count,
+                               const MPI_Offset *stride,
+                               void *buf) {
+    int err=NC_NOERR;
+    int i, j, k;
+    int cid, cown;  // Chunk iterator
+
+    MPI_Offset *ostart = NULL, *osize;
+    int *tsize = NULL, *tssize, *tstart, *tssizep, *tstartp;  // Size for sub-array type
+    MPI_Offset *citr;  // Chunk iterator
+
+    int *rcnt_local = NULL, *rcnt_all;  // Number of processes that write to each proc
+
+    int rrange_local[2], rrange_all[2];  // Lowest and highest chunk id accessed
+
+    int overlapsize;    // Size of the overlapping region of request and chunk
+    char *tbuf = NULL;  // Intermediate buffer
+
+    int packoff;         // Pack offset
+    MPI_Datatype ptype;  // Pack datatype
+
+    int nread;         // # chunks to read from file
+    int *rids = NULL;  // Ids of chunks to read from file
+
+    int nsend, nrecv;  // Number of sends and receives
+    MPI_Request *sreq = NULL, *rreq, *sreq_re, *rreq_re;  // Send and recv requests
+    MPI_Status *sstat = NULL, rstat, *sstat_re;           // Send and recv statuses
+    char **sbuf = NULL, **rbuf, **sbufp, **rbufp, **sbuf_re, **rbuf_re;  // Send and recv buffers
+    int *rsize, *ssize = NULL, *rsize_re, *ssize_re;  // Send/recv size of each message
+    int *sdst = NULL;  // Send destination of each message
+    int *smap = NULL;
+    MPI_Message rmsg;  // Receive message
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT)
+
+    // Allocate buffering for write count
+    rcnt_local = (int *)NCI_Malloc (sizeof (int) * (ncchkp->np * 2 + varp->nchunk * 1));
+    CHK_PTR(rcnt_local)
+    rcnt_all = rcnt_local + ncchkp->np;
+    smap     = rcnt_all + ncchkp->np;
+
+    // Allocate buffering for overlapping index
+    tsize = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3);
+    CHK_PTR(tsize)
+    tssize = tsize + varp->ndim;
+    tstart = tssize + varp->ndim;
+    ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3);
+    CHK_PTR(ostart)
+    osize = ostart + varp->ndim;
+
+    // Chunk iterator
+    citr = osize + varp->ndim;
+
+    // We need to calculate the message size for each chunk
+    // This is just for allocating the send buffers
+    // We do so by iterating through all requests and all chunks they cover
+    // If we are not the owner of a chunk, we need to send a message
+    memset (rcnt_local, 0, sizeof (int) * (ncchkp->np + varp->nchunk));
+    nsend = 0;
+
+    // Count the total number of messages and build a map from accessed chunks to the list of comm
+    // data structures
+    rrange_local[0] = varp->nchunk;
+    rrange_local[1] = 0;
+    ncchkioi_chunk_itr_init (varp, start, count, citr, &cid);  // Initialize chunk iterator
+    do {
+        // Chunk owner
+        cown = varp->chunk_owner[cid];
+
+        // Mapping to skip list of send requests
+        if (rcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; }
+        rcnt_local[cown] = 1;  // Need to send a message if not the owner
+
+        // Record the lowest and highest chunk accessed
+        if (rrange_local[0] > cid) { rrange_local[0] = cid; }
+        if (rrange_local[1] < cid) { rrange_local[1] = cid; }
+    } while (ncchkioi_chunk_itr_next (varp, start, count, citr, &cid));
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_INIT)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SYNC)
+
+    // Sync the number of messages of each chunk and the access range
+    CHK_ERR_ALLREDUCE (rcnt_local, rcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm);
+    nrecv = rcnt_all[ncchkp->rank] -
+            rcnt_local[ncchkp->rank];  // We don't need to receive requests from self
+
+    rrange_local[1] *= -1;
+    CHK_ERR_ALLREDUCE (rrange_local, rrange_all, 2, MPI_INT, MPI_MIN, ncchkp->comm);
+    rrange_all[1] *= -1;
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SYNC)
+    NC_CHK_TIMER_START
(NC_CHK_TIMER_GET_CB_PACK_REQ) + + // Allocate data structure for messaging + sbuf = (char **)NCI_Malloc (sizeof (char *) * (2 * nsend + nrecv)); + CHK_PTR (sbuf) + ssize = (int *)NCI_Malloc (sizeof (int) * (nsend * 2 + nrecv * 1)); + CHK_PTR (ssize) + sdst = ssize + (nsend + nrecv); + sreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv)); + CHK_PTR (sreq) + sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nsend + nrecv)); + CHK_PTR (sstat) + + rbuf = (char **)NCI_Malloc (sizeof (char *) * (nsend + nrecv * 2)); + CHK_PTR (rbuf) + rsize = (int *)NCI_Malloc (sizeof (int) * (nsend + nrecv)); + CHK_PTR (rsize) + rreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv)); + CHK_PTR (rreq) + + sbuf_re = sbuf + nsend; + sbufp = sbuf_re + nrecv; + ssize_re = ssize + nsend; + sreq_re = sreq + nsend; + sstat_re = sstat + nsend; + + rbuf_re = rbuf + nrecv; + rbufp = rbuf_re + nsend; + rsize_re = rsize + nrecv; + rreq_re = rreq + nrecv; + + // Count size of each request + memset (ssize, 0, sizeof (int) * nsend); + memset (rsize_re, 0, sizeof (int) * nsend); + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + sdst[j] = cown; // Record a reverse map by the way + + // Count overlap + overlapsize = varp->esize; + for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ssize[j] += sizeof (int) * (varp->ndim * 2 + 1); + rsize_re[j] += overlapsize; + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + // Allocate buffer for send + for (i = 0; i < nsend; i++) { + ssize[i] += sizeof (int); + sbuf[i] = sbufp[i] = (char *)NCI_Malloc (ssize[i]); + CHK_PTR (sbuf[i]) + *((int *)sbufp[i]) = rsize_re[i]; + sbufp[i] += sizeof (int); + rbuf_re[i] = (char *)NCI_Malloc (rsize_re[i]); + CHK_PTR (rbuf_re[i]) + } + + // Pack requests + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + + // Metadata + *((int *)sbufp[j]) = cid; + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + for (i = 0; i < varp->ndim; i++) { + tstartp[i] = (int)(ostart[i] - citr[i]); + tssizep[i] = (int)osize[i]; + } + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Post send + for (i = 0; i < nsend; i++) { + CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Post receive + for (i = 0; i < nsend; i++) { + CHK_ERR_IRECV (rbuf_re[i], rsize_re[i], MPI_BYTE, sdst[i], 1, ncchkp->comm, rreq_re + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Post recv + for (i = 0; i < nrecv; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat); + CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i); + + // Allocate buffer + rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]); + CHK_PTR (rbuf[i]) + + // Post irecv + 
CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT) + + // We need to prepare chunk in the chunk cache + // For chunks not yet allocated, we need to read them from file collectively + // We collect chunk id of those chunks + // Calculate number of recv request + // This is for all the chunks + for (j = 0; j < varp->nmychunk && varp->mychunks[j] < rrange_all[0]; j++) + ; + for (k = j; k < varp->nmychunk && varp->mychunks[k] <= rrange_all[1]; k++) + ; + rids = (int *)NCI_Malloc (sizeof (int) * (k - j)); + nread = 0; + for (i = j; i < k; i++) { + cid = varp->mychunks[i]; + // printf("checking chunk %d, size is %d\n",cid, varp->chunk_index[cid].len); + if (varp->chunk_cache[cid] == NULL) { + // err = ncchkioi_cache_alloc(ncchkp, varp->chunksize, varp->chunk_cache + cid); + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; /* printf("chunk %d need read\n",cid); */ } + } else { + // ncchkioi_cache_visit(ncchkp, varp->chunk_cache[cid]); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO_INIT) + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB) // I/O time count separately + +#ifdef PNETCDF_PROFILING + MPI_Barrier (ncchkp->comm); +#endif + // Decompress chunks into chunk cache + err = ncchkioi_load_var (ncchkp, varp, nread, rids); + CHK_ERR + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SELF) + + tbuf = (char *)NCI_Malloc (varp->chunksize); + + // Handle our own data + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from chunk cache to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, tbuf, varp->chunksize, &packoff, + ncchkp->comm); + MPI_Type_free (&ptype); + overlapsize = packoff; + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, buf, 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SELF) + + // Handle incoming requests + for (i = 0; i < varp->ndim; i++) { tsize[i] = varp->chunkdim[i]; } + for (i = 0; i < nrecv; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Will wait any provide any benefit? 
+        MPI_Waitany (nrecv, rreq, &j, &rstat);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ)
+
+        packoff     = 0;
+        ssize_re[j] = *((int *)rbufp[j]);
+        rbufp[j] += sizeof (int);
+        sbuf_re[j] = (char *)NCI_Malloc (ssize_re[j]);
+        CHK_PTR (sbuf_re[j])
+        while (rbufp[j] < rbuf[j] + rsize[j]) {
+            NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ)
+
+            // Metadata
+            cid = *((int *)rbufp[j]);
+            rbufp[j] += sizeof (int);
+            tstartp = (int *)rbufp[j];
+            rbufp[j] += sizeof (int) * varp->ndim;
+            tssizep = (int *)rbufp[j];
+            rbufp[j] += sizeof (int) * varp->ndim;
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ)
+            NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REP)
+
+            // Pack type
+            CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C,
+                                          varp->etype, &ptype);
+            CHK_ERR_TYPE_COMMIT (&ptype);
+
+            // Data
+            CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, sbuf_re[j], ssize_re[j], &packoff,
+                          ncchkp->comm);
+            MPI_Type_free (&ptype);
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REP)
+        }
+
+        NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ)
+
+        // Send response
+        CHK_ERR_ISEND (sbuf_re[j], packoff, MPI_BYTE, rstat.MPI_SOURCE, 1, ncchkp->comm,
+                       sreq_re + j);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ)
+    }
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ)
+
+    // Wait for all requests
+    CHK_ERR_WAITALL (nsend, sreq, sstat);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ)
+
+    // Handle replies
+    for (i = 0; i < varp->ndim; i++) { tsize[i] = count[i]; }
+    for (i = 0; i < nsend; i++) {
+        NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP)
+
+        // Would MPI_Waitany provide any benefit here?
+        MPI_Waitany (nsend, rreq_re, &j, &rstat);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REP)
+
+        sbufp[j] = sbuf[j] + sizeof (int);  // Skip the reply size
+        packoff  = 0;
+        while (packoff < rsize_re[j]) {
+            // Retrieve metadata from the request we sent
+            cid = *((int *)sbufp[j]);
+            sbufp[j] += sizeof (int);
+            tstartp = (int *)sbufp[j];
+            sbufp[j] += sizeof (int) * varp->ndim;
+            tssizep = (int *)sbufp[j];
+            sbufp[j] += sizeof (int) * varp->ndim;
+
+            // Bring back the request
+            get_chunk_itr (varp, cid, citr);
+            for (k = 0; k < varp->ndim; k++) { tstartp[k] += (int)(citr[k] - start[k]); }
+
+            // Pack type
+            CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C,
+                                          varp->etype, &ptype);
+            CHK_ERR_TYPE_COMMIT (&ptype);
+
+            // Unpack data
+            CHK_ERR_UNPACK (rbuf_re[j], rsize_re[j], &packoff, buf, 1, ptype, ncchkp->comm);
+            MPI_Type_free (&ptype);
+        }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REP)
+    }
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP)
+
+    // Wait for all responses
+    CHK_ERR_WAITALL (nrecv, sreq_re, sstat_re);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP)
+
+err_out:;
+
+    // Free buffers
+    NCI_Free (rcnt_local);
+
+    NCI_Free (rids);
+
+    NCI_Free (tsize);
+
+    NCI_Free (ostart);
+
+    NCI_Free (sreq);
+    NCI_Free (sstat);
+    NCI_Free (ssize);
+    for (i = 0; i < nsend + nrecv; i++) {
+        NCI_Free (sbuf[i]);
+        NCI_Free (rbuf[i]);
+    }
+    NCI_Free (sbuf);
+
+    NCI_Free (rreq);
+    NCI_Free (rbuf);
+    NCI_Free (rsize);
+
+    if (tbuf != NULL) { NCI_Free (tbuf); }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB)
+
+    return err;
+}
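Both callbacks above move data with the same two-hop idiom: build an `MPI_Type_create_subarray` view of the chunk, `MPI_Pack` it into a contiguous scratch buffer, then unpack through a second subarray view of the destination. A standalone sketch of that idiom follows (illustrative only; the driver wraps each call in error-checking macros and timers, and MPI-3 const bindings are assumed):

```
#include <mpi.h>

/* Copy an ndim-dimensional block of `etype` elements out of a chunk
 * buffer into the matching position of a destination buffer.
 * cdim/cstart: chunk dimensions and block offset inside the chunk.
 * udim/ustart: destination dimensions and block offset inside it.
 * bsize: block size; scratch must be large enough for the packed block. */
static void copy_block(int ndim, MPI_Datatype etype,
                       const void *chunk, const int *cdim, const int *cstart,
                       void *dst, const int *udim, const int *ustart,
                       const int *bsize, void *scratch, int scratch_len,
                       MPI_Comm comm)
{
    MPI_Datatype ptype;
    int pos = 0, packed;

    /* Hop 1: strided chunk region -> contiguous scratch */
    MPI_Type_create_subarray(ndim, cdim, bsize, cstart, MPI_ORDER_C,
                             etype, &ptype);
    MPI_Type_commit(&ptype);
    MPI_Pack(chunk, 1, ptype, scratch, scratch_len, &pos, comm);
    MPI_Type_free(&ptype);
    packed = pos;

    /* Hop 2: contiguous scratch -> strided destination region */
    MPI_Type_create_subarray(ndim, udim, bsize, ustart, MPI_ORDER_C,
                             etype, &ptype);
    MPI_Type_commit(&ptype);
    pos = 0;
    MPI_Unpack(scratch, packed, &pos, dst, 1, ptype, comm);
    MPI_Type_free(&ptype);
}
```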
diff --git a/src/drivers/ncchunkio/ncchkioi_get_varn.c b/src/drivers/ncchunkio/ncchkioi_get_varn.c
new file mode 100644
index 000000000..40ae859ae
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_get_varn.c
@@ -0,0 +1,942 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+/*
+ * This file implements the following PnetCDF APIs.
+ *
+ * ncmpi_get_var_all()        : dispatcher->get_var()
+ * ncmpi_put_var_all()        : dispatcher->put_var()
+ * ncmpi_get_var<kind>_all()  : dispatcher->get_var()
+ * ncmpi_put_var<kind>_all()  : dispatcher->put_var()
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ncchkio_internal.h"
+
+int ncchkioi_get_varn_cb_chunk (NC_chk *ncchkp,
+                                NC_chk_var *varp,
+                                int nreq,
+                                MPI_Offset *const *starts,
+                                MPI_Offset *const *counts,
+                                MPI_Offset *const *strides,
+                                void **bufs) {
+    int err=NC_NOERR;
+    int i, j, k, l;
+    int cid, req;  // Chunk iterator
+
+    MPI_Offset *ostart, *osize;
+    int *tsize, *tssize, *tstart, *tsizep, *tssizep, *tstartp;  // Size for sub-array type
+    MPI_Offset *citr;  // Chunk iterator
+
+    int *rcnt_local, *rcnt_all;  // Number of processes that write to each chunk
+
+    int overlapsize;  // Size of the overlapping region of request and chunk
+    int overlapcnt;
+    char *cbuf = NULL;  // Intermediate contiguous buffer
+
+    int packoff, unpackoff;  // Pack offsets
+    MPI_Datatype ptype;      // Pack datatype
+
+    int nread;  // # chunks to read from file
+    int *rids;  // Ids of chunks to read from file
+
+    int nsend, nrecv;  // Number of sends and receives
+    MPI_Request *sreqs, *rreqs;   // Send and recv requests
+    MPI_Status *sstats, *rstats;  // Send and recv statuses
+    char **sbufs, **rbufs;        // Send and recv buffers
+    int *rsizes;       // Recv size of each message
+    MPI_Message rmsg;  // Receive message
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT)
+
+    // Allocate buffering for write count
+    rcnt_local = (int *)NCI_Malloc (sizeof (int) * varp->nchunk * 2);
+    rcnt_all   = rcnt_local + varp->nchunk;
+
+    // Allocate buffering for overlapping index
+    tsize  = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3);
+    tssize = tsize + varp->ndim;
+    tstart = tssize + varp->ndim;
+    ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3);
+    osize  = ostart + varp->ndim;
+
+    // Chunk iterator
+    citr = osize + varp->ndim;
+
+    // We need to calculate the message size for each chunk
+    // This is just for allocating the send buffers
+    // We do so by iterating through all requests and all chunks they cover
+    // If we are not the owner of a chunk, we need to send a message
+    memset (rcnt_local, 0, sizeof (int) * varp->nchunk);
+    nsend = 0;
+    for (req = 0; req < nreq; req++) {
+        // Iterate through chunks
+        ncchkioi_chunk_itr_init (varp, starts[req], counts[req], citr, &cid);
+        do {
+            if (varp->chunk_owner[cid] != ncchkp->rank && rcnt_local[cid] == 0) {
+                // Count the number of messages we need to send
+                nsend++;
+            }
+
+            rcnt_local[cid] = 1;
+        } while (ncchkioi_chunk_itr_next (varp, starts[req], counts[req], citr, &cid));
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_INIT)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SYNC)
+
+    // Sync the number of messages of each chunk
+    CHK_ERR_ALLREDUCE (rcnt_local, rcnt_all, varp->nchunk, MPI_INT, MPI_SUM, ncchkp->comm);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SYNC)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT)
+
+    // We need to prepare the chunks in the chunk cache
+    // For chunks not yet allocated, we need to read them from file collectively
+    // We collect the chunk ids of those chunks
+    // Calculate the number of recv requests
+    // This is for all the chunks
+    rids  = (int *)NCI_Malloc (sizeof (int) * varp->nmychunk);
+    nread = 0;
+    nrecv = 0;
+    for (i = 0;
i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + // We don't need message for our own data + nrecv += rcnt_all[cid] - rcnt_local[cid]; + // Count number of chunks we need to prepare + // We read only chunks that is required + if (rcnt_all[cid] || rcnt_local[cid]) { + if (varp->chunk_cache[cid] == NULL) { + // err = ncchkioi_cache_alloc(ncchkp, varp->chunksize, varp->chunk_cache + cid); + // varp->chunk_cache[cid] = (NC_chk_cache*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; } + } else { + // ncchkioi_cache_visit(ncchkp, varp->chunk_cache[cid]); + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO_INIT) + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB) // I/O time count separately + +#ifdef PNETCDF_PROFILING + MPI_Barrier (ncchkp->comm); +#endif + // Decompress chunks into chunk cache + err = ncchkioi_load_var (ncchkp, varp, nread, rids); + CHK_ERR + + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + + // Allocate buffer for send and recv + // We need to accept nrecv requests and receive nsend of replies + rreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nrecv + nsend)); + rstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nrecv + nsend)); + rbufs = (char **)NCI_Malloc (sizeof (char *) * (nrecv + nsend)); + rsizes = (int *)NCI_Malloc (sizeof (int) * (nrecv + nsend)); + // We need to send nsend requests and reply nrecv of requests + sbufs = (char **)NCI_Malloc (sizeof (char *) * (nrecv + nsend)); + sreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nrecv + nsend)); + sstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nrecv + nsend)); + + // Post send + k = l = 0; + for (cid = 0; cid < varp->nchunk; cid++) { + if (varp->chunk_owner[cid] == ncchkp->rank) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // We are the owner of the chunk + // Receive data from other process + for (j = 0; j < rcnt_all[cid] - rcnt_local[cid]; j++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, cid, ncchkp->comm, &rmsg, rstats); + CHK_ERR_GET_COUNT (rstats, MPI_BYTE, rsizes + k); + + // Allocate buffer + rbufs[k] = (char *)NCI_Malloc (rsizes[k]); + + // Post irecv + CHK_ERR_IMRECV (rbufs[k], rsizes[k], MPI_BYTE, &rmsg, rreqs + k); + k++; + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + } else { + // We have some request to send + if (rcnt_local[cid] > 0) { + get_chunk_itr (varp, cid, citr); + rsizes[nrecv + l] = overlapcnt = 0; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REQ) + + // Calculate send buffer size + for (req = 0; req < nreq; req++) { + // Calculate chunk overlap + overlapsize = + get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize); + + rsizes[nrecv + l] += overlapsize; + + if (overlapsize > 0) { overlapcnt++; } + } + + // Allocate buffer + // Faster to request the entire chunk + if (rsizes[nrecv + l] >= varp->chunksize) { + rsizes[nrecv + l] = varp->chunksize; + overlapcnt = 1; + } + sbufs[l] = (char *)NCI_Malloc (sizeof (int) * (overlapcnt * varp->ndim * 2) + 1); + rbufs[nrecv + l] = (char *)NCI_Malloc (rsizes[nrecv + l]); + + // Metadata + *((int *)sbufs[l]) = rsizes[nrecv + l]; + packoff = sizeof (int); + if (rsizes[nrecv + l] == + varp->chunksize) { // Request the entire chunk directly if need more than that + tstartp = (int *)(sbufs[l] + packoff); + packoff += varp->ndim * sizeof (int); + tsizep = (int *)(sbufs[l] + packoff); + 
packoff += varp->ndim * sizeof (int); + memset (tstartp, 0, sizeof (int) * varp->ndim); + memcpy (tsizep, varp->chunkdim, sizeof (int) * varp->ndim); + } else { + for (req = 0; req < nreq; req++) { + // Calculate chunk overlap + overlapsize = + get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize); + + if (overlapsize > 0) { + tstartp = (int *)(sbufs[l] + packoff); + packoff += varp->ndim * sizeof (int); + tsizep = (int *)(sbufs[l] + packoff); + packoff += varp->ndim * sizeof (int); + // Metadata + for (j = 0; j < varp->ndim; j++) { + tstartp[j] = (int)(ostart[j] - citr[j]); + tsizep[j] = (int)osize[j]; + } + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Send request + CHK_ERR_ISEND (sbufs[l], packoff, MPI_BYTE, varp->chunk_owner[cid], cid, + ncchkp->comm, sreqs + l); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // printf("Rank: %d, CHK_ERR_IRECV(%d, %d, %d, %d)\n", ncchkp->rank, overlapsize, + // varp->chunk_owner[cid], cid + 1024, nrecv + k); fflush(stdout); + CHK_ERR_IRECV (rbufs[l + nrecv], rsizes[nrecv + l], MPI_BYTE, + varp->chunk_owner[cid], cid + 1024, ncchkp->comm, rreqs + nrecv + l); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + + l++; + } + } + } + + // Allocate intermediate buffer + cbuf = (char *)NCI_Malloc (varp->chunksize); + + // For each chunk we own, we need to reply to incoming reqeust + k = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SELF) + + // Handle our own data first if we have any + if (rcnt_local[cid] > 0) { + // Convert chunk id to iterator + get_chunk_itr (varp, cid, citr); + + for (req = 0; req < nreq; req++) { + // Calculate overlapping region + overlapsize = + get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize); + + if (overlapsize > 0) { + // Pack type from chunk buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, cbuf, varp->chunksize, + &packoff, ncchkp->comm); + overlapsize = packoff; + MPI_Type_free (&ptype); + + // Pack type from (contiguous) intermediate buffer to user buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - starts[req][j]); + tsize[j] = (int)counts[req][j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into user buffer + packoff = 0; + CHK_ERR_UNPACK (cbuf, overlapsize, &packoff, bufs[req], 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SELF) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Wait for all send requests related to this chunk + // We remove the impact of -1 mark in rcnt_local[cid] + CHK_ERR_WAITALL (rcnt_all[cid] - rcnt_local[cid], rreqs + k, rstats + k); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Now, it is time to process data from other processes + for (j = 0; j < varp->ndim; j++) { tsize[j] = varp->chunkdim[j]; } + // Process data received + for (j = k; j < k + rcnt_all[cid] - 
rcnt_local[cid]; j++) { + packoff = 0; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Allocate buffer + overlapsize = *((int *)rbufs[j]); + unpackoff = sizeof (int); + sbufs[j + nsend] = (char *)NCI_Malloc (overlapsize); // For reply + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Pack data + while (unpackoff < rsizes[j]) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Get metadata + tstartp = (int *)(rbufs[j] + unpackoff); + unpackoff += varp->ndim * sizeof (int); + tssizep = (int *)(rbufs[j] + unpackoff); + unpackoff += varp->ndim * sizeof (int); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REP) + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, sbufs[j + nsend], overlapsize, + &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REP) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Send reply + // printf("Rank: %d, CHK_ERR_ISEND(%d, %d, %d, %d)\n", ncchkp->rank, packoff, + // varp->chunk_owner[cid], cid + 1024, k + nsend); fflush(stdout); + CHK_ERR_ISEND (sbufs[j + nsend], packoff, MPI_BYTE, rstats[j].MPI_SOURCE, cid + 1024, + ncchkp->comm, sreqs + j + nsend); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + } + k += rcnt_all[cid] - rcnt_local[cid]; + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Wait for all request sent + // printf("Rank: %d, CHK_ERR_WAITALL_send(%d, %d)\n", ncchkp->rank, nsend, 0); fflush(stdout); + CHK_ERR_WAITALL (nsend, sreqs, sstats); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Receive replies from the owners and update the user buffer + k = 0; + for (cid = 0; cid < varp->nchunk; cid++) { + if (rcnt_local[cid] > 0 && varp->chunk_owner[cid] != ncchkp->rank) { + get_chunk_itr (varp, cid, citr); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Wait for reply + // printf("Rank: %d, MPI_Wait_recv(%d)\n", ncchkp->rank, nrecv + k); fflush(stdout); + MPI_Wait (rreqs + nrecv + k, rstats + nrecv + k); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + packoff = 0; + for (req = 0; req < nreq; req++) { + // Calculate chunk overlap + overlapsize = + get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize); + + if (overlapsize > 0) { + // Pack type from recv buffer to user buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - starts[req][j]); + tsize[j] = (int)counts[req][j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_UNPACK (rbufs[nrecv + k], rsizes[nrecv + k], &packoff, bufs[req], 1, + ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + } + k++; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REP) + } + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Wait for all send replies + CHK_ERR_WAITALL (nrecv, sreqs + nsend, sstats + nsend); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + +err_out:; + + // Free buffers + NCI_Free (rcnt_local); + + NCI_Free (rids); + + NCI_Free (tsize); + + NCI_Free (ostart); + + for (i = 0; i < nsend + nrecv; i++) { + NCI_Free (sbufs[i]); + NCI_Free (rbufs[i]); + } + NCI_Free (sreqs); + NCI_Free 
(sstats);
+    NCI_Free (sbufs);
+    NCI_Free (rreqs);
+    NCI_Free (rstats);
+    NCI_Free (rbufs);
+    NCI_Free (rsizes);
+
+    if (cbuf != NULL) { NCI_Free (cbuf); }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB)
+
+    return err;
+}
+
+int ncchkioi_get_varn_cb_proc (NC_chk *ncchkp,
+                               NC_chk_var *varp,
+                               int nreq,
+                               MPI_Offset *const *starts,
+                               MPI_Offset *const *counts,
+                               void **bufs) {
+    int err=NC_NOERR;
+    int i, j, k;
+    int cid, cown;  // Chunk iterator
+    int req, **reqs;
+
+    MPI_Offset *ostart, *osize;
+    int *tsize, *tssize, *tstart, *tssizep, *tstartp;  // Size for sub-array type
+    MPI_Offset *citr;  // Chunk iterator
+
+    int *rcnt_local, *rcnt_all;  // Number of processes that write to each proc
+
+    int rrange_local[2], rrange_all[2];  // Lowest and highest chunk id accessed
+
+    int overlapsize;    // Size of the overlapping region of request and chunk
+    char *tbuf = NULL;  // Intermediate buffer
+
+    int packoff;         // Pack offset
+    MPI_Datatype ptype;  // Pack datatype
+
+    int nread;  // # chunks to read from file
+    int *rids;  // Ids of chunks to read from file
+
+    int nsend, nrecv;  // Number of sends and receives
+    MPI_Request *sreq, *rreq, *sreq_re, *rreq_re;  // Send and recv requests
+    MPI_Status *sstat, rstat, *sstat_re;           // Send and recv statuses
+    char **sbuf, **rbuf, **sbufp, **rbufp, **sbuf_re, **rbuf_re;  // Send and recv buffers
+    int *rsize, *ssize, *rsize_re, *ssize_re;  // Send/recv size of each message
+    int *sdst;  // Send destination of each message
+    int *smap;
+    MPI_Message rmsg;  // Receive message
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT)
+
+    // Allocate buffering for write count
+    rcnt_local = (int *)NCI_Malloc (sizeof (int) * (ncchkp->np * 2 + varp->nchunk * 1));
+    rcnt_all   = rcnt_local + ncchkp->np;
+    smap       = rcnt_all + ncchkp->np;
+
+    // Allocate buffering for overlapping index
+    tsize  = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3);
+    tssize = tsize + varp->ndim;
+    tstart = tssize + varp->ndim;
+    ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3);
+    osize  = ostart + varp->ndim;
+
+    // Chunk iterator
+    citr = osize + varp->ndim;
+
+    // We need to calculate the message size for each chunk
+    // This is just for allocating the send buffers
+    // We do so by iterating through all requests and all chunks they cover
+    // If we are not the owner of a chunk, we need to send a message
+    memset (rcnt_local, 0, sizeof (int) * (ncchkp->np + varp->nchunk));
+    nsend = 0;
+
+    // Count the total number of messages and build a map from accessed chunks to the list of comm
+    // data structures
+    rrange_local[0] = varp->nchunk;
+    rrange_local[1] = 0;
+    for (req = 0; req < nreq; req++) {
+        ncchkioi_chunk_itr_init (varp, starts[req], counts[req], citr,
+                                 &cid);  // Initialize chunk iterator
+        do {
+            // Chunk owner
+            cown = varp->chunk_owner[cid];
+
+            // Mapping to skip list of send requests
+            if (rcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; }
+            rcnt_local[cown] = 1;  // Need to send a message if not the owner
+
+            // Record the lowest and highest chunk accessed
+            if (rrange_local[0] > cid) { rrange_local[0] = cid; }
+            if (rrange_local[1] < cid) { rrange_local[1] = cid; }
+        } while (ncchkioi_chunk_itr_next (varp, starts[req], counts[req], citr, &cid));
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_INIT)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SYNC)
+
+    // Sync the number of messages of each chunk
+    CHK_ERR_ALLREDUCE (rcnt_local, rcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm);
+    nrecv = rcnt_all[ncchkp->rank] -
+            rcnt_local[ncchkp->rank];  // We don't need to receive requests from self
+
+    rrange_local[1] *= -1;
+    CHK_ERR_ALLREDUCE (rrange_local, rrange_all, 2, MPI_INT, MPI_MIN, ncchkp->comm);
+    rrange_all[1] *= -1;
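The sign flip around that second all-reduce is a small trick worth noting: negating the upper bound lets a single `MPI_MIN` reduction deliver both the global minimum and maximum at once, since min(-x) = -max(x). In isolation (the function name is just illustrative):

```
#include <mpi.h>

/* One MPI_Allreduce for both bounds: negate the max field, reduce with
 * MPI_MIN, then negate back, because min(-x) == -max(x). */
void range_allreduce(int lo, int hi, int out[2], MPI_Comm comm)
{
    int local[2];
    local[0] = lo;   /* global min of lo */
    local[1] = -hi;  /* global max of hi, via min of -hi */
    MPI_Allreduce(local, out, 2, MPI_INT, MPI_MIN, comm);
    out[1] = -out[1];
}
```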
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SYNC)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT)
+
+    // We need to prepare the chunks in the chunk cache
+    // For chunks not yet allocated, we need to read them from file collectively
+    // We collect the chunk ids of those chunks
+    // Calculate the number of recv requests
+    // This is for all the chunks
+    for (j = 0; j < varp->nmychunk && varp->mychunks[j] < rrange_all[0]; j++)
+        ;
+    for (k = j; k < varp->nmychunk && varp->mychunks[k] <= rrange_all[1]; k++)
+        ;
+    rids  = (int *)NCI_Malloc (sizeof (int) * (k - j));
+    nread = 0;
+    for (i = j; i < k; i++) {
+        cid = varp->mychunks[i];
+        if (varp->chunk_cache[cid] == NULL) {
+            // err = ncchkioi_cache_alloc(ncchkp, varp->chunksize, varp->chunk_cache + cid);
+            // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize);
+            if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; }
+        } else {
+            // ncchkioi_cache_visit(ncchkp, varp->chunk_cache[cid]);
+        }
+    }
+
+    NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB)  // I/O time counted separately
+
+#ifdef PNETCDF_PROFILING
+    MPI_Barrier (ncchkp->comm);
+#endif
+    // Decompress chunks into the chunk cache
+    err = ncchkioi_load_var (ncchkp, varp, nread, rids);
+    CHK_ERR
+    // Increase the batch number to indicate that allocated chunk buffers can be freed for future allocations
+    (ncchkp->cache_serial)++;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REQ)
+
+    // Allocate data structures for messaging
+    sbuf  = (char **)NCI_Malloc (sizeof (char *) * (nsend * 2 + nrecv));
+    ssize = (int *)NCI_Malloc (sizeof (int) * (nsend * 2 + nrecv * 1));
+    sdst  = ssize + (nsend + nrecv);
+    sreq  = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv));
+    sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nsend + nrecv));
+    reqs  = (int **)NCI_Malloc (sizeof (int *) * nsend);
+
+    rbuf  = (char **)NCI_Malloc (sizeof (char *) * (nsend + nrecv * 2));
+    rsize = (int *)NCI_Malloc (sizeof (int) * (nsend + nrecv));
+    rreq  = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv));
+
+    sbuf_re  = sbuf + nsend;
+    sbufp    = sbuf_re + nrecv;
+    ssize_re = ssize + nsend;
+    sreq_re  = sreq + nsend;
+    sstat_re = sstat + nsend;
+
+    rbuf_re  = rbuf + nrecv;
+    rbufp    = rbuf_re + nsend;
+    rsize_re = rsize + nrecv;
+    rreq_re  = rreq + nrecv;
+
+    // Count the size of each request
+    memset (ssize, 0, sizeof (int) * nsend);
+    memset (rsize_re, 0, sizeof (int) * nsend);
+    memset (rcnt_local, 0, sizeof (int) * nsend);
+    for (req = 0; req < nreq; req++) {
+        ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart,
+                                    osize);  // Initialize chunk iterator
+        do {
+            // Chunk owner
+            cown = varp->chunk_owner[cid];
+            if (cown != ncchkp->rank) {
+                j       = smap[cown];
+                sdst[j] = cown;  // Record a reverse map by the way
+
+                // Count the overlap
+                overlapsize = varp->esize;
+                for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; }
+                ssize[j] += sizeof (int) * (varp->ndim * 2 + 1);
+                rsize_re[j] += overlapsize;
+                rcnt_local[j]++;
+            }
+        } while (
+            ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize));
+    }
+
+    // Allocate buffers for send
+    for (i = 0; i < nsend; i++) {
+        ssize[i] += sizeof (int);
+        sbuf[i] = sbufp[i] = (char *)NCI_Malloc (ssize[i]);
+        *((int *)sbufp[i]) = rsize_re[i];
+        sbufp[i] += sizeof (int);
+        rbuf_re[i] = (char *)NCI_Malloc (rsize_re[i]);
+        reqs[i]    = (int *)NCI_Malloc (sizeof (int) * rcnt_local[i]);
+    }
+
+    // Pack
requests + memset (rcnt_local, 0, sizeof (int) * nsend); + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + + // Metadata + *((int *)sbufp[j]) = cid; + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + for (i = 0; i < varp->ndim; i++) { + tstartp[i] = (int)(ostart[i] - citr[i]); + tssizep[i] = (int)osize[i]; + } + + // Record source of the request + reqs[j][rcnt_local[j]++] = req; + } + } while ( + ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize)); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Post send + for (i = 0; i < nsend; i++) { + CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Post receive + for (i = 0; i < nsend; i++) { + CHK_ERR_IRECV (rbuf_re[i], rsize_re[i], MPI_BYTE, sdst[i], 1, ncchkp->comm, rreq_re + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Post recv + for (i = 0; i < nrecv; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat); + CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i); + + // Allocate buffer + rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]); + + // Post irecv + CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i); + } + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SELF) + + tbuf = (char *)NCI_Malloc (varp->chunksize); + + // Handle our own data + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from chunk cache to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, tbuf, varp->chunksize, + &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + overlapsize = packoff; + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - starts[req][j]); + tsize[j] = (int)counts[req][j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, bufs[req], 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + } while ( + ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize)); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SELF) + + // Handle incoming requests + for (i = 0; i < varp->ndim; i++) { tsize[i] = varp->chunkdim[i]; } + for (i = 0; i < nrecv; i++) { + 
NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Will wait any provide any benefit? + MPI_Waitany (nrecv, rreq, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + packoff = 0; + ssize_re[j] = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); + sbuf_re[j] = (char *)NCI_Malloc (ssize_re[j]); + while (rbufp[j] < rbuf[j] + rsize[j]) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Metadata + cid = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); + tstartp = (int *)rbufp[j]; + rbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)rbufp[j]; + rbufp[j] += sizeof (int) * varp->ndim; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REP) + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, sbuf_re[j], ssize_re[j], &packoff, + ncchkp->comm); + MPI_Type_free (&ptype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REP) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Send Response + CHK_ERR_ISEND (sbuf_re[j], packoff, MPI_BYTE, rstat.MPI_SOURCE, 1, ncchkp->comm, + sreq_re + j); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Wait for all request + CHK_ERR_WAITALL (nsend, sreq, sstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Handle reply + memset (rcnt_local, 0, sizeof (int) * nsend); + for (i = 0; i < nsend; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Will wait any provide any benefit? + MPI_Waitany (nsend, rreq_re, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + sbufp[j] = sbuf[j] + sizeof (int); // Skip reply size + packoff = 0; + while (packoff < rsize_re[j]) { + // Retrieve metadata from the request we sent + cid = *((int *)sbufp[j]); + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + + // Bring up the request + req = reqs[j][rcnt_local[j]++]; + get_chunk_itr (varp, cid, citr); + for (k = 0; k < varp->ndim; k++) { + tstartp[k] += (int)(citr[k] - starts[req][k]); + tsize[k] = counts[req][k]; + } + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_UNPACK (rbuf_re[j], rsize_re[j], &packoff, bufs[req], 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REP) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Wait for all Response + CHK_ERR_WAITALL (nrecv, sreq_re, sstat_re); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + +err_out:; + + // Free buffers + NCI_Free (rcnt_local); + + NCI_Free (rids); + + NCI_Free (tsize); + + NCI_Free (ostart); + + NCI_Free (sreq); + NCI_Free (sstat); + NCI_Free (ssize); + for (i = 0; i < nsend; i++) { NCI_Free (reqs[i]); } + for (i = 0; i < nsend + nrecv; i++) { + NCI_Free (sbuf[i]); + NCI_Free (rbuf[i]); + } + NCI_Free (sbuf); + NCI_Free (reqs); + + NCI_Free (rreq); + NCI_Free (rbuf); + NCI_Free (rsize); + + if (tbuf != NULL) { NCI_Free (tbuf); } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB) + + return err; +} + +int ncchkioi_get_varn (NC_chk *ncchkp, + NC_chk_var *varp, + int nreq, + 
MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf) { + int i, j; + MPI_Offset rsize; + char *bptr = (char *)buf; + char **bufs; + + // Calculate buffer offset of each request + bufs = (char **)NCI_Malloc (sizeof (char *) * nreq); + for (i = 0; i < nreq; i++) { + bufs[i] = bptr; + rsize = varp->esize; + for (j = 0; j < varp->ndim; j++) { rsize *= counts[i][j]; } + bptr += rsize; + } + + // Collective buffer + switch (ncchkp->comm_unit) { + case NC_CHK_COMM_CHUNK: + ncchkioi_get_varn_cb_chunk (ncchkp, varp, nreq, starts, counts, NULL, (void **)bufs); + break; + case NC_CHK_COMM_PROC: + ncchkioi_get_varn_cb_proc (ncchkp, varp, nreq, starts, counts, (void **)bufs); + break; + } + NCI_Free (bufs); + + return NC_NOERR; +} diff --git a/src/drivers/ncchunkio/ncchkioi_iget.c b/src/drivers/ncchunkio/ncchkioi_iget.c new file mode 100644 index 000000000..148485199 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_iget.c @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_get_var__all() : dispatcher->get_var() + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include "ncchkio_internal.h" + +static inline int +ncchkioi_init_get_req( NC_chk *ncchkp, + NC_chk_req *req, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype) { + NC_chk_var *varp = ncchkp->vars.data + varid; + + // Zero out the request + memset(req, 0, sizeof(NC_chk_req)); + + // Record request + req->starts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*)); + req->start = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * varp->ndim); + req->starts[0] = req->start; + memcpy(req->start, start, sizeof(MPI_Offset) * varp->ndim); + req->counts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*)); + req->count = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * varp->ndim); + req->counts[0] = req->count; + memcpy(req->count, count, sizeof(MPI_Offset) * varp->ndim); + if (stride != NULL){ + req->stride = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * varp->ndim); + memcpy(req->stride, stride, sizeof(MPI_Offset) * varp->ndim); + } + + req->varid = varid; + req->buf = (void*)buf; + req->nreq = 1; + req->buftype = buftype; + if (varp->etype != buftype){ + if (bufcount > 0){ + req->bufcount = bufcount; + } + else{ + int i; + + req->bufcount = 1; + for(i = 0; i < varp->ndim; i++){ + req->bufcount *= count[i]; + } + } + + req->xbuf = (char*)NCI_Malloc(req->bufcount * varp->esize); + } + else{ + req->xbuf = req->buf; + } + + req->xbufs = (char**)NCI_Malloc(sizeof(char*)); + req->xbufs[0] = req->xbuf; + + return NC_NOERR; +} + +int +ncchkioi_iget_var(NC_chk *ncchkp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid) +{ + int err=NC_NOERR; + int req_id; + NC_chk_req req; + + // Init request + err = ncchkioi_init_get_req(ncchkp, &req, varid, start, count, stride, imap, buf, bufcount, buftype); + + // Add to req list + ncchkioi_req_list_add(&(ncchkp->getlist), &req_id); + 
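+    /* Request ids returned to the user are encoded so get and put requests
+     * can share one id space: gets use even ids (req_id * 2) and puts use
+     * odd ids (req_id * 2 + 1); see ncchkioi_iput_var. */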
ncchkp->getlist.reqs[req_id] = req; + + if (reqid != NULL){ + *reqid = req_id * 2; + } + + return err; +} + +static inline int +ncchkioi_init_get_varn_req( NC_chk *ncchkp, + NC_chk_req *req, + int varid, + int nreq, + MPI_Offset *const*starts, + MPI_Offset *const*counts, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype) { + int i, j; + MPI_Offset rsize, boff; + NC_chk_var *varp = ncchkp->vars.data + varid; + + // Zero out the request + memset(req, 0, sizeof(NC_chk_req)); + + // Record request + req->starts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nreq); + req->start = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * varp->ndim * nreq); + for(i = 0; i < nreq; i++){ + req->starts[i] = req->start + i * varp->ndim; + memcpy(req->starts[i], starts[i], sizeof(MPI_Offset) * varp->ndim); + } + req->counts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nreq); + req->count = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * varp->ndim * nreq); + for(i = 0; i < nreq; i++){ + req->counts[i] = req->count + i * varp->ndim; + memcpy(req->counts[i], counts[i], sizeof(MPI_Offset) * varp->ndim); + } + + req->varid = varid; + req->buf = (void*)buf; + req->xbuf = (void*)buf; + req->nreq = nreq; + req->buftype = buftype; + if (varp->etype != buftype){ + if (bufcount > 0){ + req->bufcount = bufcount; + } + else{ + req->bufcount = 0; + for(i = 0; i < nreq; i++){ + rsize = 1; + for(j = 0; j < varp->ndim; j++){ + rsize *= counts[i][j]; + } + req->bufcount += rsize; + } + } + + req->xbuf = (char*)NCI_Malloc(req->bufcount * varp->esize); + } + else{ + req->xbuf = req->buf; + } + + // Calculate buffer for each individual request + req->xbufs = (char**)NCI_Malloc(sizeof(char*) * nreq); + boff = 0; + for(i = 0; i < nreq; i++){ + req->xbufs[i] = (req->xbuf + boff); + + // Advance pointer by size of the request + rsize = varp->esize; + for(j = 0; j < varp->ndim; j++){ + rsize *= counts[i][j]; + } + boff += rsize; + } + + return NC_NOERR; +} + +int +ncchkioi_iget_varn(NC_chk *ncchkp, + int varid, + int nreq, + MPI_Offset * const*starts, + MPI_Offset * const*counts, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid) +{ + int err=NC_NOERR; + int req_id; + NC_chk_req req; + + if (nreq > 1){ + err = ncchkioi_init_get_varn_req(ncchkp, &req, varid, nreq, starts, counts, buf, bufcount, buftype); + } + else{ + err = ncchkioi_init_get_req(ncchkp, &req, varid, starts[0], counts[0], NULL, NULL, buf, bufcount, buftype); + } + + // Add to req list + ncchkioi_req_list_add(&(ncchkp->getlist), &req_id); + ncchkp->getlist.reqs[req_id] = req; + + if (reqid != NULL){ + *reqid = req_id * 2; + } + + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_iget_cb.c b/src/drivers/ncchunkio/ncchkioi_iget_cb.c new file mode 100644 index 000000000..e1aecc978 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_iget_cb.c @@ -0,0 +1,683 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ *
+ * ncmpi_get_var<kind>_all()        : dispatcher->get_var()
+ * ncmpi_get_var<kind>_all()        : dispatcher->get_var()
+ * ncmpi_get_var<kind>_<type>_all() : dispatcher->get_var()
+ * ncmpi_get_var<kind>_<type>_all() : dispatcher->get_var()
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ncchkio_internal.h"
+
+/* Our driver currently can handle only one variable at a time,
+ * so we pack all requests into one large varn request per variable.
+ */
+int ncchkioi_iget_cb_chunk (NC_chk *ncchkp, int nreq, int *reqids, int *stats) {
+    int i, j;
+    int nvar;
+    int vid;      // Iterator for variable id
+    int *varids;
+    int *nreqs;   // Number of reqids in each variable
+    int *nums;    // Number of reqs in each varn
+    int **vreqids;
+    int num, maxnum = 0;
+    MPI_Offset **starts, **counts;
+    char **bufs;
+    NC_chk_req *req;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT)
+
+    // Count the total number of requests per variable for the packed varn request
+    nums  = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt * 2);
+    nreqs = nums + ncchkp->vars.cnt;
+    memset (nums, 0, sizeof (int) * ncchkp->vars.cnt);
+    memset (nreqs, 0, sizeof (int) * ncchkp->vars.cnt);
+    for (i = 0; i < nreq; i++) {
+        req = ncchkp->getlist.reqs + reqids[i];
+        nreqs[req->varid]++;
+        nums[req->varid] += req->nreq;
+    }
+
+    /* Allocate a skip list of reqids for each variable
+     * At the same time, we find out the number of starts and counts we need to allocate
+     */
+    vreqids    = (int **)NCI_Malloc (sizeof (int *) * ncchkp->vars.cnt);
+    vreqids[0] = (int *)NCI_Malloc (sizeof (int) * nreq);
+    maxnum = 0;
+    i      = 0;
+    nvar   = 0;
+    for (vid = 0; vid < ncchkp->vars.cnt; vid++) {
+        if (nreqs[vid] > 0) {
+            // Assign buffer to reqid skip list
+            vreqids[vid] = vreqids[0] + i;
+            i += nreqs[vid];
+
+            // maximum number of starts and counts we need across all variables
+            if (maxnum < nums[vid]) { maxnum = nums[vid]; }
+
+            // Number of variables that have pending read requests
+            nvar++;
+        }
+    }
+
+    varids = (int *)NCI_Malloc (sizeof (int) * nvar);
+
+    // Fill up the skip list
+    memset (nreqs, 0, sizeof (int) * ncchkp->vars.cnt);
+    for (i = 0; i < nreq; i++) {
+        req = ncchkp->getlist.reqs + reqids[i];
+        vreqids[req->varid][nreqs[req->varid]++] = reqids[i];
+    }
+
+    // Allocate parameters
+    starts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * maxnum * 2);
+    counts = starts + maxnum;
+    bufs   = (char **)NCI_Malloc (sizeof (char *) * maxnum);
+
+    /* Pack requests variable by variable
+     */
+    nvar = 0;
+    for (vid = 0; vid < ncchkp->vars.cnt; vid++) {
+        if (nreqs[vid] > 0) {
+            // Fill varid in the skip list
+            varids[nvar++] = vid;
+
+            // Collect parameters
+            num = 0;
+            for (j = 0; j < nreqs[vid]; j++) {
+                req = ncchkp->getlist.reqs + vreqids[vid][j];
+
+                if (req->nreq > 1) {
+                    for (i = 0; i < req->nreq; i++) {
+                        starts[num] = req->starts[i];
+                        counts[num] = req->counts[i];
+                        bufs[num++] = req->xbufs[i];
+                    }
+                } else {
+                    starts[num] = req->start;
+                    counts[num] = req->count;
+                    bufs[num++] = req->xbuf;
+                }
+            }
+
+            NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB)
+            NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB_INIT)
+
+            // Perform collective buffering
+            ncchkioi_get_varn_cb_chunk (ncchkp, ncchkp->vars.data + vid, num, starts, counts, NULL,
+                                        (void **)bufs);
+
+            NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB)
+            NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT)
+        }
+    }
+
+    // Free buffers
+    NCI_Free (nums);
+
+    NCI_Free (vreqids[0]);
+    NCI_Free (vreqids);
+
+    NCI_Free (varids);
+
+    NCI_Free (starts);
+    NCI_Free (bufs);
+
+    NC_CHK_TIMER_STOP
(NC_CHK_TIMER_GET_CB) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_INIT) + + return NC_NOERR; +} + +int ncchkioi_iget_cb_proc (NC_chk *ncchkp, int nreq, int *reqids, int *stats) { + int err=NC_NOERR; + int i, j, k; + int cid, cown; // Chunk iterator + int vid; + int r, **reqs; + + MPI_Offset *ostart, *osize; + int *tsize, *tssize, *tstart, *tssizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; // Bounding box for chunks overlapping my own write region + + int *rcnt_local, *rcnt_all; // Number of processes that writes to each proc + + int nread; + int *rlo_local, *rhi_local; + int *rlo_all, *rhi_all; + int *rids; + + int overlapsize; // Size of overlaping region of request and chunk + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + MPI_Offset poff; // Offset of buffer to pack to/ from + int plen; + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreq, *rreq, *sreq_re, *rreq_re; // Send and recv req + MPI_Status *sstat, rstat, *sstat_re; // Send and recv status + char **sbuf, **sbufp, **rbuf, **rbufp, **sbuf_re, **rbuf_re; // Send and recv buffer + int *rsize, *ssize, *rsize_re, *ssize_re; // recv size of each message + int *sdst; // recv size of each message + int *smap; + MPI_Message rmsg; // Receive message + NC_chk_var *varp; + NC_chk_req *req; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT) + + // Allocate buffering for write count + rcnt_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->np * 3); + rcnt_all = rcnt_local + ncchkp->np; + smap = rcnt_all + ncchkp->np; + + // Intermediate buffer for our own data + tbuf = (char *)NCI_Malloc (ncchkp->max_chunk_size); + + // Allocate buffering for overlaping index + tsize = (int *)NCI_Malloc (sizeof (int) * ncchkp->max_ndim * 3); + tssize = tsize + ncchkp->max_ndim; + tstart = tssize + ncchkp->max_ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ncchkp->max_ndim * 3); + osize = ostart + ncchkp->max_ndim; + + // Chunk iterator + citr = osize + ncchkp->max_ndim; + + // Access range + rlo_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt * 5); + rhi_local = rlo_local + ncchkp->vars.cnt; + rlo_all = rhi_local + ncchkp->vars.cnt; + rhi_all = rlo_all + ncchkp->vars.cnt; + rids = rhi_all + ncchkp->vars.cnt; + + for (i = 0; i < ncchkp->vars.cnt; i++) { + rlo_local[i] = 2147483647; + rhi_local[i] = -1; + } + + // We need to calculate the size of message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (rcnt_local, 0, sizeof (int) * ncchkp->np); + nsend = 0; + + // count total number of messages and build a map of accessed chunk to list of comm + // datastructure + for (i = 0; i < nreq; i++) { + req = ncchkp->getlist.reqs + reqids[i]; + varp = ncchkp->vars.data + req->varid; + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init (varp, req->starts[r], req->counts[r], citr, + &cid); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + + // Mapping to skip list of send requests + if (rcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; } + rcnt_local[cown] = 1; // Need to send message if not owner + + if (rlo_local[req->varid] > cid) { rlo_local[req->varid] = cid; } + if (rhi_local[req->varid] < cid) { rhi_local[req->varid] = cid; } + } while (ncchkioi_chunk_itr_next (varp, 
req->starts[r], req->counts[r], citr, &cid)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (rcnt_local, rcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm); + nrecv = rcnt_all[ncchkp->rank] - + rcnt_local[ncchkp->rank]; // We don't need to receive request form self + +#ifdef PNETCDF_PROFILING + ncchkp->nsend += nrecv + nsend; + ncchkp->nrecv += nrecv + nsend; +#endif + + for (i = 0; i < ncchkp->vars.cnt; i++) { rhi_local[i] *= -1; } + CHK_ERR_ALLREDUCE (rlo_local, rlo_all, ncchkp->vars.cnt * 2, MPI_INT, MPI_MIN, ncchkp->comm); + for (i = 0; i < ncchkp->vars.cnt; i++) { rhi_all[i] *= -1; } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REQ) + + // Allocate data structure for messaging + sbuf = (char **)NCI_Malloc (sizeof (char *) * (nsend * 2 + nrecv)); + sbufp = sbuf + (nsend + nrecv); + ssize = (int *)NCI_Malloc (sizeof (int) * (nsend * 2 + nrecv * 1)); + sdst = ssize + (nsend + nrecv); + sreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv)); + sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nsend + nrecv)); + reqs = (int **)NCI_Malloc (sizeof (int *) * nsend); + + rbuf = (char **)NCI_Malloc (sizeof (char *) * (nsend + nrecv * 2)); + rbufp = rbuf + (nsend + nrecv); + rsize = (int *)NCI_Malloc (sizeof (int) * (nsend + nrecv)); + rreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv)); + + sbuf_re = sbuf + nsend; + ssize_re = ssize + nsend; + sreq_re = sreq + nsend; + sstat_re = sstat + nsend; + + rbuf_re = rbuf + nrecv; + rsize_re = rsize + nrecv; + rreq_re = rreq + nrecv; + + // req->counts[r] size of each request + memset (ssize, 0, sizeof (int) * nsend); + memset (rsize_re, 0, sizeof (int) * nsend); + memset (rcnt_local, 0, sizeof (int) * nsend); + for (i = 0; i < nreq; i++) { + req = ncchkp->getlist.reqs + reqids[i]; + varp = ncchkp->vars.data + req->varid; + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + sdst[j] = cown; // Record a reverse map by the way + + // Count overlap + overlapsize = varp->esize; + for (k = 0; k < varp->ndim; k++) { overlapsize *= osize[k]; } + ssize[j] += sizeof (int) * (varp->ndim * 2 + 2); + rsize_re[j] += overlapsize; + rcnt_local[j]++; + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + // Allocate buffer for send + for (i = 0; i < nsend; i++) { + ssize[i] += sizeof (int); +#ifdef PNETCDF_DEBUG + assert (ssize[i] >= 0); +#endif + sbuf[i] = sbufp[i] = (char *)NCI_Malloc (ssize[i]); + *((int *)sbufp[i]) = rsize_re[i]; + sbufp[i] += sizeof (int); + rbuf_re[i] = (char *)NCI_Malloc (rsize_re[i]); + reqs[i] = (int *)NCI_Malloc (sizeof (int) * rcnt_local[i] * 2); +#ifdef PNETCDF_PROFILING + ncchkp->sendsize += ssize[i]; + ncchkp->recvsize += rsize_re[i]; +#endif + } + + // Pack requests + memset (rcnt_local, 0, sizeof (int) * nsend); + for (i = 0; i < nreq; i++) { + req = ncchkp->getlist.reqs + reqids[i]; + varp = ncchkp->vars.data + req->varid; + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = 
varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + + // Pack metadata + *((int *)sbufp[j]) = varp->varid; + sbufp[j] += sizeof (int); + *((int *)sbufp[j]) = cid; + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + for (k = 0; k < varp->ndim; k++) { + tstartp[k] = (int)(ostart[k] - citr[k]); + tssizep[k] = (int)osize[k]; + } + + // Record source of the request + reqs[j][rcnt_local[j]++] = i; + reqs[j][rcnt_local[j]++] = r; + +#ifdef PNETCDF_PROFILING + ncchkp->nremote++; +#endif + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Post send + for (i = 0; i < nsend; i++) { + CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Post receive + for (i = 0; i < nsend; i++) { + CHK_ERR_IRECV (rbuf_re[i], rsize_re[i], MPI_BYTE, sdst[i], 1, ncchkp->comm, rreq_re + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Post recv + for (i = 0; i < nrecv; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat); + CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i); + + // Allocate buffer + rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]); +#ifdef PNETCDF_PROFILING + ncchkp->recvsize += rsize[i]; +#endif + // Post irecv + CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + nread = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + if (rhi_all[i] >= rlo_all[i]) { + varp = ncchkp->vars.data + i; + rids[nread] = i; + for (j = 0; j < varp->nmychunk && varp->mychunks[j] < rlo_all[i]; j++) + ; + for (k = j; k < varp->nmychunk && varp->mychunks[k] <= rhi_all[i]; k++) + ; + rlo_all[nread] = j; + rhi_all[nread++] = k; + } + } + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB) +#ifdef PNETCDF_PROFILING + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_BARR) + MPI_Barrier (ncchkp->comm); + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_BARR) +#endif + err = ncchkioi_load_nvar (ncchkp, nread, rids, rlo_all, rhi_all); + CHK_ERR + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SELF) + + // Handle our own data + for (i = 0; i < nreq; i++) { + req = ncchkp->getlist.reqs + reqids[i]; + varp = ncchkp->vars.data + req->varid; + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from chunk cache to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + tssize[j] = (int)osize[j]; + } + err = + ncchkioi_subarray_off_len (varp->ndim, tsize, tssize, tstart, &poff, &plen); + if (err == 0) { + plen *= varp->esize; + poff *= varp->esize; + memcpy (tbuf, varp->chunk_cache[cid]->buf + poff, plen); + overlapsize = plen; + } else { + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, 
tssize, tstart, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, tbuf, varp->chunksize, + &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + overlapsize = packoff; + } + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - req->starts[r][j]); + tsize[j] = (int)req->counts[r][j]; + } + err = + ncchkioi_subarray_off_len (varp->ndim, tsize, tssize, tstart, &poff, &plen); + if (err == 0) { + plen *= varp->esize; + poff *= varp->esize; + memcpy (req->xbufs[r] + poff, tbuf, plen); + } else { + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, req->xbufs[r], 1, ptype, + ncchkp->comm); + MPI_Type_free (&ptype); + } +#ifdef PNETCDF_PROFILING + ncchkp->nlocal++; +#endif + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SELF) + + // Handle incoming requests + for (i = 0; i < nrecv; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Will wait any provide any benefit? + MPI_Waitany (nrecv, rreq, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + packoff = 0; + ssize_re[j] = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); +#ifdef PNETCDF_DEBUG + assert (ssize_re[j] >= 0); +#endif + sbuf_re[j] = (char *)NCI_Malloc (ssize_re[j]); + while (rbufp[j] < rbuf[j] + rsize[j]) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Retrieve metadata + vid = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); + cid = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); + varp = ncchkp->vars.data + vid; + tstartp = (int *)rbufp[j]; + rbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)rbufp[j]; + rbufp[j] += sizeof (int) * varp->ndim; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REP) + + err = ncchkioi_subarray_off_len (varp->ndim, varp->chunkdim, tssizep, tstartp, &poff, + &plen); + if (err == 0) { + plen *= varp->esize; + poff *= varp->esize; + memcpy (sbuf_re[j] + packoff, varp->chunk_cache[cid]->buf + poff, plen); + packoff += plen; + } else { + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, varp->chunkdim, tssizep, tstartp, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, sbuf_re[j], ssize_re[j], + &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REP) +#ifdef PNETCDF_PROFILING + ncchkp->nreq++; +#endif + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Send Response + CHK_ERR_ISEND (sbuf_re[j], packoff, MPI_BYTE, rstat.MPI_SOURCE, 1, ncchkp->comm, + sreq_re + j); +#ifdef PNETCDF_PROFILING + ncchkp->sendsize += packoff; +#endif + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB_SEND_REQ) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Wait for all request + CHK_ERR_WAITALL (nsend, sreq, sstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Handle reply + memset (rcnt_local, 0, sizeof (int) * nsend); + for (i = 0; i < nsend; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Will wait any provide any benefit? 
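+        /* Replies arrive in arbitrary order, so MPI_Waitany services whichever
+         * one completes first. A reply carries only packed data; the matching
+         * metadata is re-read from our own send buffer (sbuf[j]), and reqs[j]
+         * maps each record back to the originating request so the data lands
+         * in the right spot of the user buffer. */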
+ MPI_Waitany (nsend, rreq_re, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + sbufp[j] = sbuf[j] + sizeof (int); // Skip reply size + packoff = 0; + while (packoff < rsize_re[j]) { + // Retrieve metadata from the request we sent + vid = *((int *)sbufp[j]); + sbufp[j] += sizeof (int); + cid = *((int *)sbufp[j]); + sbufp[j] += sizeof (int); + varp = ncchkp->vars.data + vid; + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + + k = reqs[j][rcnt_local[j]++]; + r = reqs[j][rcnt_local[j]++]; + req = ncchkp->getlist.reqs + reqids[k]; + get_chunk_itr (varp, cid, citr); + for (k = 0; k < varp->ndim; k++) { + tstartp[k] += (int)(citr[k] - req->starts[r][k]); + tsize[k] = req->counts[r][k]; + } + + err = ncchkioi_subarray_off_len (varp->ndim, tsize, tssizep, tstartp, &poff, &plen); + if (err == 0) { + plen *= varp->esize; + poff *= varp->esize; + memcpy (req->xbufs[r] + poff, rbuf_re[j] + packoff, plen); + packoff += plen; + } else { + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_UNPACK (rbuf_re[j], rsize_re[j], &packoff, req->xbufs[r], 1, ptype, + ncchkp->comm); + MPI_Type_free (&ptype); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REP) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Wait for all Response + CHK_ERR_WAITALL (nrecv, sreq_re, sstat_re); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Free buffers + NCI_Free (rcnt_local); + + NCI_Free (tsize); + + NCI_Free (ostart); + + NCI_Free (tbuf); + + NCI_Free (sreq); + NCI_Free (sstat); + NCI_Free (ssize); + for (i = 0; i < nsend; i++) { NCI_Free (reqs[i]); } + for (i = 0; i < nsend + nrecv; i++) { + NCI_Free (sbuf[i]); + NCI_Free (rbuf[i]); + } + NCI_Free (sbuf); + NCI_Free (reqs); + + NCI_Free (rreq); + NCI_Free (rbuf); + NCI_Free (rsize); + + NCI_Free (rlo_local); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB) + +err_out:; + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_iput.c b/src/drivers/ncchunkio/ncchkioi_iput.c new file mode 100644 index 000000000..98e97633b --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_iput.c @@ -0,0 +1,176 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +static inline int ncchkioi_init_put_req (NC_chk *ncchkp, + NC_chk_req *req, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const void *xbuf, + const void *buf) { + int err=NC_NOERR; + NC_chk_var *varp = ncchkp->vars.data + varid; + + // Zero out the request + memset (req, 0, sizeof (NC_chk_req)); + + // Record request + req->starts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *)); + req->start = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim); + req->starts[0] = req->start; + memcpy (req->start, start, sizeof (MPI_Offset) * varp->ndim); + req->counts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *)); + req->count = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim); + req->counts[0] = req->count; + memcpy (req->count, count, sizeof (MPI_Offset) * varp->ndim); + if (stride != NULL) { + req->stride = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim); + memcpy (req->stride, stride, sizeof (MPI_Offset) * varp->ndim); + } + + req->varid = varid; + req->buf = (void *)buf; + req->xbuf = (void *)xbuf; + req->xbufs = (char **)NCI_Malloc (sizeof (char *)); + req->xbufs[0] = req->xbuf; + req->nreq = 1; + + return err; +} + +int ncchkioi_iput_var (NC_chk *ncchkp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const void *xbuf, + const void *buf, + int *reqid) { + int err=NC_NOERR; + int req_id; + NC_chk_req req; + + err = ncchkioi_init_put_req (ncchkp, &req, varid, start, count, stride, xbuf, buf); + + // Add to req list + ncchkioi_req_list_add (&(ncchkp->putlist), &req_id); + ncchkp->putlist.reqs[req_id] = req; + + if (reqid != NULL) { *reqid = req_id * 2 + 1; } + + return err; +} + +static inline int ncchkioi_init_put_varn_req (NC_chk *ncchkp, + NC_chk_req *req, + int varid, + int nreq, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *xbuf, + const void *buf) { + int err=NC_NOERR; + int i, j; + MPI_Offset rsize, boff; + NC_chk_var *varp = ncchkp->vars.data + varid; + + // Zero out the request + memset (req, 0, sizeof (NC_chk_req)); + + // Record request + req->starts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * nreq); + CHK_PTR (req->starts) + req->start = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * nreq); + CHK_PTR (req->start) + for (i = 0; i < nreq; i++) { + req->starts[i] = req->start + i * varp->ndim; + memcpy (req->starts[i], starts[i], sizeof (MPI_Offset) * varp->ndim); + } + req->counts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * nreq); + CHK_PTR (req->counts) + req->count = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * nreq); + CHK_PTR (req->count) + for (i = 0; i < nreq; i++) { + req->counts[i] = req->count + i * varp->ndim; + memcpy (req->counts[i], counts[i], sizeof (MPI_Offset) * varp->ndim); + } + + // Calculate buffer for each individual request + req->xbufs = (char **)NCI_Malloc (sizeof (char *) * nreq); + CHK_PTR (req->xbufs) + boff = 0; + for (i = 0; i < nreq; i++) { + req->xbufs[i] = (((char *)xbuf) + boff); + + // Advance pointer by size of the request + rsize = varp->esize; + for (j = 0; j < varp->ndim; j++) { rsize *= 
counts[i][j]; } + boff += rsize; + } + + req->varid = varid; + req->buf = (void *)buf; + req->xbuf = (void *)xbuf; + req->nreq = nreq; + +err_out:; + return err; +} + +int ncchkioi_iput_varn (NC_chk *ncchkp, + int varid, + int nreq, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *xbuf, + const void *buf, + int *reqid) { + int err=NC_NOERR; + int req_id; + NC_chk_req req; + + if (nreq > 1) { + err = ncchkioi_init_put_varn_req (ncchkp, &req, varid, nreq, starts, counts, xbuf, buf); + } else { + err = ncchkioi_init_put_req (ncchkp, &req, varid, starts[0], counts[0], NULL, xbuf, buf); + } + CHK_ERR + + // Add to req list + err = ncchkioi_req_list_add (&(ncchkp->putlist), &req_id); + CHK_ERR + ncchkp->putlist.reqs[req_id] = req; + + if (reqid != NULL) { *reqid = req_id * 2 + 1; } + +err_out:; + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_iput_cb.c b/src/drivers/ncchunkio/ncchkioi_iput_cb.c new file mode 100644 index 000000000..1abf48eb2 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_iput_cb.c @@ -0,0 +1,620 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +/* Out drive currently can handle only one variable at a time + * We pack all request as a large varn request + */ +int ncchkioi_iput_cb_chunk (NC_chk *ncchkp, int nreq, int *reqids, int *stats) { + int i, j; + int vid; // Iterators for variable id + int *nreqs; // Number of reqids in each variable + int *nums; // Number of reqs in each varn + int **vreqids; + int num, maxnum = 0; + MPI_Offset **starts, **counts; + char **bufs; + NC_chk_req *req; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Count total number of request in per variable for packed varn request + nums = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt * 2); + nreqs = nums + ncchkp->vars.cnt; + memset (nums, 0, sizeof (int) * ncchkp->vars.cnt); + memset (nreqs, 0, sizeof (int) * ncchkp->vars.cnt); + for (i = 0; i < nreq; i++) { + req = ncchkp->putlist.reqs + reqids[i]; + nreqs[req->varid]++; + nums[req->varid] += req->nreq; + } + + /* Allocate a skip list of reqids for each vriable + * At the same time, we find out the number of starts and counts we need to allocate + */ + vreqids = (int **)NCI_Malloc (sizeof (int *) * ncchkp->vars.cnt); + vreqids[0] = (int *)NCI_Malloc (sizeof (int) * nreq); + maxnum = 0; + i = 0; + for (vid = 0; vid < ncchkp->vars.cnt; vid++) { + if (nreqs[vid] > 0) { + // Assign buffer to reqid skip list + vreqids[vid] = vreqids[0] + i; + i += nreqs[vid]; + + // maximum number of starts and counts we need across all variables + if (maxnum < nums[vid]) { maxnum = nums[vid]; } + } + } + + // Fill up the skip list + memset (nreqs, 0, sizeof (int) * ncchkp->vars.cnt); + for (i = 0; i < nreq; i++) { + req = ncchkp->putlist.reqs + reqids[i]; + vreqids[req->varid][nreqs[req->varid]++] = reqids[i]; + } + + // Allocate parameters + starts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * maxnum * 2); + counts = starts + maxnum; + bufs = (char 
**)NCI_Malloc (sizeof (char *) * maxnum); + + /* Pack requests variable by variable + */ + for (vid = 0; vid < ncchkp->vars.cnt; vid++) { + if (nreqs[vid] > 0) { + // Collect parameters + num = 0; + for (j = 0; j < nreqs[vid]; j++) { + req = ncchkp->putlist.reqs + vreqids[vid][j]; + + for (i = 0; i < req->nreq; i++) { + starts[num] = req->starts[i]; + counts[num] = req->counts[i]; + bufs[num++] = req->xbufs[i]; + } + } + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT_CB_INIT) + + // Perform collective buffering + ncchkioi_put_varn_cb_chunk (ncchkp, ncchkp->vars.data + vid, num, starts, counts, NULL, + (void **)bufs); + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + } + } + + // Free buffers + NCI_Free (nums); + + NCI_Free (vreqids[0]); + NCI_Free (vreqids); + + NCI_Free (starts); + NCI_Free (bufs); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + + return NC_NOERR; +} + +/* Out drive currently can handle only one variable at a time + * We pack all request as a large varn request + */ +int ncchkioi_iput_cb_proc (NC_chk *ncchkp, int nreq, int *reqids, int *stats) { + int err=NC_NOERR; + int i, j, k; + int cid, cown; // Chunk iterator and owner + int vid; + int r; + MPI_Offset *ostart = NULL, *osize; + int *tsize, *tssize, *tstart = NULL, *tssizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; // Bounding box for chunks overlapping my own write region + + int *wcnt_local = NULL, *wcnt_all; // Number of processes that writes to each chunk + + int nread; + int *rlo_local, *rhi_local; + int *rlo_all, *rhi_all; + int *rids; + + int overlapsize; // Size of overlaping region of request and chunk + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Offset pboff; // Offset of buffer to pack to/ from + MPI_Datatype ptype; // Pack datatype + int plen; + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreq = NULL, *rreq = NULL; // Send and recv req + MPI_Status *sstat = NULL, rstat; // Send and recv status + char **sbuf = NULL, **rbuf = NULL; // Send and recv buffer + char **sbufp, **rbufp; // Send and recv buffer pointer + int *rsize = NULL, *ssize = NULL; // Send and recv size of each message + MPI_Offset totalsize; + int *sdst; // recv size of each message + int *smap; + MPI_Message rmsg; // Receive message + NC_chk_var *varp; + NC_chk_req *req; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Allocate buffering for write count + wcnt_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->np * 3); + CHK_PTR (wcnt_local) + wcnt_all = wcnt_local + ncchkp->np; + smap = wcnt_all + ncchkp->np; + + // Intermediate buffer for our own data + tbuf = (char *)NCI_Malloc (ncchkp->max_chunk_size); + CHK_PTR (tbuf) + + // Allocate buffering for overlaping index + tstart = (int *)NCI_Malloc (sizeof (int) * ncchkp->max_ndim * 3); + CHK_PTR (tstart) + tssize = tstart + ncchkp->max_ndim; + tsize = tssize + ncchkp->max_ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ncchkp->max_ndim * 3); + CHK_PTR (ostart) + osize = ostart + ncchkp->max_ndim; + + // Chunk iterator + citr = osize + ncchkp->max_ndim; + + // Access range + rlo_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt * 5); + CHK_PTR (rlo_local) + rhi_local = rlo_local + ncchkp->vars.cnt; + rlo_all = rhi_local + ncchkp->vars.cnt; + rhi_all = rlo_all + ncchkp->vars.cnt; + rids = rhi_all + ncchkp->vars.cnt; + + for 
(i = 0; i < ncchkp->vars.cnt; i++) { + rlo_local[i] = 2147483647; + rhi_local[i] = -1; + } + + // We need to calculate the size of message of each processes + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (wcnt_local, 0, sizeof (int) * ncchkp->np); + nsend = 0; + + // Count total number of messages and build a map of accessed chunk to list of comm + // datastructure + for (i = 0; i < nreq; i++) { + req = ncchkp->putlist.reqs + reqids[i]; + varp = ncchkp->vars.data + req->varid; + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init (varp, req->starts[r], req->counts[r], citr, + &cid); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + + // Mapping to skip list of send requests + if (wcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; } + wcnt_local[cown] = 1; // Need to send message if not owner + + if (rlo_local[req->varid] > cid) { rlo_local[req->varid] = cid; } + if (rhi_local[req->varid] < cid) { rhi_local[req->varid] = cid; } + } while (ncchkioi_chunk_itr_next (varp, req->starts[r], req->counts[r], citr, &cid)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (wcnt_local, wcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm); + nrecv = wcnt_all[ncchkp->rank] - + wcnt_local[ncchkp->rank]; // We don't need to receive request form self + + for (i = 0; i < ncchkp->vars.cnt; i++) { rhi_local[i] *= -1; } + CHK_ERR_ALLREDUCE (rlo_local, rlo_all, ncchkp->vars.cnt * 2, MPI_INT, MPI_MIN, ncchkp->comm); + for (i = 0; i < ncchkp->vars.cnt; i++) { rhi_all[i] *= -1; } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_PACK_REQ) + + // Allocate data structure for messaging + sbuf = (char **)NCI_Malloc (sizeof (char *) * nsend * 2); + CHK_PTR (sbuf) + sbufp = sbuf + nsend; + ssize = (int *)NCI_Malloc (sizeof (int) * nsend * 2); + CHK_PTR (ssize) + sdst = ssize + nsend; + sreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nsend); + CHK_PTR (sreq) + sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nsend); + CHK_PTR (sstat) + + rbuf = (char **)NCI_Malloc (sizeof (char *) * nrecv * 2); + CHK_PTR (rbuf) + rbufp = rbuf + nrecv; + rsize = (int *)NCI_Malloc (sizeof (int) * nrecv); + CHK_PTR (rsize) + rreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nrecv); + CHK_PTR (rreq) + + // Count size of each request + memset (ssize, 0, sizeof (int) * nsend); + for (k = 0; k < nreq; k++) { + req = ncchkp->putlist.reqs + reqids[k]; + varp = ncchkp->vars.data + req->varid; + + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk index and owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + sdst[j] = cown; // Record a reverse map by the way + + // Count overlap + overlapsize = varp->esize; + for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ssize[j] += overlapsize + sizeof (int) * (varp->ndim * 2 + 2); +#ifdef PNETCDF_DEBUG + if (ssize[j] < 0) { RET_ERR (NC_EAINT_TOO_SMALL) } +#endif + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + // Allocate buffer for send + totalsize = 0; + for (i = 0; i 
< nsend; i++) {
+#ifdef PNETCDF_DEBUG
+        assert (ssize[i] >= 0);
+#endif
+        totalsize += ssize[i];
+    }
+    if (nsend > 0) {
+        sbuf[0] = sbufp[0] = (char *)NCI_Malloc (totalsize);
+        CHK_PTR (sbuf[0])
+        for (i = 1; i < nsend; i++) { sbuf[i] = sbufp[i] = sbuf[i - 1] + ssize[i - 1]; }
+    }
+#ifdef PNETCDF_PROFILING
+    ncchkp->nsend += nsend;
+    ncchkp->sendsize += totalsize;
+#endif
+
+    // Pack requests
+    for (k = 0; k < nreq; k++) {
+        req  = ncchkp->putlist.reqs + reqids[k];
+        varp = ncchkp->vars.data + req->varid;
+
+        for (r = 0; r < req->nreq; r++) {
+            ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart,
+                                        osize);  // Initialize chunk iterator
+            do {
+                // Chunk index and owner
+                cown = varp->chunk_owner[cid];
+
+                if (cown != ncchkp->rank) {
+                    j = smap[cown];
+
+                    // Pack metadata
+                    *((int *)(sbufp[j])) = req->varid;
+                    sbufp[j] += sizeof (int);
+                    *((int *)(sbufp[j])) = cid;
+                    sbufp[j] += sizeof (int);
+                    tstartp = (int *)sbufp[j];
+                    sbufp[j] += varp->ndim * sizeof (int);
+                    tssizep = (int *)sbufp[j];
+                    sbufp[j] += varp->ndim * sizeof (int);
+
+                    for (i = 0; i < varp->ndim; i++) {
+                        tstartp[i] = (int)(ostart[i] - citr[i]);
+                        tssizep[i] = (int)osize[i];
+                    }
+
+                    // Pack type from user buffer to send buffer
+                    for (i = 0; i < varp->ndim; i++) {
+                        tsize[i]  = (int)req->counts[r][i];
+                        tstart[i] = (int)(ostart[i] - req->starts[r][i]);
+                    }
+                    err = ncchkioi_subarray_off_len (varp->ndim, tsize, tssizep, tstart, &pboff,
+                                                     &plen);
+                    if (err == 0) {
+                        plen *= varp->esize;
+                        pboff *= varp->esize;
+#ifdef PNETCDF_DEBUG
+                        if (sbufp[j] - sbuf[j] + plen > ssize[j]) { RET_ERR (NC_EINTERNAL) }
+#endif
+                        memcpy (sbufp[j], req->xbufs[r] + pboff, plen);
+                        sbufp[j] += plen;
+                    } else {
+                        CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstart,
+                                                      MPI_ORDER_C, varp->etype, &ptype);
+                        CHK_ERR_TYPE_COMMIT (&ptype);
+
+                        // Pack data
+                        packoff = 0;
+                        CHK_ERR_PACK (req->xbufs[r], 1, ptype, sbufp[j], ssize[j], &packoff,
+                                      ncchkp->comm);
+                        sbufp[j] += packoff;
+                        MPI_Type_free (&ptype);
+                    }
+
+#ifdef PNETCDF_PROFILING
+                    ncchkp->nremote++;
+#endif
+                }
+            } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid,
+                                                 ostart, osize));
+        }
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_PACK_REQ)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ)
+
+    // Post send
+    for (i = 0; i < nsend; i++) {
+        CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i);
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ)
+
+    // Post recv
+    for (i = 0; i < nrecv; i++) {
+        // Get message size, including metadata
+        CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat);
+        CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i);
+
+        // Allocate buffer
+        rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]);
+        CHK_PTR (rbuf[i])
+
+#ifdef PNETCDF_PROFILING
+        ncchkp->recvsize += rsize[i];
+#endif
+
+        // Post irecv
+        CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i);
+    }
+#ifdef PNETCDF_PROFILING
+    ncchkp->nrecv += nrecv;
+#endif
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ)
+
+    nread = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        if (rhi_all[i] >= rlo_all[i]) {
+            varp = ncchkp->vars.data + i;
+            rids[nread] = i;
+            for (j = 0; j < varp->nmychunk && varp->mychunks[j] < rlo_all[i]; j++)
+                ;
+            for (k = j; k < varp->nmychunk && varp->mychunks[k] <= rhi_all[i]; k++)
+                ;
+            rlo_all[nread] = j;
rhi_all[nread++] = k; + } + } + +#ifdef PNETCDF_PROFILING + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_BARR) + MPI_Barrier (ncchkp->comm); + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_BARR) +#endif + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT_CB) + err = ncchkioi_load_nvar_bg (ncchkp, nread, rids, rlo_all, rhi_all); + CHK_ERR + // Increase batch number to indicate allocated chunk buffer can be freed for future + // allocation + (ncchkp->cache_serial) + ++; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SELF) + + // Handle our own data + for (k = 0; k < nreq; k++) { + req = ncchkp->putlist.reqs + reqids[k]; + varp = ncchkp->vars.data + req->varid; + + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk index and owner + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from user buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - req->starts[r][j]); + tsize[j] = (int)req->counts[r][j]; + tssize[j] = (int)osize[j]; + } + err = ncchkioi_subarray_off_len (varp->ndim, tsize, tssize, tstart, &pboff, + &plen); + if (err == 0) { + plen *= varp->esize; + pboff *= varp->esize; + memcpy (tbuf, req->xbufs[r] + pboff, plen); + overlapsize = plen; + } else { + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (req->xbufs[r], 1, ptype, tbuf, varp->chunksize, &packoff, + ncchkp->comm); + MPI_Type_free (&ptype); + overlapsize = packoff; + } + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + } + err = ncchkioi_subarray_off_len (varp->ndim, tsize, tssize, tstart, &pboff, + &plen); + if (err == 0) { + plen *= varp->esize; + pboff *= varp->esize; + memcpy (varp->chunk_cache[cid]->buf + pboff, tbuf, plen); + } else { + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, varp->chunk_cache[cid]->buf, 1, + ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + + // Mark chunk as dirty + varp->dirty[cid] = 1; +#ifdef PNETCDF_PROFILING + ncchkp->nlocal++; +#endif + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SELF) + + // Handle incoming requests + for (i = 0; i < nrecv; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Will wait any provide any benefit? 
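+        /* Service whichever incoming write request completes first: unpack
+         * each (vid, cid, start, count) record into the owner's chunk cache,
+         * then mark the chunk dirty so it is recompressed and flushed later. */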
+ MPI_Waitany (nrecv, rreq, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + + packoff = 0; + while (rbufp[j] - rbuf[j] < rsize[j]) { + // Retrieve metadata + vid = *((int *)(rbufp[j])); + rbufp[j] += sizeof (int); + cid = *((int *)(rbufp[j])); + rbufp[j] += sizeof (int); + varp = ncchkp->vars.data + vid; + tstartp = (int *)rbufp[j]; + rbufp[j] += varp->ndim * sizeof (int); + tssizep = (int *)rbufp[j]; + rbufp[j] += varp->ndim * sizeof (int); + + err = ncchkioi_subarray_off_len (varp->ndim, varp->chunkdim, tssizep, tstartp, &pboff, + &plen); + if (err == 0) { + plen *= varp->esize; + pboff *= varp->esize; +#ifdef PNETCDF_DEBUG + if (rbufp[j] - rbuf[j] + plen > rsize[j]) { RET_ERR (NC_EINTERNAL) } +#endif + memcpy (varp->chunk_cache[cid]->buf + pboff, rbufp[j], plen); + rbufp[j] += plen; + } else { + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, varp->chunkdim, tssizep, tstartp, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + packoff = 0; + CHK_ERR_UNPACK (rbufp[j], rsize[j], &packoff, varp->chunk_cache[cid]->buf, 1, ptype, + ncchkp->comm); + rbufp[j] += packoff; + MPI_Type_free (&ptype); + } + + // Mark chunk as dirty + varp->dirty[cid] = 1; + +#ifdef PNETCDF_PROFILING + ncchkp->nreq++; +#endif + } + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + CHK_ERR_WAITALL (nsend, sreq, sstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + +err_out:; + // Free buffers + NCI_Free (wcnt_local); + + NCI_Free (tstart); + + NCI_Free (ostart); + + NCI_Free (sreq); + NCI_Free (sstat); + NCI_Free (ssize); + if (nsend > 0) { NCI_Free (sbuf[0]); } + NCI_Free (sbuf); + + NCI_Free (rreq); + for (i = 0; i < nrecv; i++) { NCI_Free (rbuf[i]); } + NCI_Free (rbuf); + NCI_Free (rsize); + + NCI_Free (tbuf); + + NCI_Free (rlo_local); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB) + + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_lagacy.c b/src/drivers/ncchunkio/ncchkioi_lagacy.c new file mode 100644 index 000000000..811a7e68c --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_lagacy.c @@ -0,0 +1,719 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include "ncchkio_internal.h" + +#define min(a,b) (((a)<(b))?(a):(b)) +#define max(a,b) (((a)>(b))?(a):(b)) + +int +ncchkioi_get_var_old(NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode) +{ + int i, j, err=NC_NOERR; + nc_type xtype; + int *cstart, *cend, *ccord; + int nb, bsize; + int datavarid; + int *bidx; + int *tsize, *tssize, *tstart; + int tpos; + MPI_Datatype subarytype; + char *rbuffer, *cbuffer; + MPI_Offset cbsize; + MPI_Offset **starts, **counts; + + // Boundary of chunks involved + cstart = (int*)NCI_Malloc(sizeof(int) * varp->ndim); + ccord = (int*)NCI_Malloc(sizeof(int) * varp->ndim); + cend = (int*)NCI_Malloc(sizeof(int) * varp->ndim); + for(i = 0; i < varp->ndim; i++){ + cstart[i] = start[i] / varp->chunkdim[i]; + if (stride == NULL){ + cend[i] = (start[i] + count[i] - 1) / varp->chunkdim[i]; + } + else{ + cend[i] = (start[i] + (count[i] - 1) * stride[i]) / varp->chunkdim[i] + 1; + } + } + + // Number of chunks involved + nb = 1; + for(i = 0; i < varp->ndim; i++){ + nb *= cend[i] - cstart[i]; + } + + /* Use a varn call to read all compressed chunk involved + * Generate one request for each chunk + */ + + bidx = (int*)NCI_Malloc(sizeof(int) * nb); + starts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nb); + counts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nb); + // Iterate through all chunks involved + i = 0; + cbsize = 0; + memcpy(ccord, cstart, sizeof(int) * varp->ndim); + for(i = 0; i < nb; i++){ + j = get_chunk_idx(varp, ccord); + bidx[i] = j; // chunk idx + cbsize += varp->data_lens[j]; // total buffer size of compressed data + starts[i] = varp->chunk_index + j; // start of the chunk + counts[i] = varp->data_lens + j; // count of the chunk + + // move on to next chunk + ccord[varp->ndim - 1]++; + for(j = varp->ndim - 1; j > 0; j--){ + if (ccord[j] >= cend[j]){ + ccord[j - 1]++; + ccord[j] = cstart[j]; + } + } + } + + // Allocate buffers + cbuffer = (char*)NCI_Malloc(cbsize); // Compressed data + + // Locate data var + err = ncchkp->driver->get_var(ncchkp->ncp, varp->varid, NULL, NULL, NULL, NULL, &datavarid, 1, MPI_INT, reqMode); + if (err != NC_NOERR) return err; + + // read compressed data + err = ncchkp->driver->get_varn(ncchkp->ncp, datavarid, nb, starts, counts, cbuffer, cbsize, MPI_BYTE, reqMode); + if (err != NC_NOERR) return err; + + // Decompression + + // Calculate chunk size + // Original datatype + err = ncchkp->driver->get_att(ncchkp->ncp, varp->varid, "_datatype", &xtype, MPI_INT); + if (err != NC_NOERR) return err; + + // Calculate chunk size + bsize = (int)NC_Type_size(xtype); + for(i = 0; i < varp->ndim; i++){ + bsize *= varp->chunkdim[i]; + } + + // Allocate buffers + rbuffer = NCI_Malloc(bsize * nb); // Decompressed data + + // Decompress chunks + cbsize = 0; + for(i = 0; i < nb; i++){ + j = bidx[i]; + if (varp->data_lens[j] > 0){ + varp->filter_driver->decompress(cbuffer + cbsize, varp->data_lens[j], rbuffer + bsize * i, NULL, varp->ndim, varp->dimsize, ncmpii_nc2mpitype(xtype)); + } + else{ + memset(rbuffer + bsize * i, 0, bsize); + } + cbsize += 
varp->data_lens[j]; // move to next chunk location + } + + // Copy data into user buffer + + // Create datatype of querying domain in the decompressed domain + tsize = NCI_Malloc(sizeof(int) * varp->ndim); + tssize = NCI_Malloc(sizeof(int) * varp->ndim); + tstart = NCI_Malloc(sizeof(int) * varp->ndim); + for(i = 0; i < varp->ndim; i++){ + tsize[i] = (cend[i] - cstart[i]) * varp->chunkdim[i]; + tssize[i] = (int)count[i]; + tstart[i] = start[i] % varp->chunkdim[i]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY(varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, ncmpii_nc2mpitype(xtype), &subarytype); + CHK_ERR_TYPE_COMMIT(&subarytype); + + // Pack data into user buffer + tpos = 0; + CHK_ERR_PACK(rbuffer, bsize * nb, subarytype, buf, bsize * nb, &tpos, ncchkp->comm); + + // Free datatype + MPI_Type_free(&subarytype); + + return NC_NOERR; +} + +int +ncchkioi_put_var_old(NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + void *buf) +{ + int i, j, k, err=NC_NOERR; + nc_type xtype; // Variable data type in NC + MPI_Datatype etype; // Variable element type in MPI + int esize; // Variable element size + int *cstart, *cend, *ccord; // Bounding box for chunks overlapping my own write region + int nb, bsize; //number of chunks this process write to and chunk size + int datavarid; // Id of data variable + int *tsize, *tssize, *tstart; // Size for sub-array type + int nmychunks, *mychunks; // chunk count and id this process handles + int *sendcounts, *sdispls; // Send count and displacements in buffer + int *recvcounts, *rdispls; // Receive count and displacement in buffer + int *packoff; // Offset in mpi packing + int *zipsize, *zdispls; // Compressed count and displacement of my chunks in buffer + int *zsize_local, *zsize_all; // Compressed size of all chunks at local and global (all processes) + int *zdispls_all; // Compressed displacement of all chunks (all processes) + int overlapsize; // Size of overlapping region between a chunk and write region + MPI_Datatype ptype; // Pack datatype + char *zbuf, *xbuf; // Compressed and uncompressed data buffer + char *sbuf, *rbuf; // Send and receive buffer + MPI_Offset **start_all, **count_all, **stride_all; // Start, count, stride of all processes + char name[128]; // Name of objects + int zdimid; // dimension id for compressed data variable + MPI_Offset **zstarts, **zcounts; // Starts and counts in the varn call for compressed data + + // Original datatype and size + err = ncchkp->driver->get_att(ncchkp->ncp, varp->varid, "_datatype", &xtype, MPI_INT); + if (err != NC_NOERR) return err; + esize = NC_Type_size(xtype); + etype = ncmpii_nc2mpitype(xtype); + + // Calculate chunk size + bsize = esize; + for(i = 0; i < varp->ndim; i++){ + bsize *= varp->chunkdim[i]; + } + + // Allocate buffering for overlaping index + tsize = (int*)NCI_Malloc(sizeof(int) * varp->ndim); + tssize = (int*)NCI_Malloc(sizeof(int) * varp->ndim); + tstart = (int*)NCI_Malloc(sizeof(int) * varp->ndim); + + /* + * Gather start, count, stride to all processes + */ + + // Allocate buffer + + start_all = NCI_Malloc(sizeof(MPI_Offset*) * ncchkp->np); + count_all = NCI_Malloc(sizeof(MPI_Offset*) * ncchkp->np); + stride_all = NCI_Malloc(sizeof(MPI_Offset*) * ncchkp->np); + + start_all[0] = NCI_Malloc(sizeof(MPI_Offset) * ncchkp->np * varp->ndim); + count_all[0] = NCI_Malloc(sizeof(MPI_Offset) * ncchkp->np * varp->ndim); + stride_all[0] = NCI_Malloc(sizeof(MPI_Offset) * ncchkp->np * varp->ndim); + + for(i = 1; i < ncchkp->np; i++){ + 
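+        /* start_all, count_all and stride_all are arrays of per-rank row
+         * pointers into one contiguous np x ndim block, so a single
+         * MPI_Allgather into start_all[0] fills every rank's row at once. */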
start_all[i] = start_all[0] + i * varp->ndim; + count_all[i] = count_all[0] + i * varp->ndim; + stride_all[i] = stride_all[0] + i * varp->ndim; + } + + // Call allgather + + err = MPI_Allgather(start, varp->ndim, MPI_LONG_LONG_INT, start_all[0], varp->ndim, MPI_LONG_LONG_INT, ncchkp->comm); + if (err != MPI_SUCCESS){ + err = ncmpii_error_mpi2nc(err, "MPI_Allgather"); + DEBUG_RETURN_ERROR(err); + } + + if (count != NULL){ + err = MPI_Allgather(count, varp->ndim, MPI_LONG_LONG_INT, count_all[0], varp->ndim, MPI_LONG_LONG_INT, ncchkp->comm); + if (err != MPI_SUCCESS){ + err = ncmpii_error_mpi2nc(err, "MPI_Allgather"); + DEBUG_RETURN_ERROR(err); + } + } + + if (stride != NULL){ + err = MPI_Allgather(stride, varp->ndim, MPI_LONG_LONG_INT, stride_all[0], varp->ndim, MPI_LONG_LONG_INT, ncchkp->comm); + if (err != MPI_SUCCESS){ + err = ncmpii_error_mpi2nc(err, "MPI_Allgather"); + DEBUG_RETURN_ERROR(err); + } + } + + /* + * Now, we need to send data to the chunk owner as well as receive data for our own chunk + */ + + // First, compute chunk boundary, find overlapping chunks + cstart = (int*)NCI_Malloc(sizeof(int) * varp->ndim); + ccord = (int*)NCI_Malloc(sizeof(int) * varp->ndim); + cend = (int*)NCI_Malloc(sizeof(int) * varp->ndim); + for(i = 0; i < varp->ndim; i++){ + cstart[i] = start[i] / varp->chunkdim[i]; + if (stride == NULL){ + cend[i] = (start[i] + count[i] - 1) / varp->chunkdim[i] + 1; + } + else{ + cend[i] = (start[i] + (count[i] - 1) * stride[i]) / varp->chunkdim[i] + 1; + } + } + + // Calculate the amount we need to send to other process + sendcounts = (int*)NCI_Malloc(sizeof(int) * ncchkp->np); + sdispls = (int*)NCI_Malloc(sizeof(int) * ncchkp->np); + packoff = (int*)NCI_Malloc(sizeof(int) * ncchkp->np); + memset(sendcounts, 0, sizeof(int) * ncchkp->np); + memset(packoff, 0, sizeof(int) * ncchkp->np); + + // Iterate through all chunks involved to count send size + i = 0; + overlapsize = 0; + memcpy(ccord, cstart, sizeof(int) * varp->ndim); + while(ccord[0] < cend[0]){ + j = varp->chunk_owner[get_chunk_idx(varp, ccord)]; + + // Overlapping size of this chunk + overlapsize = get_chunk_overlap(varp, ccord, start, count, stride, tstart, tssize); + sendcounts[j] += overlapsize; + + // move on to next chunk + ccord[varp->ndim - 1]++; + for(j = varp->ndim - 1; j > 0; j--){ + if (ccord[j] >= cend[j]){ + ccord[j - 1]++; + ccord[j] = cstart[j]; + } + } + } + + // Buffer displacement + sdispls[0] = 0; + for(i = 1; i < ncchkp->np; i++){ + sdispls[i] = sendcounts[i - 1] + sdispls[i - 1]; + } + + // Allocate send buffer + sbuf = (char*)NCI_Malloc(sdispls[ncchkp->np - 1] + sendcounts[ncchkp->np - 1]); + + // Pack data into send buffer + + // Iterate through all chunks involved again, this time actually pack the data + for(i = 0; i < varp->ndim; i++){ + tsize[i] = (int)count[i]; + } + i = 0; + overlapsize = 0; + memcpy(ccord, cstart, sizeof(int) * varp->ndim); + while(ccord[0] < cend[0]){ + j = varp->chunk_owner[get_chunk_idx(varp, ccord)]; + + // Overlapping region of this chunk + get_chunk_overlap(varp, ccord, start, count, stride, tstart, tssize); + for(k = 0; k < varp->ndim; k++){ + tstart[k] -= (int)start[k]; + } + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY(varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, ncmpii_nc2mpitype(xtype), &ptype); + CHK_ERR_TYPE_COMMIT(&ptype); + + // Pack data + CHK_ERR_PACK(buf, 1, ptype, sbuf + sdispls[j], sendcounts[j], packoff + j, ncchkp->comm); + + // Free datatype + MPI_Type_free(&ptype); + + // move on to next chunk + ccord[varp->ndim - 1]++; + for(j = 
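+        /* Odometer-style carry: the innermost coordinate was bumped above;
+         * any coordinate that runs past cend wraps back to cstart and
+         * carries into the next slower dimension. */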
varp->ndim - 1; j > 0; j--){ + if (ccord[j] >= cend[j]){ + ccord[j - 1]++; + ccord[j] = cstart[j]; + } + } + } + + /* + * Determine chunk ownership + * Find my chunks + */ + nmychunks = 0; + for(i = 0; i < varp->nchunk; i++){ + if (varp->chunk_owner[i] == ncchkp->rank){ + nmychunks++; + } + } + + // Gather chunk id this process handled to prevent a search in the future + mychunks = (int*)NCI_Malloc(sizeof(int) * nmychunks); + nmychunks = 0; + for(i = 0; i < varp->nchunk; i++){ + if (varp->chunk_owner[i] == ncchkp->rank){ + mychunks[nmychunks] = i; + nmychunks++; + } + } + + /* + * Compute size to receive + * We only need size here, packing will happen after receving + */ + + // Calculate the amount we need to receive from other process + recvcounts = (int*)NCI_Malloc(sizeof(int) * ncchkp->np); + rdispls = (int*)NCI_Malloc(sizeof(int) * ncchkp->np); + memset(recvcounts, 0, sizeof(int) * ncchkp->np); + memset(packoff, 0, sizeof(int) * ncchkp->np); + for(i = 0; i < varp->nchunk; i++){ + if (varp->chunk_owner[i] == ncchkp->rank){ + get_chunk_cord(varp, i, ccord); + + for(j = 0; j < ncchkp->np; j++){ + // Overlapping region of this chunk + get_chunk_overlap(varp, ccord, start_all[j], count_all[j], stride_all[j], tstart, tssize); + + overlapsize = esize; + for(k = 0; k < varp->ndim; k++){ + overlapsize *= tssize[k]; + } + recvcounts[j] += overlapsize; + } + } + } + + // Buffer displacement + rdispls[0] = 0; + for(i = 1; i < ncchkp->np; i++){ + rdispls[i] = recvcounts[i - 1] + rdispls[i - 1]; + } + + // Allocate receive buffer + rbuf = (char*)NCI_Malloc(rdispls[ncchkp->np - 1] + recvcounts[ncchkp->np - 1]); + + // Send the data to destination + MPI_Alltoallv(sbuf, sendcounts, sdispls, MPI_BYTE, rbuf, recvcounts, rdispls, MPI_BYTE, ncchkp->comm); + +/* +#ifdef PNETCDF_DEBUG + if (ncchkp->rank == 0){ + printf("Rank %d: sendcount = {", ncchkp->rank); + for(i = 0; i < ncchkp->np; i++){ + printf("%d, ", sendcounts[i]); + } + printf("}, sdispls = {"); + for(i = 0; i < ncchkp->np; i++){ + printf("%d, ", sdispls[i]); + } + printf("}, recvcounts = {"); + for(i = 0; i < ncchkp->np; i++){ + printf("%d, ", recvcounts[i]); + } + printf("}, rdispls = {"); + for(i = 0; i < ncchkp->np; i++){ + printf("%d, ", rdispls[i]); + } + printf("}, sbuf = {"); + for(i = 0; i < sdispls[ncchkp->np - 1] + sendcounts[ncchkp->np - 1]; i++){ + printf("%x ", sbuf[i]); + } + printf("}, rbuf = {"); + for(i = 0; i < rdispls[ncchkp->np - 1] + recvcounts[ncchkp->np - 1]; i++){ + printf("%x ", rbuf[i]); + } + printf("}\n"); + fflush(stdout); + } +#endif +*/ + + /* + * Next step is to pack data to chunk buffer + */ + + // Allocate buffer + xbuf = (char*)NCI_Malloc(nmychunks * bsize); + + // Main array is the whole chunk + for(i = 0; i < varp->ndim; i++){ + tsize[i] = varp->chunkdim[i]; + } + + // Pack data + memset(packoff, 0, sizeof(int) * ncchkp->np); + for(i = 0; i < nmychunks; i++){ + get_chunk_cord(varp, mychunks[i], ccord); + + for(j = 0; j < ncchkp->np; j++){ + // Overlapping region of this chunk + overlapsize = get_chunk_overlap(varp, ccord, start_all[j], count_all[j], stride_all[j], tstart, tssize); + + if (overlapsize > 0){ + // Overlap size + //overlapsize = esize; + //for(k = 0; k < varp->ndim; k++){ + // overlapsize *= tssize[k]; + //} + + // The chunk is the main array, overlapping region is the subarray + for(k = 0; k < varp->ndim; k++){ + tstart[k] -= ccord[k] * varp->chunkdim[k]; + } + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY(varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, ncmpii_nc2mpitype(xtype), &ptype); + 
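+                /* Here the chunk itself is the outer array: tsize is
+                 * chunkdim, tssize the overlap extent, and tstart the
+                 * overlap offset within the chunk.  As a sketch, with
+                 * chunkdim = {4, 4} and an overlap covering the chunk's
+                 * first two rows, tssize = {2, 4} and tstart = {0, 0}. */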
+                CHK_ERR_TYPE_COMMIT(&ptype);
+
+                // Unpack data into the chunk buffer
+                CHK_ERR_UNPACK(rbuf + rdispls[j], recvcounts[j], packoff + j, xbuf + bsize * i, 1, ptype, ncchkp->comm);
+
+                // Free datatype
+                MPI_Type_free(&ptype);
+            }
+        }
+    }
+
+/*
+#ifdef PNETCDF_DEBUG
+    if (ncchkp->rank == 0){
+        printf("Rank %d: xbuf = {", ncchkp->rank);
+        for(i = 0; i < nmychunks * bsize; i++){
+            printf("%x ", xbuf[i]);
+        }
+        printf("}\n");
+        fflush(stdout);
+    }
+#endif
+*/
+
+    /*
+     * The buffer is now filled with data coming from all processes, it's time to compress
+     */
+
+    // Compressed size and displacement
+    zipsize = (int*)NCI_Malloc(sizeof(int) * nmychunks);
+    zdispls = (int*)NCI_Malloc(sizeof(int) * (nmychunks + 1));
+    memset(zipsize, 0, sizeof(int) * nmychunks);
+    memset(zdispls, 0, sizeof(int) * (nmychunks + 1));
+
+    // Calculate compressed data size
+    for(i = 0; i < nmychunks; i++){
+        // Query the compressed size
+        // This is just an estimate from the filter
+        varp->filter_driver->compress(xbuf + bsize * i, bsize, NULL, zipsize + i, varp->ndim, varp->chunkdim, etype);
+    }
+
+    // Calculate displacements and the total size from the estimates
+    // zdispls[0] must remain 0; zdispls[nmychunks] holds the total
+    zdispls[0] = 0;
+    for(i = 0; i < nmychunks; i++){
+        zdispls[i + 1] = zdispls[i] + zipsize[i];
+    }
+
+    // Allocate buffer
+    zbuf = (char*)NCI_Malloc(zdispls[nmychunks]);
+
+    // Perform the real compression
+    for(i = 0; i < nmychunks; i++){
+        // Compress the data
+        // We get the real size here
+        varp->filter_driver->compress(xbuf + bsize * i, bsize, zbuf + zdispls[i], zipsize + i, varp->ndim, varp->chunkdim, etype);
+
+        // Update the offset with the real size
+        zdispls[i + 1] = zdispls[i] + zipsize[i];
+    }
+
+/*
+#ifdef PNETCDF_DEBUG
+    if (ncchkp->rank == 0){
+        printf("Rank %d: zipsize = {", ncchkp->rank);
+        for(i = 0; i < nmychunks; i++){
+            printf("%x ", zipsize[i]);
+        }
+        printf("}, zdispls = {");
+        for(i = 0; i < nmychunks; i++){
+            printf("%d, ", zdispls[i]);
+        }
+        printf("}, zbuf = {");
+        for(i = 0; i < zdispls[nmychunks - 1] + zipsize[nmychunks - 1]; i++){
+            printf("%x ", zbuf[i]);
+        }
+        printf("}\n");
+        fflush(stdout);
+    }
+#endif
+*/
+
+    /*
+     * Now it is time for a collective write
+     * We start by syncing the compressed chunk sizes across all processes
+     * Then, we can create a variable large enough to store the compressed data
+     * Finally, we do a collective write to store the data
+     */
+
+    // First, sync the compressed chunk sizes
+    // We use an all-reduce with MPI_MAX on all chunks; only the owner
+    // contributes a nonzero size for each chunk
+    // An alternative is to allgather and unpack the info
+
+    // Allocate buffer
+    zsize_local = (int*)NCI_Malloc(sizeof(int) * varp->nchunk);
+    zsize_all = (int*)NCI_Malloc(sizeof(int) * varp->nchunk);
+    zdispls_all = (int*)NCI_Malloc(sizeof(int) * varp->nchunk);
+    memset(zsize_local, 0, sizeof(int) * varp->nchunk);
+    memset(zsize_all, 0, sizeof(int) * varp->nchunk);
+    memset(zdispls_all, 0, sizeof(int) * varp->nchunk);
+
+    // Fill in the local sizes
+    for(i = 0; i < nmychunks; i++){
+        zsize_local[mychunks[i]] = zipsize[i];
+    }
+
+    // All reduce
+    CHK_ERR_ALLREDUCE(zsize_local, zsize_all, varp->nchunk, MPI_INT, MPI_MAX, ncchkp->comm);
+
+    // Calculate the displacement of each chunk in the data variable
+    zdispls_all[0] = 0;
+    for(i = 1; i < varp->nchunk; i++){
+        zdispls_all[i] = zsize_all[i - 1] + zdispls_all[i - 1];
+    }
+
+/*
+#ifdef PNETCDF_DEBUG
+    if (ncchkp->rank == 0){
+        printf("Rank %d: zsize_all = {", ncchkp->rank);
+        for(i = 0; i < varp->nchunk; i++){
+            printf("%x ", zsize_all[i]);
+        }
+        printf("}, zdispls_all = {");
+        for(i = 0; i < varp->nchunk; i++){
+            printf("%d, ", zdispls_all[i]);
+        }
+        printf("}, varid = { %d", varp->varid);
+        printf("}, datavarid = { %d", varp->datavarid);
+        printf("}\n");
+        fflush(stdout);
+    }
+#endif
+*/
+
+    // Enter redefine mode
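+    /* On-file layout: each chunked variable gets a companion 1D NC_BYTE
+     * variable "_compressed_data_<varid>" holding all compressed chunks
+     * back to back, plus a "_chunkoffset" attribute recording each chunk's
+     * starting offset, so a reader can seek straight to any chunk. */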
+    err = ncchkp->driver->redef(ncchkp->ncp);
+    if (err != NC_NOERR) return err;
+
+    // Define the dimension for the data variable
+    sprintf(name, "_compressed_data_dim_%d", varp->varid);
+    err = ncchkp->driver->def_dim(ncchkp->ncp, name, zdispls_all[varp->nchunk - 1] + zsize_all[varp->nchunk - 1], &zdimid);
+    if (err != NC_NOERR) return err;
+
+    // Define the data variable
+    sprintf(name, "_compressed_data_%d", varp->varid);
+    err = ncchkp->driver->def_var(ncchkp->ncp, name, NC_BYTE, 1, &zdimid, &(varp->datavarid));
+    if (err != NC_NOERR) return err;
+
+    // Record the offset of each chunk in the data variable
+    err = ncchkp->driver->put_att(ncchkp->ncp, varp->varid, "_chunkoffset", NC_INT, varp->nchunk, zdispls_all, MPI_INT);
+    if (err != NC_NOERR) return err;
+
+    // Switch to data mode
+    err = ncchkp->driver->enddef(ncchkp->ncp);
+    if (err != NC_NOERR) return err;
+
+    // Now, generate a varn call to write out the compressed data
+    zstarts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nmychunks);
+    zcounts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nmychunks);
+    zstarts[0] = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * nmychunks);
+    zcounts[0] = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * nmychunks);
+    for(i = 0; i < nmychunks; i++){
+        zstarts[i] = zstarts[0] + i;
+        zcounts[i] = zcounts[0] + i;
+        zstarts[i][0] = zdispls_all[mychunks[i]];
+        zcounts[i][0] = zsize_all[mychunks[i]];
+    }
+    err = ncchkp->driver->put_varn(ncchkp->ncp, varp->datavarid, nmychunks, zstarts, zcounts, zbuf, zdispls[nmychunks], MPI_UNSIGNED_CHAR, NC_REQ_WR | NC_REQ_BLK | NC_REQ_FLEX | NC_REQ_COLL);
+    if (err != NC_NOERR) return err;
+
+    // Record the data variable id
+    err = ncchkp->driver->put_var(ncchkp->ncp, varp->varid, NULL, NULL, NULL, NULL, &(varp->datavarid), 1, MPI_INT, NC_REQ_WR | NC_REQ_BLK | NC_REQ_FLEX | NC_REQ_COLL);
+    if (err != NC_NOERR) return err;
+
+    // Free up buffers
+    NCI_Free(cstart);
+    NCI_Free(cend);
+    NCI_Free(ccord);
+    NCI_Free(tsize);
+    NCI_Free(tssize);
+    NCI_Free(tstart);
+    NCI_Free(mychunks);
+    NCI_Free(sendcounts);
+    NCI_Free(sdispls);
+    NCI_Free(recvcounts);
+    NCI_Free(rdispls);
+    NCI_Free(packoff);
+    NCI_Free(zipsize);
+    NCI_Free(zdispls);
+    NCI_Free(zsize_local);
+    NCI_Free(zsize_all);
+    NCI_Free(zdispls_all);
+    NCI_Free(zbuf);
+    NCI_Free(xbuf);
+    NCI_Free(sbuf);
+    NCI_Free(rbuf);
+    NCI_Free(start_all[0]);
+    NCI_Free(count_all[0]);
+    NCI_Free(stride_all[0]);
+    NCI_Free(start_all);
+    NCI_Free(count_all);
+    NCI_Free(stride_all);
+    NCI_Free(zstarts[0]);
+    NCI_Free(zcounts[0]);
+    NCI_Free(zstarts);
+    NCI_Free(zcounts);
+
+    return NC_NOERR;
+}
+
+/* Placeholder: charges the same elapsed time (t9 - t0) to every profiling
+ * counter; kept only as a reference list of the counters. */
+static void profile(NC_chk *ncchkp, double t0, double t9){
+    /* Profiling information */
+    ncchkp->profile.total_data += t9 - t0;
+    ncchkp->profile.total_meta += t9 - t0;
+    ncchkp->profile.max_buffer += t9 - t0;
+    ncchkp->profile.total_time += t9 - t0;
+    ncchkp->profile.cb_time += t9 - t0;
+    ncchkp->profile.io_time += t9 - t0;
+
+    ncchkp->profile.cb_init_time += t9 - t0;        // Calculating the number of reqs
+    ncchkp->profile.cb_sync_time += t9 - t0;        // Syncing the number of reqs
+    ncchkp->profile.cb_pack_req_time += t9 - t0;    // Packing requests
+    ncchkp->profile.cb_pack_rep_time += t9 - t0;    // Packing replies
+    ncchkp->profile.cb_unpack_req_time += t9 - t0;  // Unpacking incoming requests
+    ncchkp->profile.cb_unpack_rep_time += t9 - t0;  // Unpacking incoming replies
+    ncchkp->profile.cb_send_req_time += t9 - t0;    // Posting and waiting sends (requests)
+    ncchkp->profile.cb_send_rep_time += t9 - t0;    // Posting and waiting sends (replies)
+    ncchkp->profile.cb_recv_req_time += t9 - t0;    // Posting and waiting recvs (requests)
+    ncchkp->profile.cb_recv_rep_time += t9 - 
t0;  // Posting and waiting recvs (replies)
+    ncchkp->profile.cb_self_time += t9 - t0;        // Handling our own data
+
+    ncchkp->profile.io_wr_time += t9 - t0;
+    ncchkp->profile.io_rd_time += t9 - t0;
+    ncchkp->profile.io_com_time += t9 - t0;
+    ncchkp->profile.io_decom_time += t9 - t0;
+    ncchkp->profile.io_sync_time += t9 - t0;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_lists.c b/src/drivers/ncchunkio/ncchkioi_lists.c
new file mode 100644
index 000000000..d89b510aa
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_lists.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2018, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <mpi.h>
+#include <pnc_debug.h>
+#include <common.h>
+#include "ncchkio_internal.h"
+
+int ncchkioi_var_list_init(NC_chk_var_list *list) {
+    list->cnt = 0;
+    list->nalloc = 0;
+    return NC_NOERR;
+}
+
+int ncchkioi_var_list_free(NC_chk_var_list *list) {
+    int i;
+    if (list->nalloc > 0){
+        for(i = 0; i < list->cnt; i++){
+            ncchkioi_var_free(list->data + i);
+        }
+        NCI_Free(list->data);
+    }
+    return NC_NOERR;
+}
+
+int ncchkioi_var_list_add(NC_chk_var_list *list) {
+    if (list->nalloc == 0){
+        list->nalloc = 16;
+        list->data = NCI_Malloc(list->nalloc * sizeof(NC_chk_var));
+        CHK_ALLOC(list->data)
+    }
+    else if (list->nalloc == list->cnt){
+        list->nalloc *= 2;
+        list->data = NCI_Realloc(list->data, list->nalloc * sizeof(NC_chk_var));
+        CHK_ALLOC(list->data)
+    }
+
+    return ((list->cnt)++);
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_nonblocking.c b/src/drivers/ncchunkio/ncchkioi_nonblocking.c
new file mode 100644
index 000000000..03b6dbbeb
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_nonblocking.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (C) 2017, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <mpi.h>
+#include <pnc_debug.h>
+#include <common.h>
+#include <ncchkio_driver.h>
+
+#define PUT_ARRAY_SIZE 128 /* Size of the initial put list */
+#define SIZE_MULTIPLIER 2  /* When the list is full, we reallocate it to SIZE_MULTIPLIER times the original size */
+
+/* getlist is a module in the ADIOS driver that manages nonblocking get request objects
+ * It consists of a pool of request objects (reqs) and request ids (ids)
+ * It is implemented by three arrays with the same number of entries
+ * Id i corresponds to the i-th request object
+ * We issue a request object by issuing the corresponding request id
+ * ids is initialized with increasing ids, i.e. ids[i] = i
+ * Ids are issued from the beginning of the ids array
+ * We keep track of each id's location within the ids array in the pos array. 
Initially, pos[i] = i
+ * A cursor nused tracks the number of issued ids; it also marks the position of the next unused id ready to be issued
+ *   ids[0:nused]      => active (used) request ids
+ *   ids[nused:nalloc] => available (unused) request ids
+ * When issuing an id, we take ids[nused] and increase nused by 1
+ * When recycling an id, we swap it with the id just before the position marked by nused and decrease nused by 1, so the recycled id falls back into the unused pool
+ * NOTE: Ids are not guaranteed to be issued in continuous, increasing order
+ * NOTE: ids is simply a pool housing request ids; the position of an id within ids is not fixed and carries no meaning
+ *
+ * Example:
+ * Initial:
+ * ids = 0 1 2 3
+ *       ^
+ *       nused = 0
+ * After issuing 2 ids:
+ *       issued | available ids --->
+ * ids = 0 1 2 3
+ *           ^
+ *           nused = 2
+ * Recycling id 0:
+ *         | available ids --->
+ * ids = 1 0 2 3
+ *         ^
+ *         nused = 1
+ * Recycling id 1:
+ *       | available ids --->
+ * ids = 1 0 2 3
+ *       ^
+ *       nused = 0
+ */
+
+/*
+ * Initialize the put list
+ * ids[0:nused]      => active (used) request ids
+ * ids[nused:nalloc] => available (unused) request ids
+ */
+int ncchkioi_req_list_init(NC_chk_req_list *lp) {
+    int err=NC_NOERR;
+    int i;
+
+    /* Initialize parameters and allocate the arrays */
+    lp->nused = 0;
+    lp->nalloc = PUT_ARRAY_SIZE;
+    lp->reqs = (NC_chk_req*)NCI_Malloc(lp->nalloc * sizeof(NC_chk_req));
+    CHK_PTR(lp->reqs)
+    lp->ids = (int*)NCI_Malloc(lp->nalloc * SIZEOF_INT);
+    CHK_PTR(lp->ids)
+    lp->pos = (int*)NCI_Malloc(lp->nalloc * SIZEOF_INT);
+    CHK_PTR(lp->pos)
+
+    /* Initialize the values of ids and pos
+     * Assign increasing unique ids
+     */
+    for (i=0; i<lp->nalloc; i++) {
+        lp->ids[i] = i;   // Unique ids
+        lp->pos[i] = i;   // Not in use
+    }
+
+err_out:;
+    return err;
+}
+
+/*
+ * Enlarge the put list
+ * When there are no more unused ids to issue, we must add more ids to the pool
+ * We simply enlarge the ids, pos, and reqs arrays
+ * The extended part is initialized as usual
+ */
+static int ncchkioi_req_list_resize(NC_chk_req_list *lp)
+{
+    int i;
+    size_t nsize;
+    void *ptr;
+
+    /* Calculate the new size */
+    nsize = lp->nalloc * SIZE_MULTIPLIER;
+
+    /* Realloc reqs, ids, and pos */
+    ptr = NCI_Realloc(lp->reqs, nsize * sizeof(NC_chk_req));
+    if (ptr == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM);
+    lp->reqs = (NC_chk_req*)ptr;
+
+    ptr = NCI_Realloc(lp->ids, nsize * SIZEOF_INT);
+    if (ptr == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM);
+    lp->ids = (int*)ptr;
+
+    ptr = NCI_Realloc(lp->pos, nsize * SIZEOF_INT);
+    if (ptr == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM);
+    lp->pos = (int*)ptr;
+
+    /* Initialize the values of ids and pos in the extended part
+     * Assign increasing unique ids
+     */
+    for (i=lp->nalloc; i<(int)nsize; i++) {
+        lp->ids[i] = i;   // Unique ids
+        lp->pos[i] = i;   // Default position
+    }
+
+    lp->nalloc = nsize;
+
+    return NC_NOERR;
+}
+
+/*
+ * Clean up the put list
+ */
+int ncchkioi_req_list_free(NC_chk_req_list *lp)
+{
+    NCI_Free(lp->reqs);
+    NCI_Free(lp->ids);
+    NCI_Free(lp->pos);
+
+    return NC_NOERR;
+}
+
+/*
+ * Allocate a new request object from the list and return its id
+ * We first check whether there are unused ids
+ * If there aren't, we increase the size of the pool, bringing in new ids
+ * Then we issue the id at position nused and increase nused by 1
+ */
+int ncchkioi_req_list_add(NC_chk_req_list *lp, int *id)
+{
+    int err=NC_NOERR;
+
+    /* Increase the size if necessary */
+    if (lp->nused == lp->nalloc) {
+        err = ncchkioi_req_list_resize(lp);
+        if (err != NC_NOERR) return err;
+    }
+
+    /* Take the first unused id, marked by nused */
+    *id = 
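+    /* ids[0:nused] hold the issued ids and ids[nused:nalloc] the available
+     * ones, so taking ids[nused] and bumping the cursor issues an id in
+     * O(1).  E.g. with ids = {3,0,1,2} and nused = 1, this add returns
+     * id 0 and leaves nused = 2. */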
lp->ids[lp->nused++]; + + return NC_NOERR; +} + +/* + * Recycle a request object in the put list + * We need to maintain the position of each request id in the ids list + * ids[0:nused] => active (used) request ids + * ids[nused:nalloc] => available (unused) request ids + */ +int ncchkioi_req_list_remove(NC_chk_req_list *lp, int reqid) { + NC_chk_req * req = lp->reqs + reqid; + + /* Clean up request */ + if (req->start != NULL){ + NCI_Free(req->start); + } + if (req->count != NULL){ + NCI_Free(req->count); + } + if (req->starts != NULL){ + NCI_Free(req->starts); + } + if (req->counts != NULL){ + NCI_Free(req->counts); + } + if (req->stride != NULL){ + NCI_Free(req->stride); + } + if (req->xbufs != NULL){ + NCI_Free(req->xbufs); + } + if (req->xbuf != req->buf){ + NCI_Free(req->xbuf); + } + + /* Return id to the list */ + lp->nused--; + lp->ids[lp->pos[reqid]] = lp->ids[lp->nused]; + lp->pos[lp->ids[lp->nused]] = lp->pos[reqid]; + lp->ids[lp->nused] = reqid; + lp->pos[reqid] = lp->nused; + + return NC_NOERR; +} diff --git a/src/drivers/ncchunkio/ncchkioi_profile.m4 b/src/drivers/ncchunkio/ncchkioi_profile.m4 new file mode 100644 index 000000000..93f7a8157 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_profile.m4 @@ -0,0 +1,98 @@ +dnl Process this m4 file to produce 'C' language file. +dnl +dnl If you see this line, you can ignore the next one. +/* Do not edit this file. It is produced from the corresponding .m4 source */ +dnl +/* + * Copyright (C) 2021, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ +dnl +include(`foreach.m4')`'dnl +include(`foreach_idx.m4')`'dnl +include(`list_len.m4')`'dnl +include(`utils.m4')`'dnl +include(`ncchkioi_profile_timers.m4')`'dnl +define(`upcase', `translit(`$*', `a-z', `A-Z')')`'dnl +define(`CONCATE',`$1$2')`'dnl +changecom(`##', `')`'dnl +dnl +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "ncchkio_internal.h" +#include "ncchkioi_profile.h" + +/* + * Report performance profiling + */ +#ifdef PNETCDF_PROFILING + +static double tmax[NC_CHK_NTIMER], tmin[NC_CHK_NTIMER], tmean[NC_CHK_NTIMER], tvar[NC_CHK_NTIMER], tvar_local[NC_CHK_NTIMER]; + +const char * const tname[NC_CHK_NTIMER]={ +foreach(`t', NC_CHK_TIMERS, `"CONCATE(`nc_chk_timer_', t)", +')dnl +}; + +void ncchkioi_profile_add_time (NC_chk *ncchkp, int id, double t) { + assert (id >= 0 && id < NC_CHK_NTIMER); + ncchkp->profile.tt[id] += t; + ncchkp->profile.cnt[id]++; +} + +void ncchkioi_print_profile(NC_chk *ncchkp){ + int i; + + MPI_Reduce (ncchkp->profile.tt, tmax, NC_CHK_NTIMER, MPI_DOUBLE, MPI_MAX, 0, ncchkp->comm); + MPI_Reduce (ncchkp->profile.tt, tmin, NC_CHK_NTIMER, MPI_DOUBLE, MPI_MIN, 0, ncchkp->comm); + MPI_Allreduce (ncchkp->profile.tt, tmean, NC_CHK_NTIMER, MPI_DOUBLE, MPI_SUM, ncchkp->comm); + for (i = 0; i < NC_CHK_NTIMER; i++) { + tmean[i] /= ncchkp->np; + tvar_local[i] = (ncchkp->profile.tt[i] - tmean[i]) * (ncchkp->profile.tt[i] - tmean[i]); + } + MPI_Reduce (tvar_local, tvar, NC_CHK_NTIMER, MPI_DOUBLE, MPI_SUM, 0, ncchkp->comm); + + if (ncchkp->rank == 0) { + for (i = 0; i < NC_CHK_NTIMER; i++) { + printf ("#%%$: %s_time_mean: %lf\n", tname[i], tmean[i]); + printf ("#%%$: %s_time_max: %lf\n", tname[i], tmax[i]); + printf ("#%%$: %s_time_min: %lf\n", tname[i], tmin[i]); + printf ("#%%$: %s_time_var: %lf\n\n", tname[i], tvar[i]); + } + } + + MPI_Reduce (ncchkp->profile.cnt, tmax, NC_CHK_NTIMER, MPI_DOUBLE, MPI_MAX, 0, 
ncchkp->comm); + MPI_Reduce (ncchkp->profile.cnt, tmin, NC_CHK_NTIMER, MPI_DOUBLE, MPI_MIN, 0, ncchkp->comm); + MPI_Allreduce (ncchkp->profile.cnt, tmean, NC_CHK_NTIMER, MPI_DOUBLE, MPI_SUM, ncchkp->comm); + for (i = 0; i < NC_CHK_NTIMER; i++) { + tmean[i] /= ncchkp->np; + tvar_local[i] = (ncchkp->profile.cnt[i] - tmean[i]) * (ncchkp->profile.cnt[i] - tmean[i]); + } + MPI_Reduce (tvar_local, tvar, NC_CHK_NTIMER, MPI_DOUBLE, MPI_SUM, 0, ncchkp->comm); + + if (ncchkp->rank == 0) { + for (i = 0; i < NC_CHK_NTIMER; i++) { + printf ("#%%$: %s_count_mean: %lf\n", tname[i], tmean[i]); + printf ("#%%$: %s_count_max: %lf\n", tname[i], tmax[i]); + printf ("#%%$: %s_count_min: %lf\n", tname[i], tmin[i]); + printf ("#%%$: %s_count_var: %lf\n\n", tname[i], tvar[i]); + } + } +} +#endif + + + + diff --git a/src/drivers/ncchunkio/ncchkioi_profile.m4h b/src/drivers/ncchunkio/ncchkioi_profile.m4h new file mode 100644 index 000000000..a07fbb0f2 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_profile.m4h @@ -0,0 +1,75 @@ +dnl Process this m4 file to produce 'C' language file. +dnl +dnl If you see this line, you can ignore the next one. +/* Do not edit this file. It is produced from the corresponding .m4 source */ +dnl +/* + * Copyright (C) 2021, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ +dnl +include(`foreach.m4')`'dnl +include(`foreach_idx.m4')`'dnl +include(`list_len.m4')`'dnl +include(`utils.m4')`'dnl +include(`ncchkioi_profile_timers.m4')`'dnl +define(`upcase', `translit(`$*', `a-z', `A-Z')')`'dnl +define(`CONCATE',`$1$2')`'dnl +changecom(`##', `')`'dnl +dnl +#pragma once + +#ifdef HAVE_CONFIG_H +#include +#endif + +/* + * Report performance profiling + */ +#ifdef PNETCDF_PROFILING + +#define NC_CHK_NTIMER list_len(NC_CHK_TIMERS) + +foreach_idx(`t', `i', NC_CHK_TIMERS, `#define CONCATE(`NC_CHK_TIMER_', upcase(t)) i +')dnl + +#define NC_CHK_TIMER_START(A) ncchkp->profile.st[A] = MPI_Wtime(); +#define NC_CHK_TIMER_PAUSE(A) { \ + ncchkp->profile.tt[A] += MPI_Wtime() - ncchkp->profile.st[A]; \ +} +#define NC_CHK_TIMER_STOP(A) { \ + NC_CHK_TIMER_PAUSE(A) \ + ncchkp->profile.cnt[A] ++; \ +} +#define NC_CHK_TIMER_SWAP(A, B) { \ + double tmp = MPI_Wtime(); \ + ncchkp->profile.tt[A] += tmp - ncchkp->profile.st[A]; \ + ncchkp->profile.cnt[A] ++; \ + ncchkp->profile.st[B] = tmp; \ +} +#define NC_CHK_TIMER_STOPEX(A, B) { \ + double tmp = MPI_Wtime(); \ + ncchkp->profile.tt[A] += tmp - ncchkp->profile.st[A]; \ + ncchkp->profile.cnt[A] ++; \ + ncchkp->profile.tt[B] -= tmp - ncchkp->profile.st[A]; \ +} + +dnl +typedef struct NC_chk_timers { + /* Profiling information */ + double st[NC_CHK_NTIMER]; + double tt[NC_CHK_NTIMER]; + double cnt[NC_CHK_NTIMER]; +} NC_chk_timers; + +#else + +#define NC_CHK_TIMER_START(A) +#define NC_CHK_TIMER_STOP(A) +#define NC_CHK_TIMER_PAUSE(A) +#define NC_CHK_TIMER_SWAP(A, B) +#define NC_CHK_TIMER_STOPEX(A, B) + +#endif + diff --git a/src/drivers/ncchunkio/ncchkioi_profile_timers.m4 b/src/drivers/ncchunkio/ncchkioi_profile_timers.m4 new file mode 100644 index 000000000..27652837b --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_profile_timers.m4 @@ -0,0 +1,73 @@ +define(`NC_CHK_TIMERS', `( `total', dnl + `var_init', dnl + `var_init_meta', dnl + `var_init_csize', dnl + `var_init_cown', dnl + `var_resize', dnl + `put', dnl + `put_cb', dnl + `put_cb_init', dnl + `put_cb_sync', dnl + `put_cb_pack_req', dnl + `put_cb_pack_rep', dnl + `put_cb_unpack_req', dnl + `put_cb_unpack_rep', dnl + `put_cb_send_req', dnl + 
`put_cb_send_rep', dnl + `put_cb_recv_req', dnl + `put_cb_recv_rep', dnl + `put_cb_self', dnl + `put_cb_barr', dnl + `put_bg', dnl + `put_bg_init', dnl + `put_bg_cache', dnl + `put_bg_rd', dnl + `put_bg_decom', dnl + `put_io', dnl + `put_io_init', dnl + `put_io_com', dnl + `put_io_sync', dnl + `put_io_wr', dnl + `put_io_barr', dnl + `get', dnl + `get_resize', dnl + `get_cb', dnl + `get_cb_init', dnl + `get_cb_sync', dnl + `get_cb_pack_req', dnl + `get_cb_pack_rep', dnl + `get_cb_unpack_req', dnl + `get_cb_unpack_rep', dnl + `get_cb_send_req', dnl + `get_cb_send_rep', dnl + `get_cb_recv_req', dnl + `get_cb_recv_rep', dnl + `get_cb_self', dnl + `get_cb_barr', dnl + `get_io', dnl + `get_io_init', dnl + `get_io_cache', dnl + `get_io_rd', dnl + `get_io_decom', dnl + `get_convert', dnl + `finalize', dnl + `finalize_meta', dnl + `iput', dnl + `iget', dnl + `wait', dnl + `wait_put', dnl + `wait_put_barr', dnl + `wait_get', dnl + `put_size', dnl + `get_size', dnl + `send_size', dnl + `recv_size', dnl + `nsend', dnl + `nrecv', dnl + `nremote', dnl + `nreq', dnl + `nlocal', dnl + `nchunk', dnl + `var_size', dnl + `var_zsize', dnl +)')`'dnl \ No newline at end of file diff --git a/src/drivers/ncchunkio/ncchkioi_put_var.c b/src/drivers/ncchunkio/ncchkioi_put_var.c new file mode 100644 index 000000000..528736bb3 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_put_var.c @@ -0,0 +1,766 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +int ncchkioi_put_var_cb_chunk (NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + void *buf) { + int err=NC_NOERR; + int i, j; + int cid; // Chunk iterator + + MPI_Offset *ostart, *osize; + int *tsize, *tssize, *tstart, *tsizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; + + int *wcnt_local, *wcnt_all; // Number of processes that writes to each chunk + + int nread; // Chunks to read for background + int *rids; + + int overlapsize; // Size of overlaping region of request and chunk + int max_tbuf = 0; // Size of intermediate buffer + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreqs, *rreqs; // Send and recv req + MPI_Status *sstats, *rstats; // Send and recv status + char **sbufs, **rbufs; // Send and recv buffer + int *rsizes; // recv size of each message + MPI_Message rmsg; // Receive message + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Allocate buffering for write count + wcnt_local = (int *)NCI_Malloc (sizeof (int) * varp->nchunk * 2); + wcnt_all = wcnt_local + varp->nchunk; + + // Allocate buffering for overlaping index + tstart = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3); + tsize = tstart + varp->ndim; + tssize = tsize + varp->ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3); + osize = ostart + varp->ndim; + + // Chunk iterator + citr = osize + 
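+    /* tstart/tsize/tssize share one int allocation and ostart/osize/citr
+     * one MPI_Offset allocation, three ndim-sized slices each; citr walks
+     * the origin coordinate of every chunk the request touches. */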
varp->ndim; + + // We need to calculate the size of message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (wcnt_local, 0, sizeof (int) * varp->nchunk); + nsend = 0; + + // Iterate through chunks + ncchkioi_chunk_itr_init (varp, start, count, citr, &cid); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] != ncchkp->rank) { + // Count number of mnessage we need to send + nsend++; + wcnt_local[cid] = 1; + } else { + // We mark covered chunk of our own to prevent unnecessary calculation of overlap + // -1 is purely a mark, we need to add 1 back to global message count + wcnt_local[cid] = -1; + max_tbuf = varp->chunksize; + } + } while (ncchkioi_chunk_itr_next (varp, start, count, citr, &cid)); + + // Allocate buffer for sending + sbufs = (char **)NCI_Malloc (sizeof (char *) * nsend); + sreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nsend); + sstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nsend); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (wcnt_local, wcnt_all, varp->nchunk, MPI_INT, MPI_SUM, ncchkp->comm); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_PACK_REQ) + + // Calculate number of recv request + // This is for all the chunks + nrecv = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + // We don't need message for our own data + nrecv += wcnt_all[cid] - wcnt_local[cid]; + } + rreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nrecv); + rstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nrecv); + rbufs = (char **)NCI_Malloc (sizeof (char *) * nrecv); + rsizes = (int *)NCI_Malloc (sizeof (int) * nrecv); + + // Post send + nsend = 0; + // Iterate through chunks + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // We got something to send if we are not owner + if (varp->chunk_owner[cid] != ncchkp->rank) { + // Calculate chunk overlap + overlapsize = varp->esize; + for (j = 0; j < varp->ndim; j++) { overlapsize *= osize[j]; } + + // Allocate buffer + sbufs[nsend] = (char *)NCI_Malloc (overlapsize + sizeof (int) * varp->ndim * 2); + + // Metadata + packoff = 0; + tstartp = (int *)sbufs[nsend]; + packoff += varp->ndim * sizeof (int); + tsizep = (int *)(sbufs[nsend] + packoff); + packoff += varp->ndim * sizeof (int); + for (j = 0; j < varp->ndim; j++) { + tstartp[j] = (int)(ostart[j] - citr[j]); + tsizep[j] = (int)osize[j]; + } + + // Pack type + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_PACK (buf, 1, ptype, sbufs[nsend], packoff + overlapsize, &packoff, MPI_COMM_SELF); + + MPI_Type_free (&ptype); + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Send the request + CHK_ERR_ISEND (sbufs[nsend], packoff, MPI_BYTE, varp->chunk_owner[cid], cid, + ncchkp->comm, sreqs + nsend); + + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_PUT_CB_SEND_REQ, NC_CHK_TIMER_PUT_CB_PACK_REQ) + nsend++; + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + 
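+    /* Each message sent above is laid out as
+     *   [int tstart[ndim]] [int tsize[ndim]] [packed overlap data]
+     * exactly the metadata the owner needs to rebuild the matching
+     * subarray type on its side. */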
NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Post recv + nrecv = 0; + for (j = 0; j < varp->nmychunk; j++) { + cid = varp->mychunks[j]; + // We are the owner of the chunk + // Receive data from other process + for (i = 0; i < wcnt_all[cid] - wcnt_local[cid]; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, cid, ncchkp->comm, &rmsg, rstats); + CHK_ERR_GET_COUNT (rstats, MPI_BYTE, rsizes + nrecv); + + // Allocate buffer + rbufs[nrecv] = (char *)NCI_Malloc (rsizes[nrecv]); + + // Post irecv + CHK_ERR_IMRECV (rbufs[nrecv], rsizes[nrecv], MPI_BYTE, &rmsg, rreqs + nrecv); + + nrecv++; + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Wait for all send + CHK_ERR_WAITALL (nsend, sreqs, sstats); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Preparing chunk cache + nread = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + if (wcnt_all[cid] && varp->chunk_cache[cid] == NULL) { + if (varp->chunk_index[cid].len > 0) { nread++; } + } + } + rids = (int *)NCI_Malloc (sizeof (int) * nread); + nread = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + if (wcnt_all[cid] || wcnt_local[cid]) { + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (NC_chk_cache*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; } + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + } + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + + // Read background + ncchkioi_load_var_bg (ncchkp, varp, nread, rids); + + // Allocate intermediate buffer + if (max_tbuf > 0) { tbuf = (char *)NCI_Malloc (max_tbuf); } + + // For each chunk we own, we need to receive incoming data + nrecv = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SELF) + + // Handle our own data first if we have any + if (wcnt_local[cid] < 0) { + // Convert chunk id to iterator + get_chunk_itr (varp, cid, citr); + + // Calculate overlapping region + overlapsize = get_chunk_overlap (varp, citr, start, count, ostart, osize); + + if (overlapsize > 0) { + // Pack type from user buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (buf, 1, ptype, tbuf, varp->chunksize, &packoff, MPI_COMM_SELF); + overlapsize = packoff; + + MPI_Type_free (&ptype); + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, varp->chunk_cache[cid]->buf, 1, ptype, MPI_COMM_SELF); + + 
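+                /* The chunk cache now differs from what is on file; the
+                 * dirty flag set below tells the later flush step which
+                 * chunks must be re-compressed and rewritten. */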
MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SELF) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Now, it is time to process data from other processes + + // Wait for all send requests related to this chunk + // We remove the impact of -1 mark in wcnt_local[cid] + CHK_ERR_WAITALL (wcnt_all[cid] - wcnt_local[cid], rreqs + nrecv, rstats + nrecv); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + + // Process data received + for (j = nrecv; j < nrecv + wcnt_all[cid] - wcnt_local[cid]; j++) { + // Metadata + packoff = 0; + tstartp = (int *)rbufs[j]; + packoff += varp->ndim * sizeof (int); + tsizep = (int *)(rbufs[j] + packoff); + packoff += varp->ndim * sizeof (int); + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, varp->chunkdim, tsizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + CHK_ERR_UNPACK (rbufs[j], rsizes[j], &packoff, varp->chunk_cache[cid]->buf, 1, ptype, MPI_COMM_SELF); + MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + nrecv += wcnt_all[cid] - wcnt_local[cid]; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + } + + // Free buffers + NCI_Free (wcnt_local); + + NCI_Free (tstart); + + NCI_Free (ostart); + + NCI_Free (sreqs); + NCI_Free (sstats); + for (i = 0; i < nsend; i++) { NCI_Free (sbufs[i]); } + NCI_Free (sbufs); + + NCI_Free (rreqs); + NCI_Free (rstats); + for (i = 0; i < nrecv; i++) { NCI_Free (rbufs[i]); } + NCI_Free (rbufs); + NCI_Free (rsizes); + + if (tbuf != NULL) { NCI_Free (tbuf); } + + if (rids != NULL) { NCI_Free (rids); } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB) + +err_out:; + return err; +} + +int ncchkioi_put_var_cb_proc (NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + void *buf) { + int err=NC_NOERR; + int i, j, k; + int cid, cown; // Chunk iterator + + MPI_Offset *ostart = NULL, *osize; + int *tsize, *tssize, *tstart = NULL, *tssizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; // Bounding box for chunks overlapping my own write region + + int *wcnt_local = NULL, *wcnt_all; // Number of processes that writes to each chunk + int wrange_local[2], wrange_all[2]; // Number of processes that writes to each chunk + + int nread; // Chunks to read for background + int *rids; + + int overlapsize; // Size of overlaping region of request and chunk + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreq = NULL, *rreq = NULL; // Send and recv req + MPI_Status *sstat = NULL, rstat; // Send and recv status + char **sbuf = NULL, **sbufp, **rbuf = NULL, **rbufp; // Send and recv buffer + int *rsize = NULL, *ssize = NULL; // recv size of each message + int *sdst; // recv size of each message + int *smap; + size_t bsize; + MPI_Message rmsg; // Receive message + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Allocate buffering for write count + wcnt_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->np * 3); + CHK_PTR (wcnt_local) + wcnt_all = wcnt_local + ncchkp->np; + smap = wcnt_all + ncchkp->np; + + // Allocate buffering for overlaping index + tstart = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3); + CHK_PTR (tstart) + tssize = tstart + varp->ndim; + tsize = 
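+    /* Unlike the per-chunk path above, this per-process path aggregates
+     * everything bound for the same owner into one message, so the number
+     * of sends scales with the number of peer processes rather than with
+     * the number of chunks touched. */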
tssize + varp->ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3); + CHK_PTR (ostart) + osize = ostart + varp->ndim; + + // Chunk iterator + citr = osize + varp->ndim; + + // We need to calculate the size of message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (wcnt_local, 0, sizeof (int) * ncchkp->np); + nsend = 0; + + // Count total number of messages and build a map of accessed chunk to list of comm + // datastructure + wrange_local[0] = varp->nchunk; + wrange_local[1] = 0; + ncchkioi_chunk_itr_init (varp, start, count, citr, &cid); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + + // Mapping to skip list of send requests + if (wcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; } + wcnt_local[cown] = 1; // Need to send message if not owner + + // Record lowest and highest chunk accessed + if (wrange_local[0] > cid) { wrange_local[0] = cid; } + if (wrange_local[1] < cid) { wrange_local[1] = cid; } + } while (ncchkioi_chunk_itr_next (varp, start, count, citr, &cid)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SYNC) + + // Sync number of messages of each chunk and access range + CHK_ERR_ALLREDUCE (wcnt_local, wcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm); + wrange_local[1] *= -1; + CHK_ERR_ALLREDUCE (wrange_local, wrange_all, 2, MPI_INT, MPI_MIN, ncchkp->comm); + nrecv = wcnt_all[ncchkp->rank] - + wcnt_local[ncchkp->rank]; // We don't need to receive request from self + wrange_all[1] *= -1; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_PACK_REQ) + + // Allocate data structure for messaging + sbuf = (char **)NCI_Malloc (sizeof (char *) * nsend * 2); + CHK_PTR (sbuf) + sbufp = sbuf + nsend; + ssize = (int *)NCI_Malloc (sizeof (int) * nsend * 2); + CHK_PTR (ssize) + sdst = ssize + nsend; + sreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nsend); + CHK_PTR (sreq) + sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nsend); + CHK_PTR (sstat) + + rbuf = (char **)NCI_Malloc (sizeof (char *) * nrecv * 2); + CHK_PTR (rbuf) + rbufp = rbuf + nrecv; + rsize = (int *)NCI_Malloc (sizeof (int) * nrecv); + CHK_PTR (rsize) + rreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nrecv); + CHK_PTR (rreq) + + // Count size of each request + memset (ssize, 0, sizeof (int) * nsend); + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + sdst[j] = cown; // Record a reverse map by the way + + // Count overlap + overlapsize = varp->esize; + for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ssize[j] += overlapsize + sizeof (int) * (varp->ndim * 2 + 1); + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + // Allocate buffer for send + bsize = 0; + for (i = 0; i < nsend; i++) { bsize += ssize[i]; } + if (nsend > 0) { + sbuf[0] = sbufp[0] = (char *)NCI_Malloc (bsize); + CHK_PTR (sbuf[0]) + for (i = 1; i < nsend; i++) + sbuf[i] = sbufp[i] = sbuf[i-1] + ssize[i-1]; + } + + // Pack requests + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = 
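+        /* Second pass over the same chunks: per-owner sizes are known and
+         * buffers allocated, so append one [cid][tstart][tssize][data]
+         * record per chunk to the owner's message. */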
varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + + // Metadata + *((int *)sbufp[j]) = cid; + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += varp->ndim * sizeof (int); + tssizep = (int *)sbufp[j]; + sbufp[j] += varp->ndim * sizeof (int); + for (i = 0; i < varp->ndim; i++) { + tstartp[i] = (int)(ostart[i] - citr[i]); + tssizep[i] = (int)osize[i]; + } + + // Pack type + for (i = 0; i < varp->ndim; i++) { + tstart[i] = (int)(ostart[i] - start[i]); + tsize[i] = (int)count[i]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + packoff = 0; + int outsize = ssize[j] - sizeof(int) * (varp->ndim * 2 + 1); + CHK_ERR_PACK (buf, 1, ptype, sbufp[j], outsize, &packoff, MPI_COMM_SELF); + sbufp[j] += packoff; + MPI_Type_free (&ptype); + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Post send + for (i = 0; i < nsend; i++) { + CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Post recv + for (i = 0; i < nrecv; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat); + CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i); + + // Allocate buffer + rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]); + CHK_PTR (rbuf[i]) + + // Post irecv + CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Preparing chunk cache + for (j = 0; j < varp->nmychunk && varp->mychunks[j] < wrange_all[0]; j++) + ; + for (k = j; k < varp->nmychunk && varp->mychunks[k] <= wrange_all[1]; k++) + ; + rids = (int *)NCI_Malloc (sizeof (int) * (k - j)); + nread = 0; + for (i = j; i < k; i++) { + cid = varp->mychunks[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; } + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + + // Read background + err = ncchkioi_load_var_bg (ncchkp, varp, nread, rids); + CHK_ERR + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SELF) + + tbuf = (char *)NCI_Malloc (varp->chunksize); + CHK_PTR (tbuf) + + // Handle our own data + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from user buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (buf, 1, ptype, tbuf, varp->chunksize, &packoff, MPI_COMM_SELF); + MPI_Type_free (&ptype); + overlapsize = 
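+            /* Local data takes the same two-hop route as remote data:
+             * pack from the user buffer into the contiguous scratch
+             * buffer, then unpack into the chunk cache with a second
+             * subarray type, so one code path handles every layout. */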
packoff; + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, varp->chunk_cache[cid]->buf, 1, ptype, MPI_COMM_SELF); + MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SELF) + + // Handle incoming requests + for (i = 0; i < varp->ndim; i++) { tsize[i] = varp->chunkdim[i]; } + for (i = 0; i < nrecv; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Will wait any provide any benefit? + MPI_Waitany (nrecv, rreq, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + + while (rbufp[j] < rbuf[j] + rsize[j]) { + // Metadata + cid = *(int *)(rbufp[j]); + rbufp[j] += sizeof (int); + tstartp = (int *)rbufp[j]; + rbufp[j] += varp->ndim * sizeof (int); + tssizep = (int *)rbufp[j]; + rbufp[j] += varp->ndim * sizeof (int); + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + packoff = 0; + CHK_ERR_UNPACK (rbufp[j], rsize[j], &packoff, varp->chunk_cache[cid]->buf, 1, ptype, MPI_COMM_SELF); + rbufp[j] += packoff; + MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + CHK_ERR_WAITALL (nsend, sreq, sstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + +err_out:; + + // Free buffers + NCI_Free (wcnt_local); + + NCI_Free (tstart); + + NCI_Free (ostart); + + NCI_Free (sreq); + NCI_Free (sstat); + NCI_Free (ssize); + if (nsend > 0) { NCI_Free (sbuf[0]); } + NCI_Free (sbuf); + + NCI_Free (rreq); + for (i = 0; i < nrecv; i++) { NCI_Free (rbuf[i]); } + NCI_Free (rbuf); + NCI_Free (rsize); + + NCI_Free (tbuf); + + NCI_Free (rids); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB) + + return err; +} + +int ncchkioi_put_var (NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + void *buf) { + int err=NC_NOERR; + + if (varp->isrec) { + if (ncchkp->recsize < start[0] + count[0]) { ncchkp->recsize = start[0] + count[0]; } + CHK_ERR_ALLREDUCE (MPI_IN_PLACE, &(ncchkp->recsize), 1, MPI_LONG_LONG, MPI_MAX, + ncchkp->comm); // Sync number of recs + if (varp->dimsize[0] < ncchkp->recsize) { + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_RESIZE) + + err = ncchkioi_var_resize (ncchkp, varp); + CHK_ERR + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_RESIZE) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT) + } + } + + // Collective buffer + switch (ncchkp->comm_unit) { + case NC_CHK_COMM_CHUNK: + err = ncchkioi_put_var_cb_chunk (ncchkp, varp, start, count, stride, buf); + break; + case NC_CHK_COMM_PROC: + err = ncchkioi_put_var_cb_proc (ncchkp, varp, start, count, stride, buf); + break; + } + CHK_ERR + + // Write the compressed variable + err = ncchkioi_save_var (ncchkp, varp); + CHK_ERR + +err_out:; + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_put_varn.c 
b/src/drivers/ncchunkio/ncchkioi_put_varn.c new file mode 100644 index 000000000..94bb60887 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_put_varn.c @@ -0,0 +1,810 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +int ncchkioi_put_varn_cb_chunk (NC_chk *ncchkp, + NC_chk_var *varp, + int nreq, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + MPI_Offset *const *strides, + void **bufs) { + int err=NC_NOERR; + int i, j; + int cid, req; // Chunk and request iterator + + int *tsize, *tssize, *tstart, *tsizep, *tstartp; // Size for sub-array type + MPI_Offset *ostart, *osize; + MPI_Offset *citr; + + int *wcnt_local, *wcnt_all; // Number of processes that writes to each chunk + + int nread; // Chunks to read for background + int *rids; + + int overlapsize; // Size of overlaping region of request and chunk + int max_tbuf; // Size of intermediate buffer + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreqs, *rreqs; // Send and recv req + MPI_Status *sstats, *rstats; // Send and recv status + char **sbufs, **rbufs; // Send and recv buffer + int *rsizes; // recv size of each message + MPI_Message rmsg; // Receive message + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Allocate buffering for write count + wcnt_local = (int *)NCI_Malloc (sizeof (int) * varp->nchunk * 2); + wcnt_all = wcnt_local + varp->nchunk; + + // Allocate buffering for overlaping index + tstart = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3); + tsize = tstart + varp->ndim; + tssize = tsize + varp->ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3); + osize = ostart + varp->ndim; + + // Chunk iterator + citr = osize + varp->ndim; + + // We need to calculate the size of message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (wcnt_local, 0, sizeof (int) * varp->nchunk); + nsend = 0; + max_tbuf = 0; + for (req = 0; req < nreq; req++) { + // Initialize chunk iterator + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + + // Iterate through chunks + do { + // Calculate overlapping + overlapsize = varp->esize; + for (j = 0; j < varp->ndim; j++) { overlapsize *= osize[j]; } + + if (varp->chunk_owner[cid] != ncchkp->rank) { + // Count number of mnessage we need to send + if (wcnt_local[cid] == 0) { nsend++; } + wcnt_local[cid] += overlapsize + sizeof (int) * 2 * varp->ndim; + } else { + // We mark covered chunk of our own to prevent unnecessary calculation of overlap + // -1 is purely a mark, we need to add 1 back to global message count + wcnt_local[cid] = -1; + + // Record max overlapsize so we know how large the intermediate buffer is needed + // later + if (max_tbuf 
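+                /* In the varn path wcnt_local[cid] first accumulates the
+                 * byte size of the chunk's message (data plus per-request
+                 * start/size metadata); after the send buffer is allocated
+                 * it is recycled as a 0/1 message-pending flag. */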
< overlapsize) { max_tbuf = overlapsize; } + } + + } while ( + ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize)); + } + + // Allocate buffer for sending + sbufs = (char **)NCI_Malloc (sizeof (char *) * nsend); + sreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nsend); + sstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nsend); + j = 0; + // Allocate buffer for data + for (cid = 0; cid < varp->nchunk; cid++) { + // Count number of mnessage we need to send + if (wcnt_local[cid] > 0) { + // Add space for number of reqs + sbufs[j++] = (char *)NCI_Malloc (wcnt_local[cid]); + // We don't need message size anymore, wcnt_local is used to track number of message + // from now on + wcnt_local[cid] = 1; + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (wcnt_local, wcnt_all, varp->nchunk, MPI_INT, MPI_SUM, ncchkp->comm); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SYNC) + + // Calculate number of recv request + // This is for all the chunks + nrecv = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + // We don't need message for our own data + nrecv += wcnt_all[cid] - wcnt_local[cid]; + } + rreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nrecv); + rstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nrecv); + rbufs = (char **)NCI_Malloc (sizeof (char *) * nrecv); + rsizes = (int *)NCI_Malloc (sizeof (int) * nrecv); + + // Post send and recv + nrecv = 0; + nsend = 0; + for (cid = 0; cid < varp->nchunk; cid++) { + if (varp->chunk_owner[cid] == ncchkp->rank) { + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // We are the owner of the chunk + // Receive data from other process + for (i = 0; i < wcnt_all[cid] - wcnt_local[cid]; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, cid, ncchkp->comm, &rmsg, rstats); + CHK_ERR_GET_COUNT (rstats, MPI_BYTE, rsizes + nrecv); + + // Allocate buffer + rbufs[nrecv] = (char *)NCI_Malloc (rsizes[nrecv]); + + // Post irecv + CHK_ERR_IMRECV (rbufs[nrecv], rsizes[nrecv], MPI_BYTE, &rmsg, rreqs + nrecv); + nrecv++; + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + } else { + // If we any of our request overlap with this chunk, we need to send data + // We send only 1 message for 1 chunk + if (wcnt_local[cid] > 0) { + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_PACK_REQ) + + packoff = 0; + // Get chunk iterator + get_chunk_itr (varp, cid, citr); + for (req = 0; req < nreq; req++) { + // Calculate chunk overlap + overlapsize = + get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize); + + // If current request have any overlap with the chunk, we pack the data and + // metadata + if (overlapsize > 0) { + // Metadata + tstartp = (int *)(sbufs[nsend] + packoff); + packoff += varp->ndim * sizeof (int); + tsizep = (int *)(sbufs[nsend] + packoff); + packoff += varp->ndim * sizeof (int); + for (j = 0; j < varp->ndim; j++) { + tstartp[j] = (int)(ostart[j] - citr[j]); + tsizep[j] = (int)osize[j]; + } + + // Pack type + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - starts[req][j]); + tsize[j] = (int)counts[req][j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + CHK_ERR_PACK (bufs[req], 1, ptype, sbufs[nsend], packoff + overlapsize, + &packoff, ncchkp->comm); + 
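                        /* Wire format of the per-chunk message being packed here,
+                         * as consumed by the chunk owner's unpack loop further below:
+                         *   repeated once per overlapping request:
+                         *     int start[ndim];  // overlap start relative to the chunk
+                         *     int count[ndim];  // overlap extent along each dimension
+                         *     packed payload of esize * prod(count) bytes
+                         * The receiver keeps unpacking records until packoff reaches
+                         * the probed message size. */
+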
MPI_Type_free (&ptype); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Send the request + CHK_ERR_ISEND (sbufs[nsend], packoff, MPI_BYTE, varp->chunk_owner[cid], cid, + ncchkp->comm, sreqs + nsend); + nsend++; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + } + } + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Wait for all send + CHK_ERR_WAITALL (nsend, sreqs, sstats); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Preparing chunk cache + nread = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + if (wcnt_all[cid] && varp->chunk_cache[cid] == NULL) { + if (varp->chunk_index[cid].len > 0) { nread++; } + } + } + rids = (int *)NCI_Malloc (sizeof (int) * nread); + nread = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + if (wcnt_all[cid] || wcnt_local[cid]) { + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (NC_chk_cache*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; } + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + } + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + + // Read background + ncchkioi_load_var_bg (ncchkp, varp, nread, rids); + + // Allocate intermediate buffer + if (max_tbuf > 0) { tbuf = (char *)NCI_Malloc (max_tbuf); } + + // For each chunk we own, we need to receive incoming data + nrecv = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SELF) + + // Handle our own data first if we have any + if (wcnt_local[cid] < 0) { + for (req = 0; req < nreq; req++) { + // Convert chunk id to iterator + get_chunk_itr (varp, cid, citr); + + // Calculate overlapping region + overlapsize = + get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize); + + // If anything overlaps + if (overlapsize > 0) { + // Pack type from user buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - starts[req][j]); + tsize[j] = (int)counts[req][j]; + tssize[j] = (int)osize[j]; + } + + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (bufs[req], 1, ptype, tbuf, overlapsize, &packoff, ncchkp->comm); + + MPI_Type_free (&ptype); + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, varp->chunk_cache[cid]->buf, 1, + ptype, ncchkp->comm); + + MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SELF) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Now, it is time to process data from other processes + + // Wait for all send requests related to this chunk + // We remove the 
impact of -1 mark in wcnt_local[cid] + CHK_ERR_WAITALL (wcnt_all[cid] - wcnt_local[cid], rreqs + nrecv, rstats + nrecv); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + + // Process data received + for (j = nrecv; j < nrecv + wcnt_all[cid] - wcnt_local[cid]; j++) { + packoff = 0; + while (packoff < rsizes[j]) { + // Metadata + tstartp = (int *)(rbufs[j] + packoff); + packoff += varp->ndim * sizeof (int); + tsizep = (int *)(rbufs[j] + packoff); + packoff += varp->ndim * sizeof (int); + + // Packtype + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, varp->chunkdim, tsizep, tstartp, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + CHK_ERR_UNPACK (rbufs[j], rsizes[j], &packoff, varp->chunk_cache[cid]->buf, 1, + ptype, ncchkp->comm); + MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + } + nrecv += wcnt_all[cid] - wcnt_local[cid]; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + } + + // Free buffers + NCI_Free (wcnt_local); + + NCI_Free (tstart); + + NCI_Free (ostart); + + NCI_Free (sreqs); + NCI_Free (sstats); + for (i = 0; i < nsend; i++) { NCI_Free (sbufs[i]); } + NCI_Free (sbufs); + + NCI_Free (rreqs); + NCI_Free (rstats); + for (i = 0; i < nrecv; i++) { NCI_Free (rbufs[i]); } + NCI_Free (rbufs); + NCI_Free (rsizes); + + if (tbuf != NULL) { NCI_Free (tbuf); } + + if (rids != NULL) { NCI_Free (rids); } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB) + +err_out:; + return err; +} + +int ncchkioi_put_varn_cb_proc (NC_chk *ncchkp, + NC_chk_var *varp, + int nreq, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + void **bufs) { + int err=NC_NOERR; + int i, j, k; + int cid, cown; // Chunk iterator and owner + int req; + + MPI_Offset *ostart, *osize; + int *tsize, *tssize, *tstart, *tssizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; // Bounding box for chunks overlapping my own write region + + int *wcnt_local, *wcnt_all; // Number of processes that writes to each chunk + int wrange_local[2], wrange_all[2]; // Number of processes that writes to each chunk + + int nread; // Chunks to read for background + int *rids; + + int overlapsize; // Size of overlaping region of request and chunk + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreq, *rreq; // Send and recv req + MPI_Status *sstat, rstat; // Send and recv status + char **sbuf, **sbufp, **rbuf, **rbufp; // Send and recv buffer + int *rsize, *ssize; // recv size of each message + int *sdst; // recv size of each message + int *smap; + MPI_Message rmsg; // Receive message + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Allocate buffering for write count + wcnt_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->np * 3); + wcnt_all = wcnt_local + ncchkp->np; + smap = wcnt_all + ncchkp->np; + + // Allocate buffering for overlaping index + tstart = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3); + tssize = tstart + varp->ndim; + tsize = tssize + varp->ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3); + osize = ostart + varp->ndim; + + // Chunk iterator + citr = osize + varp->ndim; + + // We need to calculate the size of message of each processes + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner 
of a chunk, we need to send message + memset (wcnt_local, 0, sizeof (int) * ncchkp->np); + nsend = 0; + + // Count total number of messages and build a map of accessed chunk to list of comm + // datastructure + wrange_local[0] = varp->nchunk; + wrange_local[1] = 0; + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init (varp, starts[req], counts[req], citr, + &cid); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + + // Mapping to skip list of send requests + if (wcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; } + wcnt_local[cown] = 1; // Need to send message if not owner + + // Record lowest and highest chunk accessed + if (wrange_local[0] > cid) { wrange_local[0] = cid; } + if (wrange_local[1] < cid) { wrange_local[1] = cid; } + } while (ncchkioi_chunk_itr_next (varp, starts[req], counts[req], citr, &cid)); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (wcnt_local, wcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm); + wrange_local[1] *= -1; + CHK_ERR_ALLREDUCE (wrange_local, wrange_all, 2, MPI_INT, MPI_MIN, ncchkp->comm); + nrecv = wcnt_all[ncchkp->rank] - + wcnt_local[ncchkp->rank]; // We don't need to receive request form self + wrange_all[1] *= -1; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_PACK_REQ) + + // Allocate data structure for messaging + sbuf = (char **)NCI_Malloc (sizeof (char *) * nsend * 2); + sbufp = sbuf + nsend; + ssize = (int *)NCI_Malloc (sizeof (int) * nsend * 2); + sdst = ssize + nsend; + sreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nsend); + sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nsend); + + rbuf = (char **)NCI_Malloc (sizeof (char *) * nrecv * 2); + rbufp = rbuf + nrecv; + rsize = (int *)NCI_Malloc (sizeof (int) * nrecv); + rreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nrecv); + + // Count size of each request + memset (ssize, 0, sizeof (int) * nsend); + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + sdst[j] = cown; // Record a reverse map by the way + + // Count overlap + overlapsize = varp->esize; + for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ssize[j] += overlapsize + sizeof (int) * (varp->ndim * 2 + 1); + } + } while ( + ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize)); + } + // Allocate buffer for send + for (i = 0; i < nsend; i++) { sbuf[i] = sbufp[i] = (char *)NCI_Malloc (ssize[i]); } + + // Pack requests + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + + // Metadata + *((int *)sbufp[j]) = cid; + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += varp->ndim * sizeof (int); + tssizep = (int *)sbufp[j]; + sbufp[j] += varp->ndim * sizeof (int); + for (i = 0; i < varp->ndim; i++) { + tstartp[i] = (int)(ostart[i] - citr[i]); + tssizep[i] = (int)osize[i]; + } + + // Pack type + for (i = 0; i < varp->ndim; i++) { + tstart[i] = (int)(ostart[i] - starts[req][i]); + tsize[i] = 
(int)counts[req][i]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + packoff = 0; + CHK_ERR_PACK (bufs[req], 1, ptype, sbufp[j], ssize[j], &packoff, ncchkp->comm); + sbufp[j] += packoff; + MPI_Type_free (&ptype); + } + } while ( + ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize)); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Post send + for (i = 0; i < nsend; i++) { + CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Post recv + for (i = 0; i < nrecv; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat); + CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i); + + // Allocate buffer + rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]); + + // Post irecv + CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Preparing chunk cache + for (j = 0; j < varp->nmychunk && varp->mychunks[j] < wrange_all[0]; j++) + ; + for (k = j; k < varp->nmychunk && varp->mychunks[k] <= wrange_all[1]; k++) + ; + rids = (int *)NCI_Malloc (sizeof (int) * (k - j)); + nread = 0; + for (i = j; i < k; i++) { + cid = varp->mychunks[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; } + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + + // Read background + ncchkioi_load_var_bg (ncchkp, varp, nread, rids); + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SELF) + + tbuf = (char *)NCI_Malloc (varp->chunksize); + + // Handle our own data + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from user buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - starts[req][j]); + tsize[j] = (int)counts[req][j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (bufs[req], 1, ptype, tbuf, varp->chunksize, &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + overlapsize = packoff; + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, varp->chunk_cache[cid]->buf, 1, ptype, + ncchkp->comm); + MPI_Type_free (&ptype); + + // Mark 
chunk as dirty + varp->dirty[cid] = 1; + } + } while ( + ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize)); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SELF) + + // Handle incoming requests + for (i = 0; i < varp->ndim; i++) { tsize[i] = varp->chunkdim[i]; } + for (i = 0; i < nrecv; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Will wait any provide any benefit? + MPI_Waitany (nrecv, rreq, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + + while (rbufp[j] < rbuf[j] + rsize[j]) { + // Metadata + cid = *(int *)(rbufp[j]); + rbufp[j] += sizeof (int); + tstartp = (int *)rbufp[j]; + rbufp[j] += varp->ndim * sizeof (int); + tssizep = (int *)rbufp[j]; + rbufp[j] += varp->ndim * sizeof (int); + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + packoff = 0; + CHK_ERR_UNPACK (rbufp[j], rsize[j], &packoff, varp->chunk_cache[cid]->buf, 1, ptype, + ncchkp->comm); + rbufp[j] += packoff; + MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + CHK_ERR_WAITALL (nsend, sreq, sstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Free buffers + NCI_Free (wcnt_local); + + NCI_Free (tstart); + + NCI_Free (ostart); + + NCI_Free (sreq); + NCI_Free (sstat); + NCI_Free (ssize); + for (i = 0; i < nsend; i++) { NCI_Free (sbuf[i]); } + NCI_Free (sbuf); + + NCI_Free (rreq); + for (i = 0; i < nrecv; i++) { NCI_Free (rbuf[i]); } + NCI_Free (rbuf); + NCI_Free (rsize); + + NCI_Free (tbuf); + + NCI_Free (rids); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB) + +err_out:; + return err; +} + +int ncchkioi_put_varn (NC_chk *ncchkp, + NC_chk_var *varp, + int nreq, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf) { + int err=NC_NOERR; + int i, j; + MPI_Offset rsize; + char *bptr = (char *)buf; + char **bufs; + + if (varp->isrec) { + for (i = 0; i < nreq; i++) { + if (ncchkp->recsize < starts[i][0] + counts[i][0]) { + ncchkp->recsize = starts[i][0] + counts[i][0]; + } + } + CHK_ERR_ALLREDUCE (MPI_IN_PLACE, &(ncchkp->recsize), 1, MPI_LONG_LONG, MPI_MAX, + ncchkp->comm); // Sync number of recs + if (varp->dimsize[0] < ncchkp->recsize) { + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_RESIZE) + + ncchkioi_var_resize (ncchkp, varp); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_RESIZE) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT) + } + } + + // Calculate buffer offset of each request + bufs = (char **)NCI_Malloc (sizeof (char *) * nreq); + for (i = 0; i < nreq; i++) { + bufs[i] = bptr; + rsize = varp->esize; + for (j = 0; j < varp->ndim; j++) { rsize *= counts[i][j]; } + bptr += rsize; + } + + // Collective buffer + switch (ncchkp->comm_unit) { + case NC_CHK_COMM_CHUNK: + ncchkioi_put_varn_cb_chunk (ncchkp, varp, nreq, starts, counts, NULL, (void **)bufs); + break; + case NC_CHK_COMM_PROC: + ncchkioi_put_varn_cb_proc (ncchkp, varp, nreq, starts, counts, (void **)bufs); + break; + } + + // Write the compressed variable + ncchkioi_save_var (ncchkp, varp); + +err_out:; + NCI_Free (bufs); + + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_util.c b/src/drivers/ncchunkio/ncchkioi_util.c new file mode 100644 index 000000000..30d0ebff1 --- /dev/null +++ 
b/src/drivers/ncchunkio/ncchkioi_util.c
@@ -0,0 +1,460 @@
+/*
+ * Copyright (C) 2017, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ncchkio_internal.h"
+
+/* return internal size for values of specified netCDF type */
+MPI_Offset NC_Type_size (nc_type type) { /* netCDF type code */
+    switch (type) {
+        case NC_BYTE:
+            return sizeof (char);
+        case NC_CHAR:
+            return sizeof (char);
+        case NC_SHORT:
+            return sizeof (short);
+        case NC_INT:
+            return sizeof (int);
+        case NC_FLOAT:
+            return sizeof (float);
+        case NC_DOUBLE:
+            return sizeof (double);
+        case NC_UBYTE:
+            return sizeof (unsigned char);
+        case NC_USHORT:
+            return sizeof (unsigned short);
+        case NC_UINT:
+            return sizeof (unsigned int);
+        case NC_INT64:
+            return sizeof (long long);
+        case NC_UINT64:
+            return sizeof (unsigned long long);
+        default:
+            return 0;
+    }
+}
+
+/*
+ * Convert NC type to MPI type
+ */
+MPI_Datatype ncchkioi_nc_to_mpi_type (nc_type atype) {
+    switch (atype) {
+        case NC_BYTE:
+            return MPI_BYTE;
+        case NC_CHAR:
+            return MPI_CHAR;
+        case NC_SHORT:
+            return MPI_SHORT;
+        case NC_INT:
+            return MPI_INT;
+        case NC_FLOAT:
+            return MPI_FLOAT;
+        case NC_DOUBLE:
+            return MPI_DOUBLE;
+    }
+
+    return NC_NAT;
+}
+
+/*
+ * Extract MPI hints and set up the flags
+ */
+int ncchkioi_extract_hint (NC_chk *ncchkp, MPI_Info info) {
+    int flag;
+    char value[MPI_MAX_INFO_VAL];
+
+    // Block assignment
+    MPI_Info_get (info, "nc_chk_block_mapping", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) {
+        if (strcmp (value, "static") == 0) {
+            ncchkp->blockmapping = NC_CHK_MAPPING_STATIC;
+        } else {
+            printf ("Warning: Unknown mapping %s, using static\n", value);
+            ncchkp->blockmapping = NC_CHK_MAPPING_STATIC;
+        }
+    } else {
+        ncchkp->blockmapping = NC_CHK_MAPPING_STATIC;
+    }
+
+    // Messaging unit
+    MPI_Info_get (info, "nc_chk_comm_unit", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) {
+        if (strcmp (value, "chunk") == 0) {
+            ncchkp->comm_unit = NC_CHK_COMM_CHUNK;
+        } else if (strcmp (value, "proc") == 0) {
+            ncchkp->comm_unit = NC_CHK_COMM_PROC;
+        } else {
+            printf ("Warning: Unknown messaging unit %s, using proc\n", value);
+            ncchkp->comm_unit = NC_CHK_COMM_PROC;
+        }
+    } else {
+        ncchkp->comm_unit = NC_CHK_COMM_PROC;
+    }
+
+    // Delay init
+    ncchkp->delay_init = 0;
+    MPI_Info_get (info, "nc_chk_delay_init", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) {
+        if (strcmp (value, "1") == 0) { ncchkp->delay_init = 1; }
+    }
+
+    // Exact chunk owner assignment
+    ncchkp->exact_cown = 0;
+    MPI_Info_get (info, "nc_chk_exact_cown", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) {
+        if (strcmp (value, "1") == 0) { ncchkp->exact_cown = 1; }
+    }
+
+    // Additional reserved space in file header
+    ncchkp->hdr_reserve = 1048576;  // 1 MiB default
+    MPI_Info_get (info, "nc_chk_hdr_reserve", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) { ncchkp->hdr_reserve = atoi (value); }
+
+    // Reserve space for records
+    ncchkp->default_recnalloc = NC_CHK_DEFAULT_REC_ALLOC;
+    MPI_Info_get (info, "nc_chk_nrec", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) { ncchkp->default_recnalloc = atoi (value); }
+
+    // Default filter
+    ncchkp->default_filter = NC_CHK_FILTER_NONE;
+    MPI_Info_get (info,
"nc_chunk_default_filter", MPI_MAX_INFO_VAL - 1, value, &flag); + if (flag) { + if (strcmp (value, "none") == 0) { + ncchkp->default_filter = NC_CHK_FILTER_NONE; + } else if (strcmp (value, "dummy") == 0) { + ncchkp->default_filter = NC_CHK_FILTER_DUMMY; + } else if (strcmp (value, "zlib") == 0) { + ncchkp->default_filter = NC_CHK_FILTER_ZLIB; + } else if (strcmp (value, "sz") == 0) { + ncchkp->default_filter = NC_CHK_FILTER_SZ; + } else { + if (ncchkp->rank == 0) { printf ("Warning: Unknown filter %s, use none\n", value); } + } + } + + // Buffer size + ncchkp->cache_limit = 0; // Unlimited + ncchkp->cache_limit_hint = 0; + MPI_Info_get (info, "nc_chk_buffer_size", MPI_MAX_INFO_VAL - 1, value, &flag); + if (flag) { + sscanf (value, "%zd", &(ncchkp->cache_limit_hint)); + + if (ncchkp->cache_limit_hint > 0) { ncchkp->cache_limit = ncchkp->cache_limit_hint; } + } + + // Chunk owning size penalty + ncchkp->cown_ratio = 0.1; + MPI_Info_get (info, "nc_chk_cown_ratio", MPI_MAX_INFO_VAL - 1, value, &flag); + if (flag) { ncchkp->cown_ratio = atof (value); } + + return NC_NOERR; +} + +/* + * Export hint based on flag + * NOTE: We only set up the hint if it is not the default setting + * user hint maching the default behavior will be ignored + */ +int ncchkioi_export_hint (NC_chk *ncchkp, MPI_Info info) { + char value[MPI_MAX_INFO_VAL]; + + MPI_Info_set (info, "nc_compression", "enable"); + + switch (ncchkp->blockmapping) { + case NC_CHK_MAPPING_STATIC: + MPI_Info_set (info, "nc_chk_block_mapping", "static"); + break; + } + + switch (ncchkp->comm_unit) { + case NC_CHK_COMM_CHUNK: + MPI_Info_set (info, "nc_chk_comm_unit", "chunk"); + break; + case NC_CHK_COMM_PROC: + MPI_Info_set (info, "nc_chk_comm_unit", "proc"); + break; + } + + // Delay inint + if (ncchkp->delay_init) { + MPI_Info_set (info, "nc_chk_delay_init", "1"); + } else { + MPI_Info_set (info, "nc_chk_delay_init", "0"); + } + + // Exact cown + if (ncchkp->exact_cown) { + MPI_Info_set (info, "nc_chk_exact_cown", "1"); + } else { + MPI_Info_set (info, "nc_chk_exact_cown", "0"); + } + + // Additional reserved space in file header + sprintf (value, "%zd", ncchkp->hdr_reserve); + MPI_Info_set (info, "nc_chk_hdr_reserve", value); + + // Reserve space for records + sprintf (value, "%lld", ncchkp->default_recnalloc); + MPI_Info_set (info, "nc_chk_nrec", value); + + // Zip driver + switch (ncchkp->default_filter) { + case NC_CHK_FILTER_NONE: + MPI_Info_set (info, "nc_chk_driver", "none"); + break; + case NC_CHK_FILTER_DUMMY: + MPI_Info_set (info, "nc_chk_driver", "dummy"); + break; + case NC_CHK_FILTER_ZLIB: + MPI_Info_set (info, "nc_chk_driver", "zlib"); + break; + case NC_CHK_FILTER_SZ: + MPI_Info_set (info, "nc_chk_driver", "sz"); + break; + } + + // Buffer size + sprintf (value, "%zd", ncchkp->cache_limit); + MPI_Info_set (info, "nc_chk_buffer_size", value); + + return NC_NOERR; +} + +int ncchkioi_print_buffer_int (char *prefix, int *buf, int len) { + int i; + int rank, np; + int plen, rlen; + char *out, *outp; + char rankstr[16]; + + MPI_Comm_size (MPI_COMM_WORLD, &np); + MPI_Comm_rank (MPI_COMM_WORLD, &rank); + + rlen = sprintf (rankstr, "Rank %d: ", rank); + + plen = strlen (prefix); + out = outp = (char *)NCI_Malloc (len * 12 + 2 + plen + rlen); + + rlen = sprintf (outp, "%s ", rankstr); + outp += rlen; + plen = sprintf (outp, "%s ", prefix); + outp += plen; + for (i = 0; i < len; i++) { + plen = sprintf (outp, "%d ", buf[i]); + outp += plen; + } + + printf ("%s\n", out); + fflush (stdout); + + NCI_Free (out); + + return NC_NOERR; +} + +int 
ncchkioi_print_buffer_int64 (char *prefix, long long *buf, int len) { + int i; + int rank, np; + int plen, rlen; + char *out, *outp; + char rankstr[16]; + + MPI_Comm_size (MPI_COMM_WORLD, &np); + MPI_Comm_rank (MPI_COMM_WORLD, &rank); + + rlen = sprintf (rankstr, "Rank %d: ", rank); + + plen = strlen (prefix); + out = outp = (char *)NCI_Malloc (len * 18 + 2 + plen + rlen); + + rlen = sprintf (outp, "%s ", rankstr); + outp += rlen; + plen = sprintf (outp, "%s ", prefix); + outp += plen; + for (i = 0; i < len; i++) { + plen = sprintf (outp, "%lld ", buf[i]); + outp += plen; + } + + printf ("%s\n", out); + fflush (stdout); + + NCI_Free (out); + + return NC_NOERR; +} +#define NCCHKIOISWAP(V0, V1) \ + fdisps[V0] ^= fdisps[V1]; \ + fdisps[V1] ^= fdisps[V0]; \ + fdisps[V0] ^= fdisps[V1]; \ + mdisps[V0] ^= mdisps[V1]; \ + mdisps[V1] ^= mdisps[V0]; \ + mdisps[V0] ^= mdisps[V1]; \ + lens[V0] ^= lens[V1]; \ + lens[V1] ^= lens[V0]; \ + lens[V0] ^= lens[V1]; + +void ncchkioi_sort_file_offset (int len, MPI_Aint *fdisps, MPI_Aint *mdisps, int *lens) { + int i, j, p; + + if (len < 16) { + j = 1; + while (j) { + j = 0; + for (i = 0; i < len - 1; i++) { + if (fdisps[i] > fdisps[i + 1]) { + NCCHKIOISWAP (i, i + 1); + j = 1; + } + } + } + } else { + j = len / 2; + p = len - 1; + NCCHKIOISWAP (j, p); + + for (i = j = 0; i < len; i++) { + if (fdisps[i] < fdisps[p]) { + if (i != j) { NCCHKIOISWAP (i, j); } + j++; + } + } + + NCCHKIOISWAP (p, j); + + ncchkioi_sort_file_offset (j, fdisps, mdisps, lens); + ncchkioi_sort_file_offset (len - j - 1, fdisps + j + 1, mdisps + j + 1, lens + j + 1); + } +} + +int ncchkioi_subarray_off_len ( + int ndim, int *tsize, int *tssize, int *tstart, MPI_Offset *off, int *len) { + int err=NC_NOERR; + int i; + + // Try single row + err = 0; + for (i = 0; i < ndim - 1; i++) { + if (tssize[i] != 1) { + err = -1; + break; + } + } + if (err) { + // Try contiguous block + err = 0; + for (i = 1; i < ndim; i++) { + if (tssize[i] < tsize[i]) { + err = -1; + break; + } + } + if (!err) { + *len = 1; + for (i = 0; i < ndim; i++) { (*len) *= tssize[i]; } + } + } else { + *len = tssize[ndim - 1]; + } + + if (!err) { + *off = 0; + for (i = 0; i < ndim; i++) { (*off) = (*off) * tsize[i] + tstart[i]; } + } + + return err; +} + +#ifdef PNETCDF_PROFILING +int ncchkioi_update_statistics (NC_chk *ncchkp) { + int i, j; + int cid; + NC_chk_var *varp; + + ncchkp->var_size_sum = ncchkp->var_zsize_sum = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + varp = ncchkp->vars.data + i; + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + for (j = 0; j < varp->nmychunk; j++) { + cid = varp->mychunks[j]; + ncchkp->var_zsize_sum += varp->chunk_index[cid].len; + } + ncchkp->var_size_sum += varp->nmychunk * varp->chunksize; + } + } + + return NC_NOERR; +} +#endif + +int ncchkioi_get_default_chunk_dim (NC_chk *ncchkp) { + int err = NC_NOERR, ret; + int i; + int ndim, dimid; + int len; + char *cur, *pre; + char name[1024]; + char *env = getenv ("PNETCDF_DEFAULT_CHUNK_DIM"); + + if (env != NULL) { + err = ncchkp->driver->inq (ncchkp->ncp, &ndim, NULL, NULL, NULL); + if (err != NC_NOERR) return err; + + if (ndim > ncchkp->ndim) { + ncchkp->chunkdim = NCI_Realloc (ncchkp->chunkdim, ndim * sizeof (int)); + for (i = ncchkp->ndim; i < ndim; i++) { ncchkp->chunkdim[i] = 0; } + ncchkp->ndim = ndim; + } + + cur = pre = env; + for (cur = pre = env; (*cur) != '\0'; cur++) { + if ((*cur) == ';') { + if (sscanf (pre, "%s : %d ;", name, &len) == 2) { + if (len > 0) { + ret = ncchkp->driver->inq_dimid (ncchkp->ncp, name, &dimid); + if (ret 
== NC_NOERR) { ncchkp->chunkdim[dimid] = len; } + } + } + pre = cur + 1; + } + } + } + + return NC_NOERR; +} + +/* in-place byte swap */ +void ncchkioi_idx_in_swapn (NC_chk_chunk_index_entry *idx, MPI_Offset nelems) { + NC_chk_chunk_index_entry *bufp; + + for (bufp = idx; bufp < idx + nelems; bufp++) { + bufp->off = ((bufp->off & 0x00000000000000FFULL) << 56) | + ((bufp->off & 0x000000000000FF00ULL) << 40) | + ((bufp->off & 0x0000000000FF0000ULL) << 24) | + ((bufp->off & 0x00000000FF000000ULL) << 8) | + ((bufp->off & 0x000000FF00000000ULL) >> 8) | + ((bufp->off & 0x0000FF0000000000ULL) >> 24) | + ((bufp->off & 0x00FF000000000000ULL) >> 40) | + ((bufp->off & 0xFF00000000000000ULL) >> 56); + bufp->len = ((bufp->len) << 24) | (((bufp->len) & 0x0000ff00) << 8) | + (((bufp->len) & 0x00ff0000) >> 8) | (((bufp->len) >> 24)); + } +} diff --git a/src/drivers/ncchunkio/ncchkioi_var_init.c b/src/drivers/ncchunkio/ncchkioi_var_init.c new file mode 100644 index 000000000..fc7e239bf --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_var_init.c @@ -0,0 +1,577 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ncmpio/ncmpio_NC.h" +#include "ncchkio_internal.h" + +int ncchkioi_var_init_core ( + NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts) { + int err=NC_NOERR; + int ret; + int i, j; + int valid; + MPI_Offset len; + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + if (varp->chunkdim == NULL) { // This is a new uninitialized variable + // Init value + varp->mychunks = NULL; // To be added later + + // Update dimsize on rec dim + if (ncchkp->recdim >= 0) { + if (varp->dimsize[0] < ncchkp->recsize) { varp->dimsize[0] = ncchkp->recsize; } + } + + // Determine its block size + varp->chunkdim = (int *)NCI_Malloc (sizeof (int) * varp->ndim); + varp->nchunks = (int *)NCI_Malloc (sizeof (int) * varp->ndim); + + // First check attribute + valid = 1; + ret = ncchkp->driver->inq_att (ncchkp->ncp, varp->varid, "_chunkdim", NULL, &len); + if (ret == NC_NOERR && len == varp->ndim) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_chunkdim", + varp->chunkdim, MPI_INT); + if (ret != NC_NOERR) { valid = 0; } + // chunkdim must be at leasst 1 + for (j = 0; j < varp->ndim; j++) { + if (varp->chunkdim[j] <= 0) { + valid = 0; + printf ("Warning: chunk size invalid, use default"); + break; + } + } + } else { + valid = 0; + } + + // Now, try global default + if ((!valid) && ncchkp->chunkdim) { + valid = 1; + for (i = 0; i < varp->ndim; i++) { + if (ncchkp->chunkdim[varp->dimids[i]] > 0) { + varp->chunkdim[i] = ncchkp->chunkdim[varp->dimids[i]]; + } else { + valid = 0; + break; + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + + // Still no clue, try to infer form I/O pattern (expensive) + // If there is no I/O records, the default is just set to entire variable (only 1 chunk) + if (!valid) { + // Infering not supported + err = ncchkioi_calc_chunk_size (ncchkp, varp, nreq, starts, counts); + CHK_ERR + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + // 
Calculate total # chunks, # chunks along each dim, chunksize
+            varp->nchunkrec = 1;
+            varp->chunksize = NC_Type_size (varp->xtype);
+            for (i = 0; i < varp->ndim; i++) {  // chunkdim must be at least 1
+                if (varp->dimsize[i] % varp->chunkdim[i] == 0) {
+                    varp->nchunks[i] = (int)(varp->dimsize[i] / (MPI_Offset)varp->chunkdim[i]);
+                } else {
+                    varp->nchunks[i] = (int)(varp->dimsize[i] / (MPI_Offset)varp->chunkdim[i] + 1);
+                }
+                if (i > 0) { varp->nchunkrec *= varp->nchunks[i]; }
+                varp->chunksize *= varp->chunkdim[i];
+            }
+            if (varp->isrec) {
+                varp->nrec      = varp->nchunks[0];
+                varp->nrecalloc = ncchkp->default_recnalloc;
+                while (varp->nrecalloc < varp->nchunks[0]) {
+                    varp->nrecalloc *= NC_CHK_REC_MULTIPLIER;
+                }
+            } else {
+                varp->nrec      = 1;
+                varp->nrecalloc = 1;
+                varp->nchunkrec *= varp->nchunks[0];
+            }
+            varp->nchunk      = varp->nchunkrec * varp->nrec;
+            varp->nchunkalloc = varp->nrecalloc * varp->nchunkrec;
+
+            // Calculate number of chunks below each dimension
+            varp->cidsteps                 = (int *)NCI_Malloc (sizeof (int) * varp->ndim);
+            varp->cidsteps[varp->ndim - 1] = 1;
+            for (i = varp->ndim - 2; i >= 0; i--) {
+                varp->cidsteps[i] = varp->cidsteps[i + 1] * varp->nchunks[i + 1];
+            }
+
+            // Determine block ownership
+            varp->dirty       = (int *)NCI_Malloc (sizeof (int) * varp->nchunkalloc);
+            varp->chunk_cache = (NC_chk_cache **)NCI_Malloc (sizeof (char *) * varp->nchunkalloc);
+            memset (varp->chunk_cache, 0, sizeof (char *) * varp->nchunkalloc);
+            memset (varp->dirty, 0, sizeof (int) * varp->nchunkalloc);
+
+            // Block ownership to be decided later
+            varp->chunk_owner = (int *)NCI_Malloc (sizeof (int) * varp->nchunkalloc);
+
+            // Determine block offset
+            varp->chunk_index = (NC_chk_chunk_index_entry *)NCI_Malloc (
+                sizeof (NC_chk_chunk_index_entry) * (varp->nchunkalloc + 1));
+
+            // Offsets recorded in attributes only exist after opening an existing file;
+            // a new variable starts with an empty chunk index
+            if (varp->isnew) {
+                varp->metaoff = -1;
+                memset (varp->chunk_index, 0,
+                        sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1));
+            }
+
+            /* Select compression driver based on attribute */
+            ret = ncchkp->driver->inq_att (ncchkp->ncp, varp->varid, "_filter", NULL, &len);
+            if (ret == NC_NOERR && len == 1) {
+                ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_filter",
+                                               &(varp->filter), MPI_INT);
+                if (ret != NC_NOERR) { return err; }
+            } else {
+                varp->filter = ncchkp->default_filter;
+            }
+            switch (varp->filter) {
+                case NC_CHK_FILTER_NONE:
+                    varp->filter_driver = NULL;
+                    break;
+                case NC_CHK_FILTER_DUMMY:
+                    varp->filter_driver = ncchk_dummy_inq_driver ();
+                    break;
+#ifdef ENABLE_ZLIB
+                case NC_CHK_FILTER_ZLIB:
+                    varp->filter_driver = ncchk_zlib_inq_driver ();
+                    break;
+#endif
+#ifdef ENABLE_SZ
+                case NC_CHK_FILTER_SZ:
+                    varp->filter_driver = ncchk_sz_inq_driver ();
+                    break;
+#endif
+                default:
+                    if (ncchkp->rank == 0) {
+                        printf ("Warning: Unknown filter driver id %d, use NC_CHK_FILTER_DUMMY\n",
+                                varp->filter);
+                    }
+                    varp->filter_driver = ncchk_dummy_inq_driver ();
+                    break;
+            }
+
+            // Update max ndim and chunksize
+            if (ncchkp->max_ndim < varp->ndim) { ncchkp->max_ndim = varp->ndim; }
+            if (ncchkp->max_chunk_size < varp->chunksize) {
+                ncchkp->max_chunk_size = varp->chunksize;
+            }
+
+            if (ncchkp->cache_limit_hint == -1) {
+                ncchkp->cache_limit += (size_t) (varp->nmychunkrec) * (size_t) (varp->chunksize);
+            }
+        }
+    }
+
+err_out:;
+    return err;
+}
+
+int ncchkioi_var_init (
+    NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts) {
+    int err = NC_NOERR;
+
+    err = ncchkioi_var_init_core
(ncchkp, varp, nreq, starts, counts); + CHK_ERR + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + err = ncchkioi_calc_chunk_owner (ncchkp, varp, nreq, starts, counts); + CHK_ERR + } + +err_out:; + return err; +} + +void ncchkioi_var_free (NC_chk_var *varp) { + + if (varp->chunkdim != NULL) { + NCI_Free (varp->dimsize); + NCI_Free (varp->chunkdim); + NCI_Free (varp->dimids); + NCI_Free (varp->nchunks); + NCI_Free (varp->cidsteps); + NCI_Free (varp->chunk_index); + NCI_Free (varp->chunk_owner); + NCI_Free (varp->dirty); + // for(i = 0; i < varp->nmychunk; i++){ + // if (varp->chunk_cache[varp->mychunks[i]] != NULL){ + // NCI_Free(varp->chunk_cache[varp->mychunks[i]]); + // } + //} + NCI_Free (varp->chunk_cache); + NCI_Free (varp->mychunks); + } +} + +int ncchkioi_init_nvar_core_gather (NC_chk *ncchkp, + int nvar, + NC_chk_var **varps, + int *rcnt, + int *roff, + MPI_Offset **starts, + MPI_Offset **counts) { + int err=NC_NOERR; + int i, j; + NC_chk_var *varp; + ncchkioi_chunk_overlap_t *ocnt[2], *ocnt_all[2]; + size_t ocnt_size[2]; + MPI_Status stat; + MPI_Request req; + + // Iinit vars + ocnt_size[0] = ocnt_size[1] = 0; + ocnt[0] = ocnt[1] = NULL; + for (i = 0; i < nvar; i++) { + varp = varps[i]; + j = i & 1; + + err = ncchkioi_var_init_core (ncchkp, varp, rcnt[i], starts + roff[i], counts + roff[i]); + CHK_ERR + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + if (varp->nchunkrec > ocnt_size[j]) { + ocnt_size[j] = varp->nchunkrec; + NCI_Free (ocnt[j]); + ocnt[j] = (ncchkioi_chunk_overlap_t *)NCI_Malloc ( + sizeof (ncchkioi_chunk_overlap_t) * varp->nchunkrec * 2); + ocnt_all[j] = ocnt[j] + varp->nchunkrec; + } + + err = ncchkioi_calc_chunk_overlap (ncchkp, varp, rcnt[i], starts, counts, ocnt[j]); + CHK_ERR + } + + if ((i > 0) && (req != MPI_REQUEST_NULL)) { // Wait comm for prev var + err = MPI_Wait (&req, &stat); + ncchkioi_assign_chunk_owner (ncchkp, varps[i - 1], ocnt_all[(i - 1) & 1]); + ncchkioi_write_chunk_ocnt (ncchkp, varps[i - 1], ocnt[(i - 1) & 1], + sizeof (ncchkioi_chunk_overlap_t)); + } + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + err = ncchkioi_sync_ocnt_reduce (ncchkp, varp->nchunkrec, ocnt[j], ocnt_all[j], &req); + CHK_ERR + } else { + req = MPI_REQUEST_NULL; + } + } + // Last var + if (req != MPI_REQUEST_NULL) { + err = MPI_Wait (&req, &stat); + ncchkioi_assign_chunk_owner (ncchkp, varp, ocnt_all[(i - 1) & 1]); + ncchkioi_write_chunk_ocnt (ncchkp, varp, ocnt[(i - 1) & 1], + sizeof (ncchkioi_chunk_overlap_t)); + } + +err_out:; + NCI_Free (ocnt[0]); + NCI_Free (ocnt[1]); + return err; +} + +int ncchkioi_init_nvar_core_reduce (NC_chk *ncchkp, + int nvar, + NC_chk_var **varps, + int *rcnt, + int *roff, + MPI_Offset **starts, + MPI_Offset **counts) { + int err=NC_NOERR; + int i, j; + NC_chk_var *varp; + ncchkioi_chunk_overlap_t *ocnt[2], *ocnt_all[2]; + size_t ocnt_size[2]; + MPI_Status stat; + MPI_Request req; + + // Iinit vars + ocnt_size[0] = ocnt_size[1] = 0; + ocnt[0] = ocnt[1] = NULL; + for (i = 0; i < nvar; i++) { + varp = varps[i]; + j = i & 1; + + err = ncchkioi_var_init_core (ncchkp, varp, rcnt[i], starts + roff[i], counts + roff[i]); + CHK_ERR + + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_COWN) + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + if (varp->nchunkrec > ocnt_size[j]) { + ocnt_size[j] = varp->nchunkrec; + NCI_Free (ocnt[j]); + ocnt[j] = (ncchkioi_chunk_overlap_t *)NCI_Malloc ( + sizeof (ncchkioi_chunk_overlap_t) * varp->nchunkrec * 2); + ocnt_all[j] = ocnt[j] + varp->nchunkrec; + } + + err = ncchkioi_calc_chunk_overlap (ncchkp, varp, rcnt[i], 
starts + roff[i], + counts + roff[i], ocnt[j]); + CHK_ERR + } + + if ((i > 0) && (req != MPI_REQUEST_NULL)) { // Wait comm for prev var + err = MPI_Wait (&req, &stat); + ncchkioi_assign_chunk_owner (ncchkp, varps[i - 1], ocnt_all[(i - 1) & 1]); + ncchkioi_write_chunk_ocnt (ncchkp, varps[i - 1], ocnt[(i - 1) & 1], + sizeof (ncchkioi_chunk_overlap_t)); + } + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + ncchkioi_sync_ocnt_reduce (ncchkp, varp->nchunkrec, ocnt[j], ocnt_all[j], &req); + } else { + req = MPI_REQUEST_NULL; + } + + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_VAR_INIT_COWN, NC_CHK_TIMER_VAR_INIT_META) + } + // Last var + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_COWN) + if (req != MPI_REQUEST_NULL) { + err = MPI_Wait (&req, &stat); + ncchkioi_assign_chunk_owner (ncchkp, varp, ocnt_all[(i - 1) & 1]); + ncchkioi_write_chunk_ocnt (ncchkp, varp, ocnt[(i - 1) & 1], + sizeof (ncchkioi_chunk_overlap_t)); + } + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_VAR_INIT_COWN, NC_CHK_TIMER_VAR_INIT_META) + +err_out:; + NCI_Free (ocnt[0]); + NCI_Free (ocnt[1]); + return err; +} + +int ncchkioi_init_nvar (NC_chk *ncchkp, int nput, int *putreqs, int nget, int *getreqs) { + int err = NC_NOERR, ret; + int i, j; + int nflag; + unsigned int *flag, *flag_all; + int nvar; + int *vmap; + NC_chk_var *varp; + NC_chk_var **varps; + int *rcnt, *roff; + MPI_Offset **starts, **counts; + NC_chk_req *req; + int nread; + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Datatype ftype, mtype; + MPI_Status status; + + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + CHK_ERR_ALLREDUCE (MPI_IN_PLACE, &(ncchkp->recsize), 1, MPI_LONG_LONG, MPI_MAX, + ncchkp->comm); // Sync number of recs + + // Flag of touched vars + nflag = ncchkp->vars.cnt / 32 + 1; + flag = (unsigned int *)NCI_Malloc (sizeof (int) * nflag * 2); + CHK_PTR (flag) + flag_all = flag + nflag; + memset (flag, 0, sizeof (int) * nflag); + for (i = 0; i < nput; i++) { + req = ncchkp->putlist.reqs + putreqs[i]; + flag[req->varid >> 5] |= 1u << (req->varid % 32); + } + for (i = 0; i < nget; i++) { + req = ncchkp->getlist.reqs + getreqs[i]; + flag[req->varid >> 5] |= 1u << (req->varid % 32); + } + + // Sync flag + CHK_ERR_ALLREDUCE (flag, flag_all, nflag, MPI_UNSIGNED, MPI_BOR, ncchkp->comm); + + // Build a skip list of touched vars + nvar = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + if (flag_all[i >> 5] & (1u << (i % 32))) { + if ((ncchkp->vars.data + i)->chunkdim == NULL) { // If not yet inited + nvar++; + } else { + flag_all[i >> 5] ^= (1u << (i % 32)); + if ((ncchkp->vars.data + i)->dimsize[0] < ncchkp->recsize) { + ncchkioi_var_resize (ncchkp, ncchkp->vars.data + i); + } + } + } + } + varps = (NC_chk_var **)NCI_Malloc (sizeof (NC_chk_var *) * nvar); + CHK_PTR (varps) + vmap = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt); + CHK_PTR (vmap) + nvar = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + if (flag_all[i >> 5] & (1u << (i % 32))) { + varps[nvar] = ncchkp->vars.data + i; + vmap[i] = nvar++; + } + } + + // Count reqs for each var + roff = (int *)NCI_Malloc (sizeof (int) * (nvar + 1)); + CHK_PTR (roff) + rcnt = (int *)NCI_Malloc (sizeof (int) * nvar); + CHK_PTR (rcnt) + memset (rcnt, 0, sizeof (int) * nvar); + for (i = 0; i < nput; i++) { + req = ncchkp->putlist.reqs + putreqs[i]; + j = req->varid; + if (flag_all[j >> 5] & (1u << (j % 32))) { rcnt[vmap[j]] += req->nreq; } + } + for (i = 0; i < nget; i++) { + req = ncchkp->getlist.reqs + getreqs[i]; + j = req->varid; + if (flag_all[j >> 5] & (1u << (j % 32))) { rcnt[vmap[j]] += req->nreq; } + } + roff[0] = 0; + 
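    /* roff[] becomes the exclusive prefix sum of rcnt[]: the requests of
+     * variable i occupy slots roff[i] .. roff[i+1]-1 of the gathered
+     * starts[]/counts[] arrays built next; e.g. rcnt = {2, 3, 1} yields
+     * roff = {0, 2, 5, 6}, with roff[nvar] = 6 total entries. */
+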
for (i = 0; i < nvar; i++) { roff[i + 1] = roff[i] + rcnt[i]; } + + // Gather starts and counts + starts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * roff[nvar] * 2); + CHK_PTR (starts) + counts = starts + roff[nvar]; + memset (rcnt, 0, sizeof (int) * nvar); + for (i = 0; i < nput; i++) { + req = ncchkp->putlist.reqs + putreqs[i]; + j = req->varid; + if (flag_all[j >> 5] & (1u << (j % 32))) { + j = vmap[req->varid]; + if (req->nreq > 1) { + memcpy (starts + roff[j] + rcnt[j], req->starts, sizeof (MPI_Offset *) * req->nreq); + memcpy (counts + roff[j] + rcnt[j], req->counts, sizeof (MPI_Offset *) * req->nreq); + rcnt[j] += req->nreq; + } else { + starts[roff[j] + rcnt[j]] = req->start; + counts[roff[j] + (rcnt[j]++)] = req->count; + } + } + } + for (i = 0; i < nget; i++) { + req = ncchkp->getlist.reqs + getreqs[i]; + j = req->varid; + if (flag_all[j >> 5] & (1u << (j % 32))) { + j = vmap[req->varid]; + if (req->nreq > 1) { + memcpy (starts + roff[j] + rcnt[j], req->starts, sizeof (MPI_Offset *) * req->nreq); + memcpy (counts + roff[j] + rcnt[j], req->counts, sizeof (MPI_Offset *) * req->nreq); + rcnt[j] += req->nreq; + } else { + starts[roff[j] + rcnt[j]] = req->start; + counts[roff[j] + (rcnt[j]++)] = req->count; + } + } + } + + // Buffer for index table type + lens = NCI_Malloc (sizeof (int) * nvar); + CHK_PTR (lens) + fdisps = NCI_Malloc (sizeof (MPI_Aint) * nvar * 2); + CHK_PTR (fdisps) + mdisps = fdisps + nvar; + nread = 0; + + // Iinit vars + ncchkp->cown_size = 0; // Reset owner penalty + err = ncchkioi_init_nvar_core_reduce (ncchkp, nvar, varps, rcnt, roff, starts, counts); + CHK_ERR + + // Read the index table for existing variables + // MPI Type to load the index table for existing variables + for (i = 0; i < nvar; i++) { + varp = varps[i]; + if (!(varp->isnew)) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset", + &(varp->metaoff), MPI_LONG_LONG); + if (ret == NC_NOERR) { + lens[nread] = sizeof (NC_chk_chunk_index_entry) * (varp->nchunk); + fdisps[nread] = varp->metaoff; + mdisps[nread++] = (MPI_Aint) (varp->chunk_index); + } else { + varp->metaoff = -1; + memset (varp->chunk_index, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1)); + } + } + } + if (nread) { + ncchkioi_sort_file_offset (nread, fdisps, mdisps, lens); + + MPI_Type_create_hindexed (nread, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + MPI_Type_create_hindexed (nread, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + // Set file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, ((NC *)(ncchkp->ncp))->begin_var, + MPI_BYTE, ftype, "native", MPI_INFO_NULL); + + // Read data + CHK_ERR_READ_AT_ALL (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BOTTOM, 1, mtype, + &status); + + // Restore file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", + MPI_INFO_NULL); + +#ifdef WORDS_BIGENDIAN // Switch back to little endian + for (i = 0; i < nvar; i++) { + ncchkioi_idx_in_swapn (varps[i]->chunk_index, varps[i]->nchunk + 1); + } +#endif + + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + } + + NCI_Free (lens); + NCI_Free (fdisps); + + NCI_Free (flag); + NCI_Free (varps); + NCI_Free (vmap); + NCI_Free (roff); + NCI_Free (rcnt); + NCI_Free (starts); + +err_out:; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_var_rd.c b/src/drivers/ncchunkio/ncchkioi_var_rd.c new file mode 100644 index 000000000..26aa47126 --- /dev/null +++ 
b/src/drivers/ncchunkio/ncchkioi_var_rd.c @@ -0,0 +1,759 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ncmpio/ncmpio_NC.h" +#include "ncchkio_internal.h" + +int ncchkioi_load_var (NC_chk *ncchkp, NC_chk_var *varp, int nchunk, int *cids) { + int err=NC_NOERR; + int i; + int cid; + int get_size; + + int dsize; + MPI_Offset bsize; + + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Status status; + MPI_Datatype ftype, mtype; // Memory and file datatype + + char **zbufs; + + NC *ncp = (NC *)(ncchkp->ncp); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT) + + // -1 means all chunks + if (nchunk < 0) { + nchunk = varp->nmychunk; + cids = varp->mychunks; + } + + // Allocate buffer for I/O + lens = (int *)NCI_Malloc (sizeof (int) * nchunk); + fdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * nchunk * 2); + mdisps = fdisps + nchunk; + zbufs = (char **)NCI_Malloc (sizeof (char *) * nchunk); + + /* Carry our coll I/O + * OpenMPI will fail when set view or do I/O on type created with MPI_Type_create_hindexed when + * count is 0 We use a dummy call inplace of type with 0 count + */ + if (nchunk > 0) { + // Create file type + bsize = 0; + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + // offset and length of compressed chunks + lens[i] = varp->chunk_index[cid].len; + fdisps[i] = (MPI_Aint) (varp->chunk_index[cid].off) + ncp->begin_var; + mdisps[i] = bsize; + // At the same time, we record the size of buffer we need + bsize += (MPI_Offset)lens[i]; + } + + // Allocate buffer for compressed data + zbufs[0] = (char *)NCI_Malloc (bsize); + for (i = 1; i < nchunk; i++) { + zbufs[i] = zbufs[i - 1] + varp->chunk_index[cids[i - 1]].len; + } + + ncchkioi_sort_file_offset (nchunk, fdisps, mdisps, lens); + + MPI_Type_create_hindexed (nchunk, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + MPI_Type_create_hindexed (nchunk, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_INIT, NC_CHK_TIMER_GET_IO_RD) + + // Perform MPI-IO + // Set file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL); + // Write data + CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, zbufs[0], 1, mtype, &status); + // Restore file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_RD, NC_CHK_TIMER_GET_IO_INIT) + +#ifdef _USE_MPI_GET_COUNT + MPI_Get_count (&status, MPI_BYTE, &get_size); +#else + MPI_Type_size (ftype, &get_size); +#endif + ncchkp->getsize += get_size; + + // Free type + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + } else { + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_INIT, NC_CHK_TIMER_GET_IO_RD) + + // Follow coll I/O with dummy call + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, &i, 0, MPI_BYTE, &status); + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + + 
NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_RD, NC_CHK_TIMER_GET_IO_INIT) + } + + // Decompress each chunk + // Allocate chunk cache if not allocated + if (varp->filter_driver != NULL) { + varp->filter_driver->init (MPI_INFO_NULL); + dsize = varp->chunksize; + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_DECOM) + varp->filter_driver->decompress (zbufs[i], lens[i], varp->chunk_cache[cid]->buf, &dsize, + varp->ndim, varp->chunkdim, varp->etype); + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_GET_IO_DECOM, NC_CHK_TIMER_GET_IO_INIT) + + if (dsize != varp->chunksize) { printf ("Decompress Error\n"); } + } + varp->filter_driver->finalize (); + } else { + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_DECOM) + memcpy (varp->chunk_cache[cid]->buf, zbufs[i], lens[i]); + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_GET_IO_DECOM, NC_CHK_TIMER_GET_IO_INIT) + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO_INIT) + + // Free buffers + if (nchunk > 0) { NCI_Free (zbufs[0]); } + NCI_Free (zbufs); + + NCI_Free (lens); + NCI_Free (fdisps); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO) + +err_out:; + return err; +} + +int ncchkioi_load_nvar (NC_chk *ncchkp, int nvar, int *varids, int *lo, int *hi) { + int err=NC_NOERR; + int i, j, k; + int cid; + int get_size; + + int nchunk; + + int dsize; + MPI_Offset bsize; + + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Status status; + MPI_Datatype ftype, mtype; // Memory and file datatype + + char **zbufs; + + NC *ncp = (NC *)(ncchkp->ncp); + NC_chk_var *varp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT) + + // -1 means all chunks + nchunk = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + if (varp->chunk_cache[cid] == NULL && varp->chunk_index[cid].len > 0) { nchunk++; } + } + } + + // Allocate buffer for I/O + lens = (int *)NCI_Malloc (sizeof (int) * nchunk); + fdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * nchunk * 2); + mdisps = fdisps + nchunk; + zbufs = (char **)NCI_Malloc (sizeof (char *) * nchunk); + + /* Carry our coll I/O + * OpenMPI will fail when set view or do I/O on type created with MPI_Type_create_hindexed when + * count is 0 We use a dummy call inplace of type with 0 count + */ + if (nchunk > 0) { + // Create file type + bsize = 0; + k = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // We only need to read when it is not in cache + if (varp->chunk_cache[cid] == NULL && varp->chunk_index[cid].len > 0) { + // offset and length of compressed chunks + lens[k] = varp->chunk_index[cid].len; + fdisps[k] = (MPI_Aint) (varp->chunk_index[cid].off + ncp->begin_var); + mdisps[k] = bsize; + // At the same time, we record the size of buffer we need + bsize += (MPI_Offset)lens[k++]; + } + } + } + + // Allocate buffer for compressed data 
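+            // (bsize now equals the total compressed size of all uncached
+            //  chunks counted above, so a single allocation holds them all)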
+ // We allocate it continuously so no mem type needed + zbufs[0] = (char *)NCI_Malloc (bsize); + for (j = 1; j < nchunk; j++) { zbufs[j] = zbufs[j - 1] + lens[j - 1]; } + + ncchkioi_sort_file_offset (k, fdisps, mdisps, lens); + + MPI_Type_create_hindexed (nchunk, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + MPI_Type_create_hindexed (nchunk, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_INIT, NC_CHK_TIMER_GET_IO_RD) + + // Perform MPI-IO + // Set file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL); + // Write data + CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, zbufs[0], 1, mtype, &status); + // Restore file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + +#ifdef _USE_MPI_GET_COUNT + MPI_Get_count (&status, MPI_BYTE, &get_size); +#else + MPI_Type_size (ftype, &get_size); +#endif + ncchkp->getsize += get_size; + + // Free type + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_RD, NC_CHK_TIMER_GET_IO_CACHE) + + k = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + dsize = varp->chunksize; + + // Decompress each chunk + if (varp->filter_driver != NULL) { + varp->filter_driver->init (MPI_INFO_NULL); + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // Allocate chunk cache if not allocated + if (varp->chunk_cache[cid] == NULL) { + err = + ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + + // Perform decompression + if (varp->chunk_index[cid].len > 0) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_DECOM) + varp->filter_driver->decompress (zbufs[k], lens[k], varp->chunk_cache[cid]->buf, + &dsize, varp->ndim, varp->chunkdim, varp->etype); + if (dsize != varp->chunksize) { printf ("Decompress Error\n"); } + k++; + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_GET_IO_DECOM, + NC_CHK_TIMER_GET_IO_CACHE) + } else { + memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize); + } + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + varp->filter_driver->finalize (); + } else { + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // Allocate chunk cache if not allocated + if (varp->chunk_cache[cid] == NULL) { + err = + ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + + if (varp->chunk_index[cid].len > 0) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_DECOM) + memcpy (varp->chunk_cache[cid]->buf, zbufs[k], lens[k]); + k++; + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_GET_IO_DECOM, + NC_CHK_TIMER_GET_IO_CACHE) + } else { + memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize); + } + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO_CACHE) + } else { + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_INIT, NC_CHK_TIMER_GET_IO_CACHE) + + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // Allocate chunk cache if not allocated + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize); + } 
+                } else {
+                    ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]);
+                }
+            }
+        }
+
+        NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_CACHE, NC_CHK_TIMER_GET_IO_RD)
+
+        // Follow coll I/O with dummy call
+        CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+        CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, &i, 0, MPI_BYTE, &status);
+        CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO_RD)
+    }
+
+    // Free buffers
+    if (nchunk > 0) { NCI_Free (zbufs[0]); }
+    NCI_Free (zbufs);
+
+    NCI_Free (lens);
+    NCI_Free (fdisps);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO)
+
+err_out:;
+    return err;
+}
+
+int ncchkioi_load_var_bg (NC_chk *ncchkp, NC_chk_var *varp, int nchunk, int *cids) {
+    int err = NC_NOERR;
+    int i;
+    int cid;
+    int get_size;
+
+    int dsize;
+    MPI_Offset bsize;
+
+    int *lens;
+    MPI_Aint *fdisps, *mdisps;
+    MPI_Status status;
+    MPI_Datatype ftype, mtype;  // Memory and file datatype
+
+    char **zbufs;
+
+    NC *ncp = (NC *)(ncchkp->ncp);
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_INIT)
+
+    // -1 means all chunks
+    if (nchunk < 0) {
+        nchunk = varp->nmychunk;
+        cids   = varp->mychunks;
+    }
+
+    // Allocate buffer for I/O
+    lens   = (int *)NCI_Malloc (sizeof (int) * nchunk);
+    fdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * nchunk * 2);
+    mdisps = fdisps + nchunk;
+    zbufs  = (char **)NCI_Malloc (sizeof (char *) * nchunk);
+
+    /* Carry out coll I/O
+     * OpenMPI fails when setting a file view or doing I/O with a type created by
+     * MPI_Type_create_hindexed when the count is 0, so we use a dummy call in
+     * place of a type with 0 count
+     */
+    if (nchunk > 0) {
+        // Create file type
+        bsize = 0;
+        for (i = 0; i < nchunk; i++) {
+            cid = cids[i];
+            // offset and length of compressed chunks
+            lens[i]   = varp->chunk_index[cid].len;
+            fdisps[i] = (MPI_Aint) (varp->chunk_index[cid].off) + ncp->begin_var;
+            mdisps[i] = bsize;
+            // At the same time, we record the size of buffer we need
+            bsize += (MPI_Offset)lens[i];
+        }
+
+        // Allocate buffer for compressed data
+        zbufs[0] = (char *)NCI_Malloc (bsize);
+        for (i = 1; i < nchunk; i++) {
+            zbufs[i] = zbufs[i - 1] + varp->chunk_index[cids[i - 1]].len;
+        }
+
+        ncchkioi_sort_file_offset (nchunk, fdisps, mdisps, lens);
+
+        MPI_Type_create_hindexed (nchunk, lens, fdisps, MPI_BYTE, &ftype);
+        CHK_ERR_TYPE_COMMIT (&ftype);
+
+        MPI_Type_create_hindexed (nchunk, lens, mdisps, MPI_BYTE, &mtype);
+        CHK_ERR_TYPE_COMMIT (&mtype);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_RD)
+
+        // Perform MPI-IO
+        // Set file view
+        CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL);
+        // Read data
+        CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, zbufs[0], 1, mtype, &status);
+        // Restore file view
+        CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_RD)
+
+#ifdef _USE_MPI_PUT_COUNT
+        MPI_Get_count (&status, MPI_BYTE, &get_size);
+#else
+        MPI_Type_size (ftype, &get_size);
+#endif
+        ncchkp->getsize += get_size;
+
+        // Free type
+        MPI_Type_free (&ftype);
+        MPI_Type_free (&mtype);
+    } else {
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_RD)
+
+        // Follow coll I/O with dummy call
+        CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+        CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, &i, 0, MPI_BYTE, &status);
MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_RD) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_DECOM) + + // Decompress each chunk + // Allocate chunk cache if not allocated + if (varp->filter_driver != NULL) { + varp->filter_driver->init (MPI_INFO_NULL); + dsize = varp->chunksize; + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + + varp->filter_driver->decompress (zbufs[i], lens[i], varp->chunk_cache[cid]->buf, &dsize, + varp->ndim, varp->chunkdim, varp->etype); + + if (dsize != varp->chunksize) { printf ("Decompress Error\n"); } + } + varp->filter_driver->finalize (); + } else { + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + + memcpy (varp->chunk_cache[cid]->buf, zbufs[i], lens[i]); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_DECOM) + + // Free buffers + if (nchunk > 0) { NCI_Free (zbufs[0]); } + NCI_Free (zbufs); + + NCI_Free (lens); + NCI_Free (fdisps); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG) + +err_out:; + return err; +} + +int ncchkioi_load_nvar_bg (NC_chk *ncchkp, int nvar, int *varids, int *lo, int *hi) { + int err=NC_NOERR; + int i, j, k; + int cid; + int get_size; + + int nchunk; + + int dsize; + MPI_Offset bsize; + + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Status status; + MPI_Datatype ftype, mtype; // Memory and file datatype + + char **zbufs; + + NC *ncp = (NC *)(ncchkp->ncp); + NC_chk_var *varp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_INIT) + + // -1 means all chunks + nchunk = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + if (varp->chunk_cache[cid] == NULL && varp->chunk_index[cid].len > 0) { nchunk++; } + } + } + + // Allocate buffer for I/O + lens = (int *)NCI_Malloc (sizeof (int) * nchunk); + fdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * nchunk * 2); + mdisps = fdisps + nchunk; + zbufs = (char **)NCI_Malloc (sizeof (char *) * nchunk); + + /* Carry our coll I/O + * OpenMPI will fail when set view or do I/O on type created with MPI_Type_create_hindexed when + * count is 0 We use a dummy call inplace of type with 0 count + */ + if (nchunk > 0) { + // Create file type + bsize = 0; + k = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // We only need to read when it is not in cache + if (varp->chunk_cache[cid] == NULL && varp->chunk_index[cid].len > 0) { + // offset and length of compressed chunks + lens[k] = varp->chunk_index[cid].len; + fdisps[k] = (MPI_Aint) (varp->chunk_index[cid].off + ncp->begin_var); + mdisps[k] = bsize; + // At the same time, we record the size of buffer we need + bsize += (MPI_Offset)lens[k++]; + } + } + } + + // Allocate buffer for compressed data + // We allocate it continuously so no mem type needed + zbufs[0] = (char *)NCI_Malloc (bsize); + for (j = 1; j < nchunk; j++) { zbufs[j] = zbufs[j - 1] + lens[j - 1]; } 
+        ncchkioi_sort_file_offset (k, fdisps, mdisps, lens);
+
+        MPI_Type_create_hindexed (nchunk, lens, fdisps, MPI_BYTE, &ftype);
+        CHK_ERR_TYPE_COMMIT (&ftype);
+
+        MPI_Type_create_hindexed (nchunk, lens, mdisps, MPI_BYTE, &mtype);
+        CHK_ERR_TYPE_COMMIT (&mtype);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_RD)
+
+        // Perform MPI-IO
+        // Set file view
+        CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL);
+        // Read data
+        CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, zbufs[0], 1, mtype, &status);
+        // Restore file view
+        CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+
+#ifdef _USE_MPI_PUT_COUNT
+        MPI_Get_count (&status, MPI_BYTE, &get_size);
+#else
+        MPI_Type_size (ftype, &get_size);
+#endif
+        ncchkp->getsize += get_size;
+
+        // Free type
+        MPI_Type_free (&ftype);
+        MPI_Type_free (&mtype);
+
+        NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_BG_RD, NC_CHK_TIMER_PUT_BG_CACHE)
+
+        k = 0;
+        for (i = 0; i < nvar; i++) {
+            varp  = ncchkp->vars.data + varids[i];
+            dsize = varp->chunksize;
+
+            // Decompress each chunk
+            if (varp->filter_driver != NULL) {
+                varp->filter_driver->init (MPI_INFO_NULL);
+
+                for (j = lo[i]; j < hi[i]; j++) {
+                    cid = varp->mychunks[j];
+
+                    // Allocate chunk cache if not allocated
+                    if (varp->chunk_cache[cid] == NULL) {
+                        err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid);
+                        CHK_ERR
+                        // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize);
+
+                        // Perform decompression
+                        if (varp->chunk_index[cid].len > 0) {
+                            NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_DECOM)
+                            varp->filter_driver->decompress (zbufs[k], lens[k], varp->chunk_cache[cid]->buf,
+                                                             &dsize, varp->ndim, varp->chunkdim, varp->etype);
+                            if (dsize != varp->chunksize) { printf ("Decompress Error\n"); }
+                            k++;
+                            NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_PUT_BG_DECOM, NC_CHK_TIMER_PUT_BG_CACHE)
+                        } else {
+                            memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize);
+                        }
+                    } else {
+                        // Cache is always up to date, no need to read and decompress
+                        ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]);
+                    }
+                }
+                varp->filter_driver->finalize ();
+            } else {
+                for (j = lo[i]; j < hi[i]; j++) {
+                    cid = varp->mychunks[j];
+
+                    // Allocate chunk cache if not allocated
+                    if (varp->chunk_cache[cid] == NULL) {
+                        err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid);
+                        CHK_ERR
+                        // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize);
+
+                        if (varp->chunk_index[cid].len > 0) {
+                            NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_DECOM)
+                            memcpy (varp->chunk_cache[cid]->buf, zbufs[k], lens[k]);
+                            k++;
+                            NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_PUT_BG_DECOM, NC_CHK_TIMER_PUT_BG_CACHE)
+                        } else {
+                            memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize);
+                        }
+                    } else {
+                        // Cache is always up to date, no need to read and decompress
+                        ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]);
+                    }
+                }
+            }
+        }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_CACHE)
+    } else {
+        NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_BG_INIT, NC_CHK_TIMER_PUT_BG_CACHE)
+
+        for (i = 0; i < nvar; i++) {
+            varp = ncchkp->vars.data + varids[i];
+
+            for (j = lo[i]; j < hi[i]; j++) {
+                cid = varp->mychunks[j];
+
+                // Allocate chunk cache if not allocated
+                if (varp->chunk_cache[cid] == NULL) {
+                    err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid);
+                    CHK_ERR
+                    // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize);
+                    memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize);
+                } else {
+                    ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]);
+                }
+            }
+        }
+
+        NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_BG_CACHE, NC_CHK_TIMER_PUT_BG_RD)
+
+        // Follow coll I/O with dummy call
+        CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+        CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, &i, 0, MPI_BYTE, &status);
+        CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_RD)
+    }
+
+    // Free buffers
+    if (nchunk > 0) { NCI_Free (zbufs[0]); }
+    NCI_Free (zbufs);
+
+    NCI_Free (lens);
+    NCI_Free (fdisps);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG)
+
+err_out:;
+    return err;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_var_resize.c b/src/drivers/ncchunkio/ncchkioi_var_resize.c
new file mode 100644
index 000000000..cf8d98f24
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_var_resize.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+/*
+ * This file implements the following PnetCDF APIs.
+ *
+ * ncmpi_get_var_all()        : dispatcher->get_var()
+ * ncmpi_put_var_all()        : dispatcher->put_var()
+ * ncmpi_get_var_<kind>_all() : dispatcher->get_var()
+ * ncmpi_put_var_<kind>_all() : dispatcher->put_var()
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <mpi.h>
+#include <pnc_debug.h>
+#include <common.h>
+
+#include "../ncmpio/ncmpio_NC.h"
+#include "ncchkio_internal.h"
+
+int ncchkioi_var_resize (NC_chk *ncchkp, NC_chk_var *varp) {
+    int err = NC_NOERR;
+    int i;
+    int cid;
+
+    if (varp->varkind == NC_CHK_VAR_COMPRESSED && varp->isrec) {
+        if (varp->dimsize[0] < ncchkp->recsize) {
+            int oldnchunk;
+            int oldnmychunk;
+
+            // oldnrec = varp->nrec;
+            oldnchunk   = varp->nchunk;
+            oldnmychunk = varp->nmychunk;
+            varp->nrec = varp->dimsize[0] = varp->nchunks[0] = ncchkp->recsize;
+            varp->nchunk = varp->nchunkrec * varp->nrec;
+
+            // Extend metadata list if needed
+            if (varp->nrec > varp->nrecalloc) {
+                while (varp->nrecalloc < varp->nrec) { varp->nrecalloc *= NC_CHK_REC_MULTIPLIER; }
+                varp->nchunkalloc = varp->nrecalloc * varp->nchunkrec;
+
+                varp->chunk_owner =
+                    (int *)NCI_Realloc (varp->chunk_owner, sizeof (int) * varp->nchunkalloc);
+                varp->dirty = (int *)NCI_Realloc (varp->dirty, sizeof (int) * varp->nchunkalloc);
+                varp->chunk_cache = (NC_chk_cache **)NCI_Realloc (
+                    varp->chunk_cache, sizeof (char *) * varp->nchunkalloc);
+                for (i = 0; i < oldnmychunk; i++) {
+                    cid = varp->mychunks[i];
+                    if (varp->chunk_cache[cid] != NULL) {
+                        varp->chunk_cache[cid]->ref = varp->chunk_cache + cid;
+                    }
+                }
+
+                varp->chunk_index = (NC_chk_chunk_index_entry *)NCI_Realloc (
+                    varp->chunk_index, sizeof (NC_chk_chunk_index_entry) * (varp->nchunkalloc + 1));
+                varp->mychunks = (int *)NCI_Realloc (
+                    varp->mychunks, sizeof (int) * varp->nrecalloc * varp->nmychunkrec);
+
+                varp->expanded = 1;
+            }
+            memset (varp->chunk_index + oldnchunk, 0,
+                    sizeof (NC_chk_chunk_index_entry) * (varp->nchunk - oldnchunk));
+            memset (varp->dirty + oldnchunk, 0, sizeof (int) * (varp->nchunk - oldnchunk));
+            memset (varp->chunk_cache + oldnchunk, 0, sizeof (char *) * (varp->nchunk - oldnchunk));
+
+            // Extend block ownership list
+            if (oldnchunk > 0) {
+                for (i = oldnchunk; i < varp->nchunk; i += varp->nchunkrec) {
+                    // We reuse chunk mapping of other records
+                    memcpy (varp->chunk_owner + i, varp->chunk_owner,
+                            sizeof (int) * varp->nchunkrec);
+                }
+                varp->nmychunk = varp->nmychunkrec * varp->nrec;
+                for (i = oldnmychunk; i < varp->nmychunk; i += varp->nmychunkrec) {
+                    // We reuse chunk mapping of other records
+                    memcpy (varp->mychunks + i, varp->mychunks, sizeof (int) * varp->nmychunkrec);
+                }
+            } else {
+                err = ncchkioi_calc_chunk_owner (ncchkp, varp, 0, NULL, NULL);
+                CHK_ERR
+
+                varp->nmychunkrec = 0;
+                for (i = 0; i < varp->nchunkrec; i++) {
+                    if (varp->chunk_owner[i] == ncchkp->rank) { varp->nmychunkrec++; }
+                }
+                varp->mychunks = (int *)NCI_Realloc (
+                    varp->mychunks, sizeof (int) * varp->nmychunkrec * varp->nrecalloc);
+
+                if (ncchkp->cache_limit_hint == -1) {
+                    ncchkp->cache_limit += (size_t) (varp->nmychunkrec) * (size_t) (varp->chunksize);
+                }
+            }
+
+            varp->nmychunk = oldnmychunk;
+            for (i = oldnchunk; i < varp->nchunk; i++) {
+                if (varp->chunk_owner[i] == ncchkp->rank) {
+                    varp->mychunks[varp->nmychunk++] = i;
+                    // varp->chunk_cache[i] = (void*)NCI_Malloc(varp->chunksize); // Allocate
+                    // buffer for blocks we own memset(varp->chunk_cache[i], 0 , varp->chunksize);
+                }
+            }
+
+            // Update global chunk count
+            ncchkp->nmychunks += (MPI_Offset) (varp->nmychunk - oldnmychunk);
+        }
+    } else {
+        // Notify ncmpio driver
+    }
+
+err_out:;
+    return err;
+}
+
+int ncchkioi_resize_nvar (NC_chk *ncchkp, int nput, int *putreqs, int nget, int *getreqs) {
+    int err = NC_NOERR;
+    int i;
+    int nflag;
+    unsigned int *flag = NULL, *flag_all;
+    NC_chk_req *req;
+
+    CHK_ERR_ALLREDUCE (MPI_IN_PLACE, &(ncchkp->recsize), 1, MPI_LONG_LONG, MPI_MAX,
+                       ncchkp->comm);  // Sync number of recs
+
+    // Flag of touched vars
+    nflag = ncchkp->vars.cnt / 32 + 1;
+    flag  = (unsigned int *)NCI_Malloc (sizeof (int) * nflag * 2);
+    CHK_PTR (flag)
+    flag_all = flag + nflag;
+    memset (flag, 0, sizeof (int) * nflag);
+    for (i = 0; i < nput; i++) {
+        req = ncchkp->putlist.reqs + putreqs[i];
+        flag[req->varid >> 5] |= 1u << (req->varid % 32);
+    }
+    for (i = 0; i < nget; i++) {
+        req = ncchkp->getlist.reqs + getreqs[i];
+        flag[req->varid >> 5] |= 1u << (req->varid % 32);
+    }
+
+    // Sync flag
+    CHK_ERR_ALLREDUCE (flag, flag_all, nflag, MPI_UNSIGNED, MPI_BOR, ncchkp->comm);
+
+    // Resize each var
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        if (flag_all[i >> 5] & (1u << (i % 32))) {
+            flag_all[i >> 5] ^= (1u << (i % 32));
+            if ((ncchkp->vars.data + i)->dimsize[0] < ncchkp->recsize) {
+                err = ncchkioi_var_resize (ncchkp, ncchkp->vars.data + i);
+                CHK_ERR
+            }
+        }
+    }
+
+    NCI_Free (flag);
+
+err_out:;
+    return err;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_var_wr.c b/src/drivers/ncchunkio/ncchkioi_var_wr.c
new file mode 100644
index 000000000..e0fa8e12a
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_var_wr.c
@@ -0,0 +1,636 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+/*
+ * This file implements the following PnetCDF APIs.
+ *
+ * ncmpi_get_var_all()        : dispatcher->get_var()
+ * ncmpi_put_var_all()        : dispatcher->put_var()
+ * ncmpi_get_var_<kind>_all() : dispatcher->get_var()
+ * ncmpi_put_var_<kind>_all() : dispatcher->put_var()
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <mpi.h>
+#include <pnc_debug.h>
+#include <common.h>
+
+#include "../ncmpio/ncmpio_NC.h"
+#include "ncchkio_internal.h"
+
+int ncchkioi_save_var (NC_chk *ncchkp, NC_chk_var *varp) {
+    int i, k, l, err = NC_NOERR;
+    int *zsizes = NULL, *zsizes_all = NULL;
+    MPI_Datatype mtype, ftype;  // Memory and file datatype
+    int wcnt;
+    int *lens = NULL;
+    MPI_Aint *disps = NULL;
+    MPI_Status status;
+    MPI_Offset *zoffs = NULL;
+    MPI_Offset voff;
+    void **zbufs = NULL;
+    int zdimid, zvarid;
+    int put_size;
+    char name[128];  // Name of objects
+    NC *ncp = (NC *)(ncchkp->ncp);
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO)
+
+    // Allocate buffer for compression
+    zsizes = (int *)NCI_Malloc (sizeof (int) * varp->nchunk);
+    CHK_PTR (zsizes)
+    zbufs = (void **)NCI_Malloc (sizeof (void *) * varp->nmychunk);
+    CHK_PTR (zbufs)
+    zsizes_all = (int *)NCI_Malloc (sizeof (int) * varp->nchunk);
+    CHK_PTR (zsizes_all)
+    zoffs = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * (varp->nchunk + 1));
+    CHK_PTR (zoffs)
+
+    // Allocate buffer for I/O
+    wcnt = 0;
+    for (l = 0; l < varp->nmychunk; l++) {
+        k = varp->mychunks[l];
+        if (varp->dirty[k]) { wcnt++; }
+    }
+    if (ncchkp->rank == varp->chunk_owner[0]) { wcnt += 1; }
+    lens = (int *)NCI_Malloc (sizeof (int) * wcnt);
+    CHK_PTR (lens)
+    disps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * wcnt);
+    CHK_PTR (disps)
+
+    memset (zsizes, 0, sizeof (int) * varp->nchunk);
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_COM)
+
+    // Compress each chunk we own
+    if (varp->filter_driver != NULL) {
+        varp->filter_driver->init (MPI_INFO_NULL);
+        for (l = 0; l < varp->nmychunk; l++) {
+            k = varp->mychunks[l];
+
+            if (varp->dirty[k]) {
+                // Apply compression
+                err = varp->filter_driver->compress_alloc (varp->chunk_cache[k]->buf, varp->chunksize,
+                                                           zbufs + l, zsizes + k, varp->ndim,
+                                                           varp->chunkdim, varp->etype);
+                CHK_ERR
+            }
+        }
+        varp->filter_driver->finalize ();
+    } else {
+        for (l = 0; l < varp->nmychunk; l++) {
+            k = varp->mychunks[l];
+            if (varp->dirty[k]) {
+                zbufs[l]  = varp->chunk_cache[k]->buf;
+                zsizes[k] = varp->chunksize;
+            }
+        }
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_COM)
+
+#ifdef PNETCDF_PROFILING
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_BARR)
+    MPI_Barrier (ncchkp->comm);
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_BARR)
+#endif
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_SYNC)
+
+    // Sync compressed data size with other processes
+    CHK_ERR_ALLREDUCE (zsizes, zsizes_all, varp->nchunk, MPI_INT, MPI_MAX, ncchkp->comm);
+
+    if (varp->metaoff < 0 || varp->expanded) {
+        zoffs[0] = varp->nchunkalloc * sizeof (NC_chk_chunk_index_entry);
+    } else {
+        zoffs[0] = 0;
+    }
+    for (i = 0; i < varp->nchunk; i++) { zoffs[i + 1] = zoffs[i] + zsizes_all[i]; }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_SYNC)
+
+    if (zoffs[varp->nchunk] > 0) {  // No need to do I/O if no dirty chunk to write
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_INIT)
+
+        /* Write compressed variable
+         * We start by defining data variable and writing metadata
+         * Then, we create buffer type and file type for data
+         * Finally MPI collective I/O is used for writing data
+         */
+
+        // Enter redefine mode
+        ncchkp->driver->redef (ncchkp->ncp);
+
+        // Prepare data variable
+
+        // Define dimension for data variable
+        sprintf (name, "_datablock_dim_%d", ncchkp->nwrite);
+        err = ncchkp->driver->def_dim (ncchkp->ncp, name, zoffs[varp->nchunk], &zdimid);
+        if (err != NC_NOERR) return err;
+
+        // Define data variable
+        sprintf (name, "_datablock_%d", ncchkp->nwrite);
+        err = ncchkp->driver->def_var (ncchkp->ncp, name, NC_BYTE, 1, &zdimid, &(zvarid));
+        if (err != NC_NOERR) return err;
+
+        // Mark as data variable
+        i   = NC_CHK_VAR_DATA;
+        err = ncchkp->driver->put_att (ncchkp->ncp, zvarid, "_varkind", NC_INT, 1, &i, MPI_INT);
+        if (err != NC_NOERR) return err;
+
+        // Record serial
+        ncchkp->nwrite++;
+        err = ncchkp->driver->put_att (ncchkp->ncp, NC_GLOBAL, "_nwrite", NC_INT, 1,
+                                       &(ncchkp->nwrite), MPI_INT);
+        if (err != NC_NOERR) return err;
+
+        // Metadata offset
+        // Real metadata offset is only known after enddef
+        // We reserve the space so we don't need to enter define mode again
+        if (varp->metaoff < 0) {
+            err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_metaoffset", NC_INT64, 1,
+                                           &(varp->metaoff), MPI_LONG_LONG);
+            if (err != NC_NOERR) return err;
+        }
+
+        // Switch to data mode
+        err = ncchkp->driver->enddef (ncchkp->ncp);
+        if (err != NC_NOERR) return err;
+
+        // Update metadata
+        voff = ncp->vars.value[zvarid]->begin;
+        for (i = 0; i < varp->nchunk; i++) {
+            if (zsizes_all[i] > 0) {
+                varp->chunk_index[i].len = zsizes_all[i];
+                varp->chunk_index[i].off = zoffs[i] + voff - ncp->begin_var;
+            }
+        }
+
+        if (varp->metaoff < 0 || varp->expanded) {
+            varp->metaoff = voff - ncp->begin_var;
+            err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_metaoffset", NC_INT64, 1,
+                                           &(varp->metaoff), MPI_LONG_LONG);
+            if (err != NC_NOERR) return err;
+
+            // unset expand flag
+            varp->expanded = 0;
+        }
+
+        /* Carry out coll I/O
+         * OpenMPI fails when setting a file view or doing I/O with a type created by
+         * MPI_Type_create_hindexed when the count is 0, so we use a dummy call in
+         * place of a type with 0 count
+         */
+        if (wcnt > 0) {
+            // Create file type
+            l = 0;
+            if (ncchkp->rank == varp->chunk_owner[0]) {  // First chunk owner writes metadata
+                lens[l]    = (varp->nchunk) * sizeof (NC_chk_chunk_index_entry);
+                disps[l++] = (MPI_Aint)varp->metaoff + ncp->begin_var;
+            }
+            for (i = 0; i < varp->nmychunk; i++) {
+                k = varp->mychunks[i];
+
+                // Record compressed size
+                if (varp->dirty[k]) {
+                    lens[l]    = zsizes[k];
+                    disps[l++] = (MPI_Aint) (varp->chunk_index[k].off) + ncp->begin_var;
+                }
+            }
+            MPI_Type_create_hindexed (wcnt, lens, disps, MPI_BYTE, &ftype);
+            CHK_ERR_TYPE_COMMIT (&ftype);
+
+            // Create memory buffer type
+            l = 0;
+            if (ncchkp->rank == varp->chunk_owner[0]) {  // First chunk owner writes metadata
+                lens[l]    = (varp->nchunk) * sizeof (NC_chk_chunk_index_entry);
+                disps[l++] = (MPI_Aint)varp->chunk_index;
+            }
+            for (i = 0; i < varp->nmychunk; i++) {
+                k = varp->mychunks[i];
+
+                // Record compressed size
+                if (varp->dirty[k]) {
+                    lens[l]    = zsizes[k];
+                    disps[l++] = (MPI_Aint)zbufs[i];
+                }
+            }
+            err = MPI_Type_create_hindexed (wcnt, lens, disps, MPI_BYTE, &mtype);
+            CHK_ERR_TYPE_COMMIT (&mtype);
+
+            NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_IO_INIT, NC_CHK_TIMER_PUT_IO_WR)
+
+#ifdef WORDS_BIGENDIAN  // NetCDF data is big endian
+            if (ncchkp->rank == varp->chunk_owner[0]) {
+                ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk);
+            }
+#endif
+
+            // Perform MPI-IO
+            // Set file view
+            CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL);
+            // Write data
+            CHK_ERR_WRITE_AT_ALL (ncp->collective_fh, 0, MPI_BOTTOM, 1, mtype, &status);
+            // Restore file view
+            CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+
+#ifdef WORDS_BIGENDIAN  // Switch back to little endian
+            if (ncchkp->rank == varp->chunk_owner[0]) {
+                ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk);
+            }
+#endif
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_WR)
+
+#ifdef _USE_MPI_GET_COUNT
+            MPI_Get_count (&status, MPI_BYTE, &put_size);
+#else
+            MPI_Type_size (mtype, &put_size);
+#endif
+            ncchkp->putsize += put_size;
+
+            // Free type
+            MPI_Type_free (&ftype);
+            MPI_Type_free (&mtype);
+        } else {
+            NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_IO_INIT, NC_CHK_TIMER_PUT_IO_WR)
+
+            // Follow coll I/O with dummy call
+            CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+            CHK_ERR_WRITE_AT_ALL (ncp->collective_fh, 0, MPI_BOTTOM, 0, MPI_BYTE, &status);
+            CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_WR)
+        }
+    }
+
+err_out:;
+    // Free buffers
+    NCI_Free (zsizes);
+    NCI_Free (zsizes_all);
+    NCI_Free (zoffs);
+    for (l = 0; l < varp->nmychunk; l++) {
+        k = varp->mychunks[l];
+        if (varp->dirty[k]) {
+            if (varp->filter_driver != NULL) { free (zbufs[l]); }
+            // Clear dirty flag
+            varp->dirty[k] = 0;
+        }
+    }
+    NCI_Free (zbufs);
+
+    NCI_Free (lens);
+    NCI_Free (disps);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO)
+
+    return err;
+}
+
+int ncchkioi_save_nvar (NC_chk *ncchkp, int nvar, int *varids) {
+    int i, k, l, err = NC_NOERR;
+    int vid;  // Iterator for variable id
+    int cid;  // Iterator for chunk id
+    int total_nchunks = 0;
+    int *zsizes = NULL, *zsizes_all = NULL, *zsizesp = NULL, *zsizes_allp = NULL;
+    MPI_Offset *zoffs = NULL, *zoffsp;
+    MPI_Offset voff;
+    MPI_Datatype mtype, ftype;  // Memory and file datatype
+    int wcnt, ccnt, wcur, ccur;
+    int *lens = NULL;
+    MPI_Aint *mdisps = NULL, *fdisps = NULL;
+    MPI_Status status;
+    MPI_Request *reqs = NULL;
+    int put_size;
+    void **zbufs = NULL;
+    int *zdels = NULL;
+    int zdimid, zvarid;
+    char name[128];  // Name of objects
+    NC_chk_var *varp;
+    NC *ncp = (NC *)(ncchkp->ncp);
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_INIT)
+
+    wcnt = 0;
+    ccnt = 0;
+    for (i = 0; i < nvar; i++) {
+        varp = ncchkp->vars.data + varids[i];
+        if (ncchkp->rank == varp->chunk_owner[0]) { wcnt += 1; }
+        for (l = 0; l < varp->nmychunk; l++) {
+            k = varp->mychunks[l];
+            if (varp->dirty[k]) { ccnt++; }
+        }
+        total_nchunks += varp->nchunk + 1;
+    }
+    wcnt += ccnt;
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_INIT)
+
+    // Allocate reqid for metadata
+    reqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nvar);
+    CHK_PTR (reqs)
+
+    // Allocate buffer for compression
+    zsizes = (int *)NCI_Malloc (sizeof (int) * total_nchunks);
+    CHK_PTR (zsizes)
+    zsizes_all = (int *)NCI_Malloc (sizeof (int) * total_nchunks);
+    CHK_PTR (zsizes_all)
+    zbufs = (void **)NCI_Malloc (sizeof (void *) * ccnt);
+    CHK_PTR (zbufs)
+    zdels = (int *)NCI_Malloc (sizeof (int) * ccnt);
+    CHK_PTR (zdels)
+    zoffs = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * (total_nchunks + 1));
+    CHK_PTR (zoffs)
+
+    // Allocate buffer file type
+    mdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * wcnt);
+    CHK_PTR (mdisps)
+    lens = (int *)NCI_Malloc (sizeof (int) * wcnt);
+    CHK_PTR (lens)
+    fdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * wcnt);
+    CHK_PTR (fdisps)
+
+    ccur        = 0;
+    zsizesp     = zsizes + nvar;
+    zsizes_allp = zsizes_all + nvar;
+    for (vid = 0; vid < nvar; vid++) {
+        varp = ncchkp->vars.data + varids[vid];
+
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_COM)
+
+        // oldzoff = zoffs[varp->nchunk];
+
+        memset (zsizesp, 0, sizeof (int) * varp->nchunk);
+
+        // Compress each chunk we own
+        if (varp->filter_driver != NULL) {
+            varp->filter_driver->init (MPI_INFO_NULL);
+            for (l = 0; l < varp->nmychunk; l++) {
+                cid = varp->mychunks[l];
+
+                // Apply compression
+                if (varp->dirty[cid]) {
+                    zdels[ccur] = 1;
+                    err = varp->filter_driver->compress_alloc (varp->chunk_cache[cid]->buf, varp->chunksize,
+                                                               zbufs + (ccur++), zsizesp + cid, varp->ndim,
+                                                               varp->chunkdim, varp->etype);
+                    CHK_ERR
+                }
+            }
+            varp->filter_driver->finalize ();
+        } else {
+            for (l = 0; l < varp->nmychunk; l++) {
+                cid = varp->mychunks[l];
+                if (varp->dirty[cid]) {
+                    zsizesp[cid]  = varp->chunksize;
+                    zdels[ccur]   = 0;
+                    zbufs[ccur++] = varp->chunk_cache[cid]->buf;
+                }
+            }
+        }
+
+        NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_IO_COM, NC_CHK_TIMER_PUT_IO_SYNC)
+
+        // Sync compressed data size with other processes
+        CHK_ERR_IALLREDUCE (zsizesp, zsizes_allp, varp->nchunk, MPI_INT, MPI_MAX, ncchkp->comm,
+                            reqs + vid);
+
+        if (varp->metaoff < 0 || varp->expanded) {
+            zsizes_all[vid] = varp->nchunkalloc * sizeof (NC_chk_chunk_index_entry);
+        } else {
+            zsizes_all[vid] = 0;
+        }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_SYNC)
+
+        zsizesp     += varp->nchunk;
+        zsizes_allp += varp->nchunk;
+    }
+
+#ifdef PNETCDF_PROFILING
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_BARR)
+    MPI_Barrier (ncchkp->comm);
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_BARR)
+#endif
+
+    /* Write compressed variable
+     * We start by defining data variable and writing metadata
+     * Then, we create buffer type and file type for data
+     * Finally MPI collective I/O is used for writing data
+     */
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_SYNC)
+    zsizes_allp = zsizes_all + nvar;
+    for (vid = 0; vid < nvar; vid++) {
+        varp = ncchkp->vars.data + varids[vid];
+        CHK_ERR_WAIT (reqs + vid, &status);
+        zsizes_allp += varp->nchunk;
+    }
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_SYNC)
+
+    zoffs[0] = 0;
+    for (i = 0; i < total_nchunks; i++) { zoffs[i + 1] = zoffs[i] + zsizes_all[i]; }
+
+    if (zoffs[total_nchunks] > 0) {  // No need to do I/O if no dirty chunk to write
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_INIT)
+
+        // Prepare data variable
+
+        // Enter redefine mode
+        ncchkp->driver->redef (ncchkp->ncp);
+
+        // Define dimension for data variable
+        sprintf (name, "_datablock_dim_%d", ncchkp->nwrite);
+        err = ncchkp->driver->def_dim (ncchkp->ncp, name, zoffs[total_nchunks], &zdimid);
+        if (err != NC_NOERR) return err;
+
+        // Define data variable
+        sprintf (name, "_datablock_%d", ncchkp->nwrite);
+        err = ncchkp->driver->def_var (ncchkp->ncp, name, NC_BYTE, 1, &zdimid, &zvarid);
+        if (err != NC_NOERR) return err;
+
+        // Mark as data variable
+        i   = NC_CHK_VAR_DATA;
+        err = ncchkp->driver->put_att (ncchkp->ncp, zvarid, "_varkind", NC_INT, 1, &i, MPI_INT);
+        if (err != NC_NOERR) return err;
+
+        // Record serial
+        ncchkp->nwrite++;
+        err = ncchkp->driver->put_att (ncchkp->ncp, NC_GLOBAL, "_nwrite", NC_INT, 1,
+                                       &(ncchkp->nwrite), MPI_INT);
+        if (err != NC_NOERR) return err;
+
+        // Metadata offset
+        for (vid = 0; vid < nvar; vid++) {
+            varp = ncchkp->vars.data + varids[vid];
+            // Reserve space for _metaoffset
+            if (varp->metaoff < 0) {
+                err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_metaoffset", NC_INT64, 1,
+                                               &(varp->metaoff), MPI_LONG_LONG);
+                if (err != NC_NOERR) return err;
+            }
+        }
+
+        // Switch back to data mode
+        err = ncchkp->driver->enddef (ncchkp->ncp);
+        if (err != NC_NOERR) return err;
+
+        voff = ncp->vars.value[zvarid]->begin;
+
+        wcur = ccur = 0;
+        for (vid = 0; vid < nvar; vid++) {
+            varp = ncchkp->vars.data + varids[vid];
+
+            if (varp->metaoff < 0 || varp->expanded) {
+                varp->metaoff = zoffs[vid] + voff - ncp->begin_var;
+                err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_metaoffset", NC_INT64, 1,
+                                               &(varp->metaoff), MPI_LONG_LONG);
+                if (err != NC_NOERR) return err;
+
+                // unset expand flag
+                varp->expanded = 0;
+            }
+
+            if (ncchkp->rank == varp->chunk_owner[0]) {  // First chunk owner writes metadata
+                lens[wcur]     = varp->nchunk * sizeof (NC_chk_chunk_index_entry);
+                fdisps[wcur]   = (MPI_Aint)varp->metaoff + ncp->begin_var;
+                mdisps[wcur++] = (MPI_Aint) (varp->chunk_index);
+
+                // lens[wcur] = varp->nchunk * sizeof(int);
+                // fdisps[wcur] = (MPI_Aint)(varp->metaoff + ncp->begin_var + sizeof(long long) *
+                // varp->nchunkalloc); mdisps[wcur++] = (MPI_Aint)(varp->data_lens);
+            }
+        }
+
+        ncchkioi_sort_file_offset (wcur, fdisps, mdisps, lens);
+
+        zsizes_allp = zsizes_all + nvar;
+        zoffsp      = zoffs + nvar;
+        for (vid = 0; vid < nvar; vid++) {
+            varp = ncchkp->vars.data + varids[vid];
+
+            for (cid = 0; cid < varp->nchunk; cid++) {
+                if (zsizes_allp[cid] > 0) {
+                    varp->chunk_index[cid].len = zsizes_allp[cid];
+                    varp->chunk_index[cid].off = zoffsp[cid] + voff - ncp->begin_var;
+                }
+            }
+
+            /* Parameter for file and memory type
+             * We do not know variable file offset until the end of define mode
+             * We will add the displacement later
+             */
+            for (i = 0; i < varp->nmychunk; i++) {
+                cid = varp->mychunks[i];
+
+                // Record parameter
+                if (varp->dirty[cid]) {
+                    lens[wcur]     = varp->chunk_index[cid].len;
+                    fdisps[wcur]   = (MPI_Aint) (varp->chunk_index[cid].off) + ncp->begin_var;
+                    mdisps[wcur++] = (MPI_Aint)zbufs[ccur++];
+                }
+            }
+
+            // Clear dirty flag
+            memset (varp->dirty, 0, varp->nchunk * sizeof (int));
+
+            zsizes_allp += varp->nchunk;
+            zoffsp      += varp->nchunk;
+        }
+
+        NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_IO_INIT, NC_CHK_TIMER_PUT_IO_WR)
+
+        /* Carry out coll I/O
+         * OpenMPI fails when setting a file view or doing I/O with a type created by
+         * MPI_Type_create_hindexed when the count is 0, so we use a dummy call in
+         * place of a type with 0 count
+         */
+        if (wcnt > 0) {
+            // Create file type
+            MPI_Type_create_hindexed (wcnt, lens, fdisps, MPI_BYTE, &ftype);
+            CHK_ERR_TYPE_COMMIT (&ftype);
+
+            // Create memory type
+            MPI_Type_create_hindexed (wcnt, lens, mdisps, MPI_BYTE, &mtype);
+            CHK_ERR_TYPE_COMMIT (&mtype);
+
+#ifdef WORDS_BIGENDIAN  // NetCDF data is big endian
+            for (vid = 0; vid < nvar; vid++) {
+                varp = ncchkp->vars.data + varids[vid];
+                if (ncchkp->rank == varp->chunk_owner[0]) {
+                    ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk + 1);
+                }
+            }
+#endif
+
+            // Perform MPI-IO
+            // Set file view
+            CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL);
+            // Write data
+            CHK_ERR_WRITE_AT_ALL (ncp->collective_fh, 0, MPI_BOTTOM, 1, mtype, &status);
+            // Restore file view
+            CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+
+#ifdef WORDS_BIGENDIAN  // Switch back to little endian
+            for (vid = 0; vid < nvar; vid++) {
+                varp = ncchkp->vars.data + varids[vid];
+                if (ncchkp->rank == varp->chunk_owner[0]) {
+                    ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk + 1);
+                }
+            }
+#endif
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_WR)
+
+#ifdef _USE_MPI_GET_COUNT
+            MPI_Get_count (&status, MPI_BYTE, &put_size);
+#else
+            MPI_Type_size (mtype, &put_size);
+#endif
+            ncchkp->putsize += put_size;
+
+            // Free type
+            MPI_Type_free (&ftype);
+            MPI_Type_free (&mtype);
+        } else {
+            // Follow coll I/O with dummy call
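+            /* Some MPI implementations (OpenMPI in particular) reject a view
+             * or I/O call built from a zero-count hindexed type, so ranks with
+             * no dirty chunk still join the collective write with a
+             * zero-length call on a plain MPI_BYTE view. */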
+            CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+            CHK_ERR_WRITE_AT_ALL (ncp->collective_fh, 0, MPI_BOTTOM, 0, MPI_BYTE, &status);
+            CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_WR)
+        }
+    }
+
+err_out:;
+    // Free buffers
+    NCI_Free (zsizes);
+    NCI_Free (zsizes_all);
+    NCI_Free (zoffs);
+    ccur = 0;
+    for (i = 0; i < ccnt; i++) {
+        if (zdels[i]) { free (zbufs[i]); }
+    }
+    NCI_Free (zbufs);
+    NCI_Free (zdels);
+
+    NCI_Free (lens);
+    NCI_Free (fdisps);
+    NCI_Free (mdisps);
+
+    NCI_Free (reqs);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO)
+
+    return err;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_vector.c b/src/drivers/ncchunkio/ncchkioi_vector.c
new file mode 100644
index 000000000..adf519eb0
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_vector.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2018, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <common.h>
+#include <pnc_debug.h>
+#include "ncchkio_internal.h"
+
+#define STARTSIZE 32
+#define SIZEMULTIPLIER 20
+
+int ncchkioi_vector_init(NC_chk_vector *v, int esize){
+    v->esize  = esize;
+    v->nalloc = STARTSIZE;
+    v->size   = 0;
+    v->data   = (char*)NCI_Malloc(esize * v->nalloc);
+    if (v->data == NULL){
+        DEBUG_RETURN_ERROR(NC_ENOMEM);
+    }
+    return NC_NOERR;
+}
+
+int ncchkioi_vector_init_ex(NC_chk_vector *v, int esize, int size){
+    v->esize  = esize;
+    v->nalloc = size;
+    v->size   = 0;
+    v->data   = (char*)NCI_Malloc(esize * v->nalloc);
+    if (v->data == NULL){
+        DEBUG_RETURN_ERROR(NC_ENOMEM);
+    }
+    return NC_NOERR;
+}
+
+void ncchkioi_vector_free(NC_chk_vector *v){
+    NCI_Free(v->data);
+}
+
+int ncchkioi_vector_append(NC_chk_vector *v, void *item){
+    if (v->size == v->nalloc){
+        v->nalloc = v->nalloc * SIZEMULTIPLIER;
+        v->data = (char*)NCI_Realloc(v->data, v->esize * v->nalloc);
+        if (v->data == NULL){
+            DEBUG_RETURN_ERROR(NC_ENOMEM);
+        }
+    }
+    memcpy(v->data + v->size * v->esize, item, v->esize);
+    v->size++;
+    return NC_NOERR;
+}
\ No newline at end of file
diff --git a/src/drivers/ncchunkio/ncchkioi_wait.c b/src/drivers/ncchunkio/ncchkioi_wait.c
new file mode 100644
index 000000000..35be331ce
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_wait.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+/*
+ * This file implements the following PnetCDF APIs.
+ *
+ * ncmpi_get_var_all()        : dispatcher->get_var()
+ * ncmpi_put_var_all()        : dispatcher->put_var()
+ * ncmpi_get_var_<kind>_all() : dispatcher->get_var()
+ * ncmpi_put_var_<kind>_all() : dispatcher->put_var()
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <mpi.h>
+#include <pnc_debug.h>
+#include <common.h>
+
+#include "ncchkio_internal.h"
+
+/* Our driver currently can handle only one variable at a time,
+ * so we pack all requests into one large varn request
+ */
+int ncchkioi_wait_put_reqs (NC_chk *ncchkp, int nreq, int *reqids, int *stats) {
+    int err = NC_NOERR;
+    int i;
+    int nvar, nflag;
+    unsigned int *flag, *flag_all;
+    int *vids;
+    NC_chk_req *req;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT_PUT)
+
+    // Flag of touched vars
+    nflag = ncchkp->vars.cnt / 32 + 1;
+    flag  = (unsigned int *)NCI_Malloc (sizeof (int) * nflag * 2);
+    flag_all = flag + nflag;
+    memset (flag, 0, sizeof (int) * nflag);
+    for (i = 0; i < nreq; i++) {
+        req = ncchkp->putlist.reqs + reqids[i];
+        flag[req->varid >> 5] |= 1u << (req->varid % 32);
+    }
+
+    // Sync flag
+    CHK_ERR_ALLREDUCE (flag, flag_all, nflag, MPI_UNSIGNED, MPI_BOR, ncchkp->comm);
+
+    // Build a skip list of touched vars
+    nvar = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        if (flag_all[i >> 5] & (1u << (i % 32))) { nvar++; }
+    }
+    vids = (int *)NCI_Malloc (sizeof (int) * nvar);
+    nvar = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        if (flag_all[i >> 5] & (1u << (i % 32))) { vids[nvar++] = i; }
+    }
+
+    // Perform collective buffer
+    if (ncchkp->comm_unit == NC_CHK_COMM_CHUNK) {
+        err = ncchkioi_iput_cb_chunk (ncchkp, nreq, reqids, stats);
+    } else {
+        err = ncchkioi_iput_cb_proc (ncchkp, nreq, reqids, stats);
+    }
+    CHK_ERR
+
+#ifdef PNETCDF_PROFILING
+    NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT_PUT_BARR)
+    MPI_Barrier (ncchkp->comm);
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_WAIT_PUT_BARR)
+#endif
+
+    // Perform I/O for compressed variables
+    err = ncchkioi_save_nvar (ncchkp, nvar, vids);
+    CHK_ERR
+
+err_out:;
+
+    // Free buffers
+    NCI_Free (vids);
+    NCI_Free (flag);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_WAIT_PUT)
+
+    return err;
+}
+
+/* Our driver currently can handle only one variable at a time,
+ * so we pack all requests into one large varn request
+ */
+int ncchkioi_wait_get_reqs (NC_chk *ncchkp, int nreq, int *reqids, int *stats) {
+    int err = NC_NOERR;
+    int i;
+    int nvar, nflag;
+    unsigned int *flag, *flag_all;
+    int *vids;
+    NC_chk_req *req;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT_GET)
+
+    // Flag of touched vars
+    nflag = ncchkp->vars.cnt / 32 + 1;
+    flag  = (unsigned int *)NCI_Malloc (sizeof (int) * nflag * 2);
+    flag_all = flag + nflag;
+    memset (flag, 0, sizeof (int) * nflag);
+    for (i = 0; i < nreq; i++) {
+        req = ncchkp->getlist.reqs + reqids[i];
+        flag[req->varid >> 5] |= 1u << (req->varid % 32);
+    }
+
+    // Sync flag
+    CHK_ERR_ALLREDUCE (flag, flag_all, nflag, MPI_UNSIGNED, MPI_BOR, ncchkp->comm);
+
+    // Build a skip list of touched vars
+    nvar = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        if (flag_all[i >> 5] & (1u << (i % 32))) { nvar++; }
+    }
+    vids = (int *)NCI_Malloc (sizeof (int) * nvar);
+    nvar = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        if (flag_all[i >> 5] & (1u << (i % 32))) { vids[nvar++] = i; }
+    }
+
+    // Perform I/O for compressed variables
+    // ncchkioi_load_nvar(ncchkp, nvar, vids);
+
+    // Perform collective buffer
+    if (ncchkp->comm_unit == NC_CHK_COMM_CHUNK) {
+        err = ncchkioi_iget_cb_chunk (ncchkp, nreq, reqids, stats);
+    } else {
+        err = ncchkioi_iget_cb_proc (ncchkp, nreq, reqids, stats);
+        // ncchkioi_iget_cb_chunk(ncchkp, nreq, reqids, stats);
+    }
+    CHK_ERR
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CONVERT)
+    for (i = 0; i < nreq; i++) {
+        req = ncchkp->getlist.reqs + reqids[i];
+        if (req->buf != req->xbuf) {
+            void *cbuf = (void *)req->buf;
+
+            err = ncchkioiconvert (req->xbuf, cbuf, ncchkp->vars.data[req->varid].etype,
+                                   req->buftype, req->bufcount);
+            CHK_ERR
+
+            if (cbuf != req->buf) NCI_Free (cbuf);
+        }
+    }
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CONVERT)
+
+err_out:;
+
+    // Free buffers
+    NCI_Free (vids);
+    NCI_Free (flag);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_WAIT_GET)
+
+    return err;
+}
+
+int ncchkioi_wait (NC_chk *ncchkp, int nreqs, int *reqids, int *stats, int reqMode) {
+    int err = NC_NOERR;
+    int i;
+    int nput = 0, nget = 0;
+    int *putreqs = NULL, *getreqs = NULL;
+    int *putstats = NULL, *getstats = NULL;
+
+    if (nreqs == NC_REQ_ALL || nreqs == NC_PUT_REQ_ALL) {
+        nput    = ncchkp->putlist.nused;
+        putreqs = (int *)NCI_Malloc (sizeof (int) * nput);
+        CHK_PTR (putreqs)
+        memcpy (putreqs, ncchkp->putlist.ids, nput * sizeof (int));
+    }
+    if (nreqs == NC_REQ_ALL || nreqs == NC_GET_REQ_ALL) {
+        nget    = ncchkp->getlist.nused;
+        getreqs = (int *)NCI_Malloc (sizeof (int) * nget);
+        CHK_PTR (getreqs)
+        memcpy (getreqs, ncchkp->getlist.ids, nget * sizeof (int));
+    }
+
+    if (nreqs > 0) {
+        // Count number of get and put requests
+        for (i = 0; i < nreqs; i++) {
+            if (reqids[i] & 1) { nput++; }
+        }
+
+        // Allocate buffer
+        nget    = nreqs - nput;
+        putreqs = (int *)NCI_Malloc (sizeof (int) * nput);
+        CHK_PTR (putreqs)
+        getreqs = (int *)NCI_Malloc (sizeof (int) * nget);
+        CHK_PTR (getreqs)
+
+        // Build put and get req list
+        nput = nget = 0;
+        for (i = 0; i < nreqs; i++) {
+            if (reqids[i] & 1) {
+                putreqs[nput++] = reqids[i] >> 1;
+            } else {
+                getreqs[nget++] = reqids[i] >> 1;
+            }
+        }
+    }
+
+    if (ncchkp->delay_init) {
+        NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_WAIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META)
+
+        err = ncchkioi_init_nvar (ncchkp, nput, putreqs, nget, getreqs);  // nput + nget = real nreq
+        if (err != NC_NOERR) { return err; }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT)
+    } else {
+        NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_WAIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_RESIZE)
+
+        // Sync number of rec
+        err = ncchkioi_resize_nvar (ncchkp, nput, putreqs, nget, getreqs);  // nput + nget = real nreq
+        if (err != NC_NOERR) { return err; }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_RESIZE)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT)
+    }
+
+    if (stats != NULL) {
+        putstats = (int *)NCI_Malloc (sizeof (int) * nput);
+        CHK_PTR (putstats)
+        getstats = (int *)NCI_Malloc (sizeof (int) * nget);
+        CHK_PTR (getstats)
+        memset (putstats, 0, sizeof (int) * nput);
+        memset (getstats, 0, sizeof (int) * nget);
+    } else {
+        putstats = NULL;
+        getstats = NULL;
+    }
+
+    if ((ncchkp->mode & NC_WRITE) && nreqs != NC_GET_REQ_ALL) {
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT)
+        err = ncchkioi_wait_put_reqs (ncchkp, nput, putreqs, putstats);
+        CHK_ERR
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT)
+    }
+
+    if (nreqs != NC_PUT_REQ_ALL) {
+        NC_CHK_TIMER_START (NC_CHK_TIMER_GET)
+        err = ncchkioi_wait_get_reqs (ncchkp, nget, getreqs, getstats);
+        CHK_ERR
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET)
+    }
+
+    // Assign stats
+    if (stats != NULL) {
+        nput = nget = 0;
+        for (i = 0; i < nreqs; i++) {
+            if (reqids[i] & 1) {
+                stats[i] = putstats[nput++];
+            } else {
+                stats[i] = getstats[nget++];
+            }
+        }
+
+        NCI_Free (putstats);
+        NCI_Free (getstats);
+    }
+
+    // Remove from req list
+    for (i = 0; i < nput; i++) { ncchkioi_req_list_remove (&(ncchkp->putlist), putreqs[i]); }
+    for (i = 0; i < nget; i++) { ncchkioi_req_list_remove (&(ncchkp->getlist), getreqs[i]); }
+
+err_out:;
+    NCI_Free (putreqs);
+    NCI_Free (getreqs);
+
+    return err;
+}
diff --git a/src/include/dispatch.h b/src/include/dispatch.h
index 5251b2690..577aa8b99 100644
--- a/src/include/dispatch.h
+++ b/src/include/dispatch.h
@@ -161,6 +161,8 @@ extern PNC_driver* ncfoo_inq_driver(void);
 
 extern PNC_driver* ncbbio_inq_driver(void);
 
+extern PNC_driver* ncchkio_inq_driver(void);
+
 extern int PNC_check_id(int ncid, PNC **pncp);
 
 #endif /* H_PNC_DISPATCH */
diff --git a/src/include/pnetcdf.h.in b/src/include/pnetcdf.h.in
index 6ce7499c6..198fabbe1 100644
--- a/src/include/pnetcdf.h.in
+++ b/src/include/pnetcdf.h.in
@@ -794,6 +794,18 @@ extern int
 ncmpi_def_var(int ncid, const char *name, nc_type xtype, int ndims,
               const int *dimidsp, int *varidp);
 
+#define NC_FILTER_NONE    0
+#define NC_FILTER_DEFLATE 2
+#define NC_FILTER_SZ      3
+extern int
+ncmpi_var_set_chunk (int ncid, int varid, int *chunk_dim);
+extern int
+ncmpi_var_get_chunk (int ncid, int varid, int *chunk_dim);
+extern int
+ncmpi_var_set_filter (int ncid, int varid, int filter);
+extern int
+ncmpi_var_get_filter (int ncid, int varid, int *filter);
+
 extern int
 ncmpi_rename_dim(int ncid, int dimid, const char *name);
diff --git a/src/libs/Makefile.am b/src/libs/Makefile.am
index 17ea3ed75..e93cf6086 100644
--- a/src/libs/Makefile.am
+++ b/src/libs/Makefile.am
@@ -33,6 +33,9 @@ endif
 if ENABLE_BURST_BUFFER
    libpnetcdf_la_LIBADD += ../drivers/ncbbio/libncbbio.la
 endif
+if ENABLE_CHUNKING
+   libpnetcdf_la_LIBADD += ../drivers/ncchunkio/libncchkio.la
+endif
 if ENABLE_ADIOS
    libpnetcdf_la_LIBADD += ../drivers/ncadios/libncadios.la
 endif
diff --git a/test/nc_test/tst_atts3.c b/test/nc_test/tst_atts3.c
index d38c5a3a9..8ae357cb6 100644
--- a/test/nc_test/tst_atts3.c
+++ b/test/nc_test/tst_atts3.c
@@ -153,7 +153,7 @@ tst_atts3(char *filename, int cmode)
     /* Create a file with some global atts. */
     err=ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL,&ncid); ERR
     for (j = 0; j < NUM_SIMPLE_ATTS; j++) {
-       err=ncmpi_put_att_int(ncid, NC_GLOBAL, name[j], NC_INT, 0, NULL); ERR
+       err=ncmpi_put_att_int(ncid, NC_GLOBAL, name[j], NC_INT, 0, NULL); ERR
     }
     err=ncmpi_close(ncid); ERR
diff --git a/test/testcases/Makefile.am b/test/testcases/Makefile.am
index e094d790f..b0266dc8d 100644
--- a/test/testcases/Makefile.am
+++ b/test/testcases/Makefile.am
@@ -107,7 +107,8 @@ check_PROGRAMS = file_create_open \
                  put_all_kinds \
                  redef1 \
                  iput_all_kinds \
-                 tst_version
+                 tst_version \
+                 tst_chunk_nonblocking
 
 M4_SRCS = put_all_kinds.m4 \
           erange_fill.m4 \
diff --git a/test/testcases/tst_chunk_nonblocking.c b/test/testcases/tst_chunk_nonblocking.c
new file mode 100644
index 000000000..6cf7ce866
--- /dev/null
+++ b/test/testcases/tst_chunk_nonblocking.c
@@ -0,0 +1,107 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ *
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * This program tests the chunking feature when using nonblocking APIs and one
+ * of the processes makes no call to the API.
+ *
+ * Contributed by Danqing Wu.
+ *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libgen.h> /* basename() */
+
+#include <mpi.h>
+#include <pnetcdf.h>
+#include <testutils.h>
+
+
+#define DIM_LEN 8
+
+static
+int test_io(const char *out_path,
+            const char *in_path, /* ignored */
+            int format,
+            int coll_io,
+            MPI_Info info)
+{
+    int err, nerrs=0, ncid, dimid, varid, rank, req, verbose=0;
+    int vals[DIM_LEN] = {-1, -2, -3, -4, -5, -6, -7, -8};
+    MPI_Offset start, count;
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    /* enable chunking */
+    MPI_Info_set(info, "nc_chunking", "enable");
+
+    /* chunking is supported only when MPI-IO driver is used */
+    MPI_Info_set(info, "nc_pncio", "disable");
+
+    /* Set format. */
+    err = ncmpi_set_default_format(format, NULL);
+    CHECK_ERR
+
+    err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid);
+    CHECK_ERR
+
+    err = ncmpi_def_dim(ncid, "x", DIM_LEN, &dimid);
+    CHECK_ERR
+    err = ncmpi_def_var(ncid, "var", NC_INT, 1, &dimid, &varid);
+    CHECK_ERR
+
+    err = ncmpi_enddef(ncid);
+    CHECK_ERR
+
+    if (rank == 0)
+    {
+        start = 0;
+        count = DIM_LEN;
+        err = ncmpi_iput_vara_int(ncid, varid, &start, &count, vals, &req);
+        CHECK_ERR
+    }
+    else
+        req = NC_REQ_NULL;
+
+    if (verbose) printf("rank = %d, before ncmpi_wait_all\n", rank);
+    err = ncmpi_wait_all(ncid, 1, &req, NULL);
+    CHECK_ERR
+    if (verbose) printf("rank = %d, after ncmpi_wait_all\n", rank);
+
+    err = ncmpi_close(ncid);
+    CHECK_ERR
+
+    return nerrs;
+}
+
+int main(int argc, char **argv)
+{
+    int err;
+    int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA};
+
+    loop_opts opt;
+
+    MPI_Init(&argc, &argv);
+
+    opt.num_fmts = sizeof(formats) / sizeof(int);
+    opt.formats  = formats;
+    opt.ina      = 0; /* test intra-node aggregation */
+    opt.drv      = 0; /* test PNCIO driver */
+    opt.ind      = 0; /* test hint romio_no_indep_rw */
+    opt.chk      = 0; /* test hint nc_data_move_chunk_size */
+    opt.bb       = 0; /* test burst-buffering feature */
+    opt.mod      = 0; /* test independent data mode */
+    opt.hdr_diff = 0; /* run ncmpidiff for file header only */
+    opt.var_diff = 0; /* run ncmpidiff for variables */
+
+    err = tst_main(argc, argv, "chunking using nonblocking APIs", opt, test_io);
+
+    MPI_Finalize();
+
+    return err;
+}
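
For reference, below is a minimal sketch of how an application would drive the chunking API added by this patch. Only `ncmpi_var_set_chunk()`, `ncmpi_var_set_filter()`, `NC_FILTER_DEFLATE`, and the `"nc_chunking"` hint are taken from the patch itself; the file name, dimension sizes, chunk shape, and the assumption that both calls must be made in define mode are illustrative.

```c
#include <stdio.h>
#include <mpi.h>
#include <pnetcdf.h>

int main(int argc, char **argv) {
    int err, ncid, dimid[2], varid;
    int chunkdim[2] = {16, 16};   /* hypothetical chunk shape */
    MPI_Info info;

    MPI_Init(&argc, &argv);
    MPI_Info_create(&info);
    MPI_Info_set(info, "nc_chunking", "enable"); /* select the chunking driver */

    err = ncmpi_create(MPI_COMM_WORLD, "chunked.nc", NC_CLOBBER, info, &ncid);
    if (err != NC_NOERR) {
        fprintf(stderr, "Error: %s\n", ncmpi_strerror(err));
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    err = ncmpi_def_dim(ncid, "y", 128, &dimid[0]);
    err = ncmpi_def_dim(ncid, "x", 128, &dimid[1]);
    err = ncmpi_def_var(ncid, "var", NC_INT, 2, dimid, &varid);

    /* declare a 16x16 chunk layout and a zlib filter for this variable;
     * assumed to be legal only while still in define mode */
    err = ncmpi_var_set_chunk(ncid, varid, chunkdim);
    err = ncmpi_var_set_filter(ncid, varid, NC_FILTER_DEFLATE);

    err = ncmpi_enddef(ncid);
    /* ... write data with ncmpi_put_vara_int_all() or the nonblocking APIs ... */
    err = ncmpi_close(ncid);

    MPI_Info_free(&info);
    MPI_Finalize();
    return 0;
}
```

Note that `NC_FILTER_DEFLATE` requires a build configured with `--enable-chunking --enable-zlib`, and `NC_FILTER_SZ` likewise requires `--enable-sz`.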