diff --git a/src/dispatchers/file.c b/src/dispatchers/file.c index b4bc7a950..e54684835 100644 --- a/src/dispatchers/file.c +++ b/src/dispatchers/file.c @@ -39,7 +39,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; #define BUFREAD64(buf,var) memcpy(&var, buf, 8); if (diff_endian) swap_64(&var); #endif -/* Note accessing the following 3 global variables must be protected by a +/* Note accessing the following 5 global variables must be protected by a * mutex, otherwise it will not be thread safe. */ @@ -52,6 +52,12 @@ static int pnc_numfiles; */ static int ncmpi_default_create_format = NC_FORMAT_CLASSIC; +/* attribute to be cached in all communicators */ +static int pncio_node_ids_keyval = MPI_KEYVAL_INVALID; + +/* attribute to be cached in MPI_COMM_SELF */ +static int pncio_init_keyval = MPI_KEYVAL_INVALID; + #define NCMPII_HANDLE_ERROR(func) \ if (mpireturn != MPI_SUCCESS) { \ int errorStringLen; \ @@ -69,6 +75,146 @@ static int ncmpi_default_create_format = NC_FORMAT_CLASSIC; } \ } +/* struct PNCIO_node_ids is defined in dispatch.h */ + +/*----< PNCIO_node_ids_copy() >----------------------------------------------*/ +/* A function to be invoked when a communicator is duplicated, which adds a + * reference to the already allocated memory space storing node ID array. + */ +static +int PNCIO_node_ids_copy(MPI_Comm comm, + int keyval, + void *extra, + void *attr_inP, + void *attr_outP, + int *flag) +{ + PNCIO_node_ids *attr_in = (PNCIO_node_ids*) attr_inP; + PNCIO_node_ids **attr_out = (PNCIO_node_ids**)attr_outP; + + if (attr_in == NULL) + return MPI_ERR_KEYVAL; + else + attr_in->ref_count++; + + *attr_out = attr_in; + + *flag = 1; /* make a copy in the new communicator */ + + return MPI_SUCCESS; +} + +/*----< PNCIO_node_ids_delete() >--------------------------------------------*/ +/* Callback function to be called when a communicator is freed, which frees the + * allocated memory space of node ID array. + */ +static +int PNCIO_node_ids_delete(MPI_Comm comm, + int keyval, + void *attr_val, + void *extra) +{ + PNCIO_node_ids *node_ids = (PNCIO_node_ids*) attr_val; + + if (node_ids == NULL) + return MPI_ERR_KEYVAL; + else + node_ids->ref_count--; + + if (node_ids->ref_count <= 0) { + /* free the allocated array */ + if (node_ids->ids != NULL) + free(node_ids->ids); + free(node_ids); + } + return MPI_SUCCESS; +} + +/*----< PNCIO_end_call() >---------------------------------------------------*/ +/* Callback function to be called at MPI_Finalize(), which frees all cached + * attributes. + */ +static +int PNCIO_end_call(MPI_Comm comm, + int keyval, + void *attribute_val, + void *extra_state) +{ + /* Free all keyvals used by PnetCDF */ + + MPI_Comm_free_keyval(&keyval); /* free pncio_init_keyval */ + + if (pncio_node_ids_keyval != MPI_KEYVAL_INVALID) + MPI_Comm_free_keyval(&pncio_node_ids_keyval); + + return MPI_SUCCESS; +} + +/*----< set_get_comm_attr() >------------------------------------------------*/ +/* Create/set/get attributes into/from the MPI communicators passed in from the + * user application. + */ +static +void set_get_comm_attr(MPI_Comm comm, + PNCIO_node_ids *node_idsP) +{ + PNCIO_node_ids *node_ids; + + if (pncio_init_keyval == MPI_KEYVAL_INVALID) { + /* This is the first call ever to PnetCDF API. Creating key + * pncio_init_keyval is necessary for MPI_Finalize() to free key + * pncio_node_ids_keyval. + */ + MPI_Comm_create_keyval(MPI_NULL_COPY_FN, PNCIO_end_call, + &pncio_init_keyval, (void*)0); + MPI_Comm_set_attr(MPI_COMM_SELF, pncio_init_keyval, (void*)0); + } + + if (pncio_node_ids_keyval == MPI_KEYVAL_INVALID) { + MPI_Comm_create_keyval(PNCIO_node_ids_copy, PNCIO_node_ids_delete, + &pncio_node_ids_keyval, NULL); + /* ignore error, as it is not a critical error */ + } + + if (pncio_node_ids_keyval != MPI_KEYVAL_INVALID) { + int found, nprocs; + + MPI_Comm_get_attr(comm, pncio_node_ids_keyval, &node_ids, &found); + if (!found) { + /* Construct an array storing node IDs of all processes. Note the + * memory allocated for node_ids will be freed by + * PNCIO_node_ids_delete(), a callback function invoked when the + * MPI communicator is freed. + */ + node_ids = (PNCIO_node_ids*) malloc(sizeof(PNCIO_node_ids)); + node_ids->ref_count = 1; + + MPI_Comm_size(comm, &nprocs); + if (nprocs == 1) { + node_ids->num_nodes = 1; + node_ids->ids = (int*) malloc(sizeof(int)); + node_ids->ids[0] = 0; + } + else { + /* Constructing node IDs requires communication calls to + * MPI_Get_processor_name(), MPI_Gather(), and MPI_Bcast(). + */ + ncmpii_construct_node_list(comm, &node_ids->num_nodes, + &node_ids->ids); + } + + /* FYI. The same key pncio_node_ids_keyval can be added to + * different MPI communicators with same or different values. + */ + MPI_Comm_set_attr(comm, pncio_node_ids_keyval, node_ids); + } + /* else case: returned node_ids contains the cached value */ + + /* copy contents */ + *node_idsP = *node_ids; + } +} + /*----< new_id_PNCList() >---------------------------------------------------*/ /* Return a new ID (array index) from the PNC list, pnc_filelist[] that is * not used. Note the used elements in pnc_filelist[] may not be contiguous. @@ -348,6 +494,7 @@ ncmpi_create(MPI_Comm comm, void *ncp; PNC *pncp; PNC_driver *driver; + PNCIO_node_ids node_ids; #ifdef BUILD_DRIVER_FOO int enable_foo_driver=0; #endif @@ -358,6 +505,24 @@ ncmpi_create(MPI_Comm comm, MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); +#ifdef ENABLE_THREAD_SAFE + int perr; + perr = pthread_mutex_lock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_lock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); +#endif + + /* creating communicator attributes must be protected by a mutex */ + set_get_comm_attr(comm, &node_ids); + +#ifdef ENABLE_THREAD_SAFE + perr = pthread_mutex_unlock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_unlock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); +#endif + if (rank == 0) set_env_mode(&env_mode); @@ -523,9 +688,12 @@ ncmpi_create(MPI_Comm comm, return err; } - /* Duplicate comm, because users may free it (though unlikely). Note - * MPI_Comm_dup() is collective. We pass pncp->comm to drivers, so there - * is no need for a driver to duplicate it again. + /* Duplicate comm, because users may use it doing other point-to-point + * communication. When this happened, that communication can mess up with + * the PnetCDF/MPI-IO internal communication, particularly when in + * independent data mode. Note MPI_Comm_dup() is collective. We pass + * pncp->comm to drivers, so there is no need for a driver to duplicate it + * again. */ if (comm != MPI_COMM_WORLD && comm != MPI_COMM_SELF) { mpireturn = MPI_Comm_dup(comm, &pncp->comm); @@ -542,7 +710,7 @@ ncmpi_create(MPI_Comm comm, /* calling the driver's create subroutine */ err = driver->create(pncp->comm, pncp->path, cmode, *ncidp, env_mode, - combined_info, &ncp); + combined_info, node_ids, &ncp); if (status == NC_NOERR) status = err; if (combined_info != MPI_INFO_NULL) MPI_Info_free(&combined_info); if (status != NC_NOERR && status != NC_EMULTIDEFINE_CMODE) { @@ -591,6 +759,7 @@ ncmpi_open(MPI_Comm comm, void *ncp; PNC *pncp; PNC_driver *driver; + PNCIO_node_ids node_ids; #ifdef BUILD_DRIVER_FOO int enable_foo_driver=0; #endif @@ -601,6 +770,24 @@ ncmpi_open(MPI_Comm comm, MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); +#ifdef ENABLE_THREAD_SAFE + int perr; + perr = pthread_mutex_lock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_lock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); +#endif + + /* creating communicator attributes must be protected by a mutex */ + set_get_comm_attr(comm, &node_ids); + +#ifdef ENABLE_THREAD_SAFE + perr = pthread_mutex_unlock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_unlock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); +#endif + if (rank == 0) set_env_mode(&env_mode); @@ -754,9 +941,12 @@ ncmpi_open(MPI_Comm comm, err = new_id_PNCList(ncidp, pncp); if (err != NC_NOERR) return err; - /* Duplicate comm, because users may free it (though unlikely). Note - * MPI_Comm_dup() is collective. We pass pncp->comm to drivers, so there - * is no need for a driver to duplicate it again. + /* Duplicate comm, because users may use it doing other point-to-point + * communication. When this happened, that communication can mess up with + * the PnetCDF/MPI-IO internal communication, particularly when in + * independent data mode. Note MPI_Comm_dup() is collective. We pass + * pncp->comm to drivers, so there is no need for a driver to duplicate it + * again. */ if (comm != MPI_COMM_WORLD && comm != MPI_COMM_SELF) { mpireturn = MPI_Comm_dup(comm, &pncp->comm); @@ -772,7 +962,7 @@ ncmpi_open(MPI_Comm comm, /* calling the driver's open subroutine */ err = driver->open(pncp->comm, pncp->path, omode, *ncidp, env_mode, - combined_info, &ncp); + combined_info, node_ids, &ncp); if (status == NC_NOERR) status = err; if (combined_info != MPI_INFO_NULL) MPI_Info_free(&combined_info); if (status != NC_NOERR && status != NC_EMULTIDEFINE_OMODE && diff --git a/src/drivers/common/utils.c b/src/drivers/common/utils.c index ffac366ac..362a81ed1 100644 --- a/src/drivers/common/utils.c +++ b/src/drivers/common/utils.c @@ -184,8 +184,14 @@ ncmpii_construct_node_list(MPI_Comm comm, MPI_Gatherv(my_procname, my_procname_len, MPI_CHAR, NULL, NULL, NULL, MPI_CHAR, root, comm); - /* compute node IDs of each MPI process */ - node_ids = (int *) NCI_Malloc(sizeof(int) * (nprocs + 1)); + /* node_ids is an array storing the compute node IDs of all MPI processes + * in the MPI communicator supplied by the application program. Here, we + * use malloc() instead of NCI_Malloc, because node_ids will be freed when + * the communicator is freed. When communicator is MPI_COMM_WORLD or + * MPI_COMM_SELF, it is freed at MPI_Finalize() whose calls to free() + * cannot be tracked by PnetCDF. + */ + node_ids = (int *) malloc(sizeof(int) * (nprocs + 1)); if (rank == root) { /* all_procnames[] can tell us the number of nodes and number of diff --git a/src/drivers/nc4io/nc4io_driver.h b/src/drivers/nc4io/nc4io_driver.h index 2b321210b..bf1c0b6fd 100644 --- a/src/drivers/nc4io/nc4io_driver.h +++ b/src/drivers/nc4io/nc4io_driver.h @@ -31,11 +31,11 @@ struct NC_nc4 { extern int nc4io_create(MPI_Comm comm, const char *path, int cmode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, void **ncdp); extern int nc4io_open(MPI_Comm comm, const char *path, int omode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, void **ncdp); extern int nc4io_close(void *ncdp); diff --git a/src/drivers/nc4io/nc4io_file.c b/src/drivers/nc4io/nc4io_file.c index ddf3a6a40..d89119122 100644 --- a/src/drivers/nc4io/nc4io_file.c +++ b/src/drivers/nc4io/nc4io_file.c @@ -50,13 +50,14 @@ #include int -nc4io_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) /* OUT */ +nc4io_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { char *filename; int err, ncidtmp; @@ -109,13 +110,14 @@ nc4io_create(MPI_Comm comm, } int -nc4io_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) +nc4io_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { char *filename; int err, ncidtmp; diff --git a/src/drivers/ncadios/ncadios_driver.h b/src/drivers/ncadios/ncadios_driver.h index 8beddbd07..28432d1e1 100644 --- a/src/drivers/ncadios/ncadios_driver.h +++ b/src/drivers/ncadios/ncadios_driver.h @@ -92,11 +92,13 @@ struct NC_ad { extern int ncadios_create(MPI_Comm comm, const char *path, int cmode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncadios_open(MPI_Comm comm, const char *path, int omode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncadios_close(void *ncdp); diff --git a/src/drivers/ncadios/ncadios_file.c b/src/drivers/ncadios/ncadios_file.c index 65a25a66a..5ab07ec0c 100644 --- a/src/drivers/ncadios/ncadios_file.c +++ b/src/drivers/ncadios/ncadios_file.c @@ -47,26 +47,28 @@ #include int -ncadios_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) /* OUT */ +ncadios_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { /* Read only driver */ DEBUG_RETURN_ERROR(NC_ENOTSUPPORT); } int -ncadios_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) +ncadios_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { int err, parse_done=0; NC_ad *ncadp; diff --git a/src/drivers/ncbbio/ncbbio_driver.h b/src/drivers/ncbbio/ncbbio_driver.h index 6c940ca83..6c60b8042 100644 --- a/src/drivers/ncbbio/ncbbio_driver.h +++ b/src/drivers/ncbbio/ncbbio_driver.h @@ -261,11 +261,13 @@ void ncbbio_export_hint (NC_bb *ncbbp, MPI_Info *info); extern int ncbbio_create(MPI_Comm comm, const char *path, int cmode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncbbio_open(MPI_Comm comm, const char *path, int omode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncbbio_close (void *ncdp); diff --git a/src/drivers/ncbbio/ncbbio_file.c b/src/drivers/ncbbio/ncbbio_file.c index f6406dbf3..76f18b1f2 100644 --- a/src/drivers/ncbbio/ncbbio_file.c +++ b/src/drivers/ncbbio/ncbbio_file.c @@ -49,13 +49,14 @@ #include int -ncbbio_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) /* OUT */ +ncbbio_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { int err; void *ncp=NULL; @@ -66,7 +67,8 @@ ncbbio_create(MPI_Comm comm, driver = ncmpio_inq_driver(); if (driver == NULL) DEBUG_RETURN_ERROR(NC_ENOTNC) - err = driver->create(comm, path, cmode, ncid, env_mode, info, &ncp); + err = driver->create(comm, path, cmode, ncid, env_mode, info, node_ids, + &ncp); if (err != NC_NOERR) return err; /* Create a NC_bb object and save its driver pointer */ @@ -105,13 +107,14 @@ ncbbio_create(MPI_Comm comm, } int -ncbbio_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) +ncbbio_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { int err; void *ncp=NULL; @@ -121,7 +124,7 @@ ncbbio_open(MPI_Comm comm, driver = ncmpio_inq_driver(); if (driver == NULL) DEBUG_RETURN_ERROR(NC_ENOTNC) - err = driver->open(comm, path, omode, ncid, env_mode, info, &ncp); + err = driver->open(comm, path, omode, ncid, env_mode, info, node_ids, &ncp); if (err != NC_NOERR) return err; /* Create a NC_bb object and save its driver pointer */ diff --git a/src/drivers/ncmpio/ncmpio_NC.h b/src/drivers/ncmpio/ncmpio_NC.h index dda0469f1..995a083ff 100644 --- a/src/drivers/ncmpio/ncmpio_NC.h +++ b/src/drivers/ncmpio/ncmpio_NC.h @@ -406,10 +406,11 @@ struct NC { int num_subfiles; /* no. subfiles */ struct NC *ncp_sf; /* ncp of subfile */ MPI_Comm comm_sf; /* subfile MPI communicator */ + PNCIO_node_ids node_ids_sf; /* node IDs of subfile MPI communicator */ #endif int hdr_chunk; /* chunk size for reading header, one chunk at a time */ int data_chunk; /* chunk size for moving variables to higher offsets */ - int striping; /* PNCIO_STRIPING_AUTO or PNCIO_STRIPING_INHERIT */ + int nc_striping; /* PNCIO_STRIPING_AUTO or PNCIO_STRIPING_INHERIT */ MPI_Offset v_align; /* alignment of the beginning of fixed-size variables */ MPI_Offset r_align; /* file alignment for record variable section */ MPI_Offset info_v_align; /* v_align set in MPI Info object */ @@ -432,16 +433,15 @@ struct NC { MPI_Offset put_size; /* amount of writes committed so far in bytes */ MPI_Offset get_size; /* amount of reads committed so far in bytes */ - MPI_Comm comm; /* MPI communicator */ - int rank; /* MPI rank of this process */ - int nprocs; /* no. MPI processes */ - int num_nodes; /* no. unique compute nodes allocated */ - int *node_ids; /* [nprocs] node IDs of each rank */ - MPI_Info mpiinfo; /* used MPI info object */ - MPI_File collective_fh; /* MPI-IO file handle for collective mode */ - MPI_File independent_fh; /* MPI-IO file handle for independent mode */ - PNCIO_File *pncio_fh; /* PNCIO file handler */ - int fstype; /* file system type: PNCIO_LUSTRE, PNCIO_UFS */ + MPI_Comm comm; /* MPI communicator */ + int rank; /* MPI rank of this process */ + int nprocs; /* no. MPI processes */ + PNCIO_node_ids node_ids; /* node IDs of each rank */ + MPI_Info mpiinfo; /* used MPI info object */ + MPI_File collective_fh; /* MPI-IO file handle for collective mode */ + MPI_File independent_fh; /* MPI-IO file handle for independent mode */ + PNCIO_File *pncio_fh; /* PNCIO file handler */ + int fstype; /* file system type: PNCIO_LUSTRE, PNCIO_UFS */ NC_dimarray dims; /* dimensions defined */ NC_attrarray attrs; /* global attributes defined */ diff --git a/src/drivers/ncmpio/ncmpio_create.c b/src/drivers/ncmpio/ncmpio_create.c index ea8fe7a2e..9d318ffa3 100644 --- a/src/drivers/ncmpio/ncmpio_create.c +++ b/src/drivers/ncmpio/ncmpio_create.c @@ -34,13 +34,14 @@ /*----< ncmpio_create() >----------------------------------------------------*/ int -ncmpio_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - int env_mode, - MPI_Info user_info, /* user's and env info combined */ - void **ncpp) +ncmpio_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info user_info, /* user's and env info combined */ + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { char *filename, value[MPI_MAX_INFO_VAL + 1], *mpi_name; int rank, nprocs, mpiomode, err, mpireturn, default_format, file_exist=1; @@ -340,25 +341,27 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == /* initialize unlimited_id as no unlimited dimension yet defined */ ncp->dims.unlimited_id = -1; - /* Construct a list of unique IDs of compute nodes allocated to this job - * and save it in ncp->node_ids[nprocs], which contains node IDs of each - * rank. The node IDs are used either when intra-node aggregation (INA) is - * enabled or when using PnetCDF's PNCIO driver. + /* node_ids stores a list of unique IDs of compute nodes of all MPI ranks + * in the MPI communicator passed from the user application. It is a keyval + * attribute cached in the communicator. See src/dispatchers/file.c for + * details. The node IDs will be used when the intra-node aggregation (INA) + * is enabled and when PnetCDF's PNCIO driver is used. * * When intra-node aggregation (INA) is enabled, node IDs are used to * create a new MPI communicator consisting of the intra-node aggregators * only. The communicator will be used to call file open in MPI-IO or * PnetCDF's PNCIO driver. This means only intra-node aggregators will * perform file I/O in PnetCDF collective put and get operations. + * + * node_ids will be used to calculate cb_nodes, the number of MPI-IO/PNCIO + * aggregators (not INA aggregators). */ - ncp->node_ids = NULL; - err = ncmpii_construct_node_list(comm, &ncp->num_nodes, &ncp->node_ids); - if (err != NC_NOERR) return err; + ncp->node_ids = node_ids; /* When the total number of aggregators >= number of processes, disable * intra-node aggregation. */ - if (ncp->num_aggrs_per_node * ncp->num_nodes >= ncp->nprocs) + if (ncp->num_aggrs_per_node * node_ids.num_nodes >= ncp->nprocs) ncp->num_aggrs_per_node = 0; /* ncp->num_aggrs_per_node = 0, or > 0 is an indicator of whether the INA @@ -370,6 +373,12 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == ncp->ina_rank = -1; ncp->ina_node_list = NULL; if (ncp->num_aggrs_per_node > 0) { + /* Must duplicate node_ids, as node_ids.ids[] will be modified by + * ncmpio_ina_init(). + */ + ncp->node_ids.ids = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); + memcpy(ncp->node_ids.ids, node_ids.ids, sizeof(int) * ncp->nprocs); + /* Divide all ranks into groups. Each group is assigned one intra-node * aggregator. The following metadata related to intra-node aggregation * will be set up in ncmpio_ina_init(). @@ -380,7 +389,7 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == * ncp->ina_comm is an MPI communicator consisting of only intra-node * aggregators across all nodes, which will be used when calling * MPI_File_open(). For non-aggregator, it == MPI_COMM_NULL. - * ncp->node_ids[] will be modified to contain the nodes IDs of all + * ncp->node_ids.ids[] will be modified to contain the nodes IDs of all * intra-node aggregators, and will be passed to pncio_fh. */ err = ncmpio_ina_init(ncp); @@ -408,7 +417,7 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == /* If hint nc_striping is set to "auto" and hint striping_factor is not * set by the user, then set hint striping_factor to ncp->num_nodes. */ - if (ncp->striping == PNCIO_STRIPING_AUTO) { + if (ncp->nc_striping == PNCIO_STRIPING_AUTO) { int striping_factor=0; if (user_info != MPI_INFO_NULL) { MPI_Info_get(user_info, "striping_factor", MPI_MAX_INFO_VAL-1, @@ -417,7 +426,7 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == striping_factor = atoi(value); } if (striping_factor == 0) { - sprintf(value, "%d", ncp->num_nodes); + sprintf(value, "%d", ncp->node_ids.num_nodes); MPI_Info_set(user_info, "striping_factor", value); } } @@ -473,7 +482,6 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == /* When ncp->fstype != PNCIO_FSTYPE_MPIIO, use PnetCDF's PNCIO driver */ ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1, sizeof(PNCIO_File)); ncp->pncio_fh->file_system = ncp->fstype; - ncp->pncio_fh->num_nodes = ncp->num_nodes; ncp->pncio_fh->node_ids = ncp->node_ids; err = PNCIO_File_open(comm, filename, mpiomode, user_info, @@ -567,13 +575,16 @@ if (ncp->rank == 0) { NCI_Free(ncp->ina_node_list); ncp->ina_node_list = NULL; } - /* node_ids is no longer needed */ - if (ncp->node_ids != NULL) { - NCI_Free(ncp->node_ids); - ncp->node_ids = NULL; + if (ncp->num_aggrs_per_node > 0) { + /* node_ids is no longer needed. Note node_ids is duplicated above from + * the MPI communicator's cached keyval attribute when + * ncp->num_aggrs_per_node > 0. + */ + NCI_Free(ncp->node_ids.ids); + ncp->node_ids.ids = NULL; } if (ncp->pncio_fh != NULL) - ncp->pncio_fh->node_ids = NULL; + ncp->pncio_fh->node_ids.ids = NULL; return NC_NOERR; } diff --git a/src/drivers/ncmpio/ncmpio_driver.h b/src/drivers/ncmpio/ncmpio_driver.h index f0f01ec5f..2d9924d89 100644 --- a/src/drivers/ncmpio/ncmpio_driver.h +++ b/src/drivers/ncmpio/ncmpio_driver.h @@ -13,11 +13,13 @@ extern int ncmpio_create(MPI_Comm comm, const char *path, int cmode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncmpio_open(MPI_Comm comm, const char *path, int omode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncmpio_close(void *ncdp); diff --git a/src/drivers/ncmpio/ncmpio_file_misc.c b/src/drivers/ncmpio/ncmpio_file_misc.c index c1c97871c..76292c088 100644 --- a/src/drivers/ncmpio/ncmpio_file_misc.c +++ b/src/drivers/ncmpio/ncmpio_file_misc.c @@ -172,9 +172,9 @@ ncmpio_begin_indep_data(void *ncdp) ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File)); ncp->pncio_fh->file_system = ncp->fstype; - ncp->pncio_fh->num_nodes = 1; - ncp->pncio_fh->node_ids = (int*) NCI_Malloc(sizeof(int)); - ncp->pncio_fh->node_ids[0] = 0; + ncp->pncio_fh->node_ids.num_nodes = 1; + ncp->pncio_fh->node_ids.ids = (int*) NCI_Malloc(sizeof(int)); + ncp->pncio_fh->node_ids.ids[0] = 0; int omode = fClr(ncp->mpiomode, MPI_MODE_CREATE); @@ -196,8 +196,9 @@ ncmpio_begin_indep_data(void *ncdp) /* Add PnetCDF hints into ncp->mpiinfo */ ncmpio_hint_set(ncp, ncp->mpiinfo); - NCI_Free(ncp->pncio_fh->node_ids); - ncp->pncio_fh->node_ids = NULL; + NCI_Free(ncp->pncio_fh->node_ids.ids); + ncp->pncio_fh->node_ids.num_nodes = 0; + ncp->pncio_fh->node_ids.ids = NULL; return NC_NOERR; } diff --git a/src/drivers/ncmpio/ncmpio_intra_node.c b/src/drivers/ncmpio/ncmpio_intra_node.c index c69b6fe87..f85c54ca4 100644 --- a/src/drivers/ncmpio/ncmpio_intra_node.c +++ b/src/drivers/ncmpio/ncmpio_intra_node.c @@ -357,11 +357,12 @@ void heap_merge(int nprocs, * * The subroutine performs the following tasks. * 1. Make use of the affinity of each MPI process to its compute node, - * represented by ncp->num_nodes and ncp->node_ids[]. These two member of - * ncp should have been set from a call to ncmpii_construct_node_list() - * earlier during ncmpio_create() and ncmpio_open(). - * + ncp->num_nodes is the number of unique compute nodes. - * + ncp->node_ids[ncp->nprocs] contains node IDs for all processes. + * represented by ncp->node_ids.num_nodes and ncp->node_ids.ids[]. Note + * ncp->node_ids should have already been set from a call to + * ncmpii_construct_node_list() earlier during ncmpi_create() and + * ncmpi_open() at the dispatcher. + * + ncp->node_ids.num_nodes is the number of unique compute nodes. + * + ncp->node_ids.ids[ncp->nprocs] contains node IDs for all processes. * 2. Divide processes into groups, select aggregators, and determine whether * self process is an intra-node aggregator. * + ncp->my_aggr is rank ID of my aggregator. @@ -404,11 +405,11 @@ ncmpio_ina_init(NC *ncp) * entering this subroutine. Thus ncp->num_aggrs_per_node must be > 0. */ - /* ncp->node_ids[] has been established in ncmpii_construct_node_list() + /* ncp->node_ids.ids[] has been established in ncmpii_construct_node_list() * called in ncmpio_create() or ncmpio_open() before entering this * subroutine. my_node_id is this rank's node ID. */ - my_node_id = ncp->node_ids[ncp->rank]; + my_node_id = ncp->node_ids.ids[ncp->rank]; /* nprocs_my_node: the number of processes in my nodes * ranks_my_node[]: rank IDs of all processes in my node. @@ -419,7 +420,7 @@ ncmpio_ina_init(NC *ncp) my_rank_index = -1; nprocs_my_node = 0; for (i=0; inprocs; i++) { - if (ncp->node_ids[i] == my_node_id) { + if (ncp->node_ids.ids[i] == my_node_id) { if (i == ncp->rank) my_rank_index = nprocs_my_node; ranks_my_node[nprocs_my_node] = i; @@ -498,10 +499,10 @@ ncmpio_ina_init(NC *ncp) * so that only aggregators call MPI-IO functions to access the file. * * When using the PnetCDF's internal PNCIO driver, we can pass a list of - * node_ids of the new communicator to the PNCIO file handler, + * node IDs of the new communicator to the PNCIO file handler, * ncp->pncio_fh, so to prevent the driver from the repeated work of - * constructing the list of node IDs, node_ids. If using MPI-IO driver, - * then ROMIO will do this internally again anyway. + * constructing the list of node IDs, node_ids.ids[]. If using MPI-IO + * driver, then ROMIO will do this internally again anyway. */ do_io = (ncp->my_aggr == ncp->rank) ? 1 : 0; @@ -511,14 +512,14 @@ ncmpio_ina_init(NC *ncp) TRACE_COMM(MPI_Allgather)(&do_io, 1, MPI_INT, ncp->ina_node_list, 1, MPI_INT,ncp->comm); - /* Construct ncp->node_ids[] and ncp->ina_node_list[]. Their contents + /* Construct ncp->node_ids.ids[] and ncp->ina_node_list[]. Their contents * depend on the layout of MPI process allocation to the compute nodes. * The common layouts can be two kinds: * + cyclic - MPI ranks are assigned to nodes round-robin-ly, * + block - MPI ranks are assigned to a node and then move on to next. * * Below uses an example of nodes=3, nprocs=10, * num_aggrs_per_node=2. - * ncp->node_ids[] should be + * ncp->node_ids.ids[] should be * block process allocation: 0,0,0,0,1,1,1,2,2,2 * cyclic process allocation: 0,1,2,0,1,2,0,1,2,0 * Accordingly, ncp->ina_node_list[] can be two kinds @@ -526,7 +527,7 @@ ncmpio_ina_init(NC *ncp) * cyclic process allocation: 1,1,1,0,0,0,1,1,1,0 */ - /* ncp->node_ids[]: node IDs of processes in the new MPI communicator. + /* ncp->node_ids.ids[]: node IDs of processes in the new MPI communicator. * ncp->ina_node_list[]: the rank IDs of the new MPI communicator. */ ina_nprocs = 0; @@ -535,11 +536,11 @@ ncmpio_ina_init(NC *ncp) ina_nprocs++; /* count the total number of INA aggregators */ ncp->ina_node_list[j] = i; - /* Modify ncp->node_ids[] to store the node IDs of the processes in - * the new communicator. Note ncp->node_ids[] from now on is used - * by PnetCDF's PNCIO driver only. + /* Modify ncp->node_ids.ids[] to store the node IDs of the + * processes in the new communicator. Note ncp->node_ids.ids[] from + * now on is used by PnetCDF's PNCIO driver only. */ - ncp->node_ids[j] = ncp->node_ids[i]; + ncp->node_ids.ids[j] = ncp->node_ids.ids[i]; j++; } } diff --git a/src/drivers/ncmpio/ncmpio_open.c b/src/drivers/ncmpio/ncmpio_open.c index 5ca8e2997..c5540202a 100644 --- a/src/drivers/ncmpio/ncmpio_open.c +++ b/src/drivers/ncmpio/ncmpio_open.c @@ -32,13 +32,14 @@ /*----< ncmpio_open() >------------------------------------------------------*/ int -ncmpio_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - int env_mode, - MPI_Info user_info, /* user's and env info combined */ - void **ncpp) +ncmpio_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info user_info, /* user's and env info combined */ + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) { char *filename, value[MPI_MAX_INFO_VAL + 1], *mpi_name; int i, rank, nprocs, mpiomode, err, status=NC_NOERR, mpireturn, flag; @@ -130,28 +131,28 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == fSet(ncp->flags, env_mode); - /* Construct a list of unique IDs of compute nodes allocated to this job - * and save it in ncp->node_ids[nprocs], which contains node IDs of each - * rank. The node IDs are used either when intra-node aggregation is - * enabled or when using PnetCDF's PNCIO driver. + /* node_ids stores a list of unique IDs of compute nodes of all MPI ranks + * in the MPI communicator passed from the user application. It is a keyval + * attribute cached in the communicator. See src/dispatchers/file.c for + * details. The node IDs will be used when the intra-node aggregation (INA) + * is enabled and when PnetCDF's PNCIO driver is used. * - * When intra-node aggregation is enabled, node IDs are used to create a - * new MPI communicator consisting of the intra-node aggregators only. The - * communicator will be used to call file open in MPI-IO or PnetCDF's PNCIO - * driver. This means only intra-node aggregators will perform file I/O in - * PnetCDF collective put and get operations. + * When intra-node aggregation (INA) is enabled, node IDs are used to + * create a new MPI communicator consisting of the intra-node aggregators + * only. The communicator will be used to call file open in MPI-IO or + * PnetCDF's PNCIO driver. This means only intra-node aggregators will + * perform file I/O in PnetCDF collective put and get operations. + * + * node_ids will be used to calculate cb_nodes, the number of MPI-IO/PNCIO + * aggregators (not INA aggregators). */ - ncp->node_ids = NULL; - if (ncp->fstype != PNCIO_FSTYPE_MPIIO || ncp->num_aggrs_per_node != 0) { - err = ncmpii_construct_node_list(comm, &ncp->num_nodes, &ncp->node_ids); - if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err); + ncp->node_ids = node_ids; - /* When the total number of aggregators >= number of processes, disable - * intra-node aggregation. - */ - if (ncp->num_aggrs_per_node * ncp->num_nodes >= ncp->nprocs) - ncp->num_aggrs_per_node = 0; - } + /* When the total number of aggregators >= number of processes, disable + * intra-node aggregation. + */ + if (ncp->num_aggrs_per_node * node_ids.num_nodes >= ncp->nprocs) + ncp->num_aggrs_per_node = 0; /* ncp->num_aggrs_per_node = 0, or > 0 indicates whether this feature * is disabled or enabled globally for all processes. @@ -162,6 +163,12 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == ncp->ina_rank = -1; ncp->ina_node_list = NULL; if (ncp->num_aggrs_per_node > 0) { + /* Must duplicate node_ids, as node_ids.ids[] will be modified by + * ncmpio_ina_init(). + */ + ncp->node_ids.ids = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); + memcpy(ncp->node_ids.ids, node_ids.ids, sizeof(int) * ncp->nprocs); + /* Divide all ranks into groups. Each group is assigned with one * intra-node aggregator. The following metadata related to intra-node * aggregation will be set up. @@ -172,7 +179,7 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == * ncp->ina_comm will be created consisting of only intra-node * aggregators, which will be used when calling MPI_File_open(). * For non-aggregator, ncp->ina_comm == MPI_COMM_NULL. - * ncp->node_ids[] will be modified to contain the nodes IDs of + * ncp->node_ids.ids[] will be modified to contain the nodes IDs of * intra-node aggregators only, which will be passed to pncio_fh. */ err = ncmpio_ina_init(ncp); @@ -218,7 +225,6 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == /* When ncp->fstype != PNCIO_FSTYPE_MPIIO, use PnetCDF's PNCIO driver */ ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File)); ncp->pncio_fh->file_system = ncp->fstype; - ncp->pncio_fh->num_nodes = ncp->num_nodes; ncp->pncio_fh->node_ids = ncp->node_ids; err = PNCIO_File_open(comm, filename, mpiomode, user_info, @@ -296,13 +302,16 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == NCI_Free(ncp->ina_node_list); ncp->ina_node_list = NULL; } - /* node_ids is no longer needed */ - if (ncp->node_ids != NULL) { - NCI_Free(ncp->node_ids); - ncp->node_ids = NULL; + if (ncp->num_aggrs_per_node > 0) { + /* node_ids is no longer needed. Note node_ids is duplicated above from + * the MPI communicator's cached keyval attribute when + * ncp->num_aggrs_per_node > 0. + */ + NCI_Free(ncp->node_ids.ids); + ncp->node_ids.ids = NULL; } if (ncp->pncio_fh != NULL) - ncp->pncio_fh->node_ids = NULL; + ncp->pncio_fh->node_ids.ids = NULL; /* read header from file into NC object pointed by ncp -------------------*/ err = ncmpio_hdr_get_NC(ncp); diff --git a/src/drivers/ncmpio/ncmpio_subfile.c b/src/drivers/ncmpio/ncmpio_subfile.c index 59a8ed89f..c0d0bd855 100644 --- a/src/drivers/ncmpio/ncmpio_subfile.c +++ b/src/drivers/ncmpio/ncmpio_subfile.c @@ -129,10 +129,12 @@ subfile_create(NC *ncp) MPI_Info_set(info, "romio_lustre_start_iodevice", offset); MPI_Info_set(info, "striping_factor", "1"); */ + ncmpii_construct_node_list(ncp->comm_sf, &ncp->node_ids_sf.num_nodes, + &ncp->node_ids_sf.ids); void *ncp_sf; status = ncmpio_create(ncp->comm_sf, path_sf, ncp->iomode, ncp->ncid, - ncp->flags, info, &ncp_sf); + ncp->flags, info, ncp->node_ids_sf, &ncp_sf); if (status != NC_NOERR && myrank == 0) fprintf(stderr, "%s: error in creating file(%s): %s\n", __func__, path_sf, ncmpi_strerror(status)); @@ -186,9 +188,12 @@ ncmpio_subfile_open(NC *ncp) /* sprintf(path_sf, "%s%d/%s", path, color, file); */ sprintf(path_sf, "%s.subfile_%i.%s", ncp->path, color, "nc"); + ncmpii_construct_node_list(ncp->comm_sf, &ncp->node_ids_sf.num_nodes, + &ncp->node_ids_sf.ids); + void *ncp_sf; status = ncmpio_open(ncp->comm_sf, path_sf, ncp->iomode, ncp->ncid, - ncp->flags, MPI_INFO_NULL, &ncp_sf); + ncp->flags, MPI_INFO_NULL, ncp->node_ids_sf, &ncp_sf); ncp->ncp_sf = (NC*) ncp_sf; return status; @@ -200,6 +205,9 @@ int ncmpio_subfile_close(NC *ncp) int status = NC_NOERR; if (ncp->ncp_sf != NULL) { + if (ncp->node_ids_sf.ids != NULL) + free(ncp->node_ids_sf.ids); + status = ncmpio_close(ncp->ncp_sf); if (status != NC_NOERR) return status; ncp->ncp_sf = NULL; diff --git a/src/drivers/ncmpio/ncmpio_util.c b/src/drivers/ncmpio/ncmpio_util.c index 6961348a4..179bd7a4b 100644 --- a/src/drivers/ncmpio/ncmpio_util.c +++ b/src/drivers/ncmpio/ncmpio_util.c @@ -51,8 +51,10 @@ void ncmpio_hint_extract(NC *ncp, ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE; #ifdef ENABLE_SUBFILING - ncp->subfile_mode = 0; - ncp->num_subfiles = 0; + ncp->subfile_mode = 0; + ncp->num_subfiles = 0; + ncp->node_ids_sf.num_nodes = 0; + ncp->node_ids_sf.ids = NULL; #endif ncp->dims.hash_size = PNC_HSIZE_DIM; @@ -267,10 +269,10 @@ void ncmpio_hint_extract(NC *ncp, /* When creating a file, inherit file striping from the parent folder or * let PnetCDF to decide. */ - ncp->striping = PNCIO_STRIPING_AUTO; + ncp->nc_striping = PNCIO_STRIPING_AUTO; MPI_Info_get(info, "nc_striping", MPI_MAX_INFO_VAL-1, value, &flag); if (flag && strcasecmp(value, "inherit") == 0) - ncp->striping = PNCIO_STRIPING_INHERIT; + ncp->nc_striping = PNCIO_STRIPING_INHERIT; } /*----< ncmpio_hint_set() >--------------------------------------------------*/ @@ -384,7 +386,7 @@ void ncmpio_hint_set(NC *ncp, /* When creating a file, inherit file striping from the parent folder or * let PnetCDF to decide. */ - if (ncp->striping == PNCIO_STRIPING_AUTO) + if (ncp->nc_striping == PNCIO_STRIPING_AUTO) MPI_Info_set(info, "nc_striping", "auto"); else MPI_Info_set(info, "nc_striping", "inherit"); diff --git a/src/drivers/pncio/pncio.h b/src/drivers/pncio/pncio.h index 3ad9156fd..267509d53 100644 --- a/src/drivers/pncio/pncio.h +++ b/src/drivers/pncio/pncio.h @@ -25,6 +25,7 @@ #define FDTYPE int #include +#include #include #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) @@ -155,9 +156,7 @@ typedef struct { int file_system; /* type of file system */ int fd_sys; /* system file descriptor */ - int num_nodes; /* number of unique compute nodes from - * MPI_Get_processor_name() */ - int *node_ids; /* [nprocs] node IDs of each rank */ + PNCIO_node_ids node_ids;/* node IDs of each rank */ int access_mode; /* Access mode (sequential, append, etc.), * possibly modified to deal with * data sieving or deferred open */ diff --git a/src/drivers/pncio/pncio_lustre_open.c b/src/drivers/pncio/pncio_lustre_open.c index 76a0ba38b..b63e2f822 100644 --- a/src/drivers/pncio/pncio_lustre_open.c +++ b/src/drivers/pncio/pncio_lustre_open.c @@ -68,17 +68,17 @@ static int get_total_avail_osts(const char *path) { - char *dir_name=NULL, *path_dup=NULL; + char *dir_path=NULL, *path_copy=NULL; int err, ost_count=0, is_mdt=0; struct stat sb; - path_dup = NCI_Strdup(path); + path_copy = NCI_Strdup(path); - err = stat(path_dup, &sb); + err = stat(path_copy, &sb); if (errno == ENOENT) { /* file does not exist, try folder */ /* get the parent folder name */ - dir_name = dirname(path_dup); - err = stat(dir_name, &sb); + dir_path = dirname(path_copy); + err = stat(dir_path, &sb); } if (err != 0) { printf("Warning at %s (%d): path \"%s\" stat() failed (%s)\n", @@ -88,20 +88,20 @@ int get_total_avail_osts(const char *path) /* llapi_get_obd_count() only works for directories */ if (S_ISDIR(sb.st_mode)) - dir_name = (dir_name == NULL) ? path_dup : dir_name; + dir_path = (dir_path == NULL) ? path_copy : dir_path; else /* get the parent folder name */ - dir_name = dirname(path_dup); + dir_path = dirname(path_copy); - err = llapi_get_obd_count(dir_name, &ost_count, is_mdt); + err = llapi_get_obd_count(dir_path, &ost_count, is_mdt); if (err != 0) { printf("Warning at %d: path \"%s\" llapi_get_obd_count() failed (%s)\n", - __LINE__,dir_name,strerror(errno)); + __LINE__,dir_path,strerror(errno)); ost_count = 0; } err_out: - if (path_dup != NULL) NCI_Free(path_dup); + if (path_copy != NULL) NCI_Free(path_copy); return ost_count; } @@ -311,7 +311,7 @@ uint64_t get_striping(int fd, if (err != 0) { #ifdef PNETCDF_LUSTRE_DEBUG snprintf(int_str, 32, "%lu", *pattern); - printf("Error at %s (%d) llapi_layout_pattern_get() fails to get patter %s\n", + printf("Error at %s (%d) llapi_layout_pattern_get() fails to get pattern %s\n", __FILE__, __LINE__, PATTERN_STR(*pattern, int_str)); #endif goto err_out; @@ -428,7 +428,7 @@ int set_striping(const char *path, err = llapi_layout_stripe_size_set(layout, stripe_size); if (err != 0) { #ifdef PNETCDF_LUSTRE_DEBUG - printf("Error at %s (%d) llapi_layout_stripe_size_set() fails to set strpe size %lu (%s)\n", + printf("Error at %s (%d) llapi_layout_stripe_size_set() fails to set stripe size %lu (%s)\n", __FILE__, __LINE__, stripe_size, strerror(errno)); #endif goto err_out; @@ -470,7 +470,7 @@ int set_striping(const char *path, #ifdef PNETCDF_LUSTRE_DEBUG char int_str[32]; snprintf(int_str, 32, "%lu", pattern); - printf("Error at %s (%d) llapi_layout_pattern_set() fails ito set pattern %s (%s)\n", + printf("Error at %s (%d) llapi_layout_pattern_set() fails to set pattern %s (%s)\n", __FILE__, __LINE__, PATTERN_STR(pattern, int_str), strerror(errno)); #endif goto err_out; @@ -516,23 +516,23 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) MPI_Comm_rank(fd->comm, &rank); /* number of MPI processes running on each node */ - nprocs_per_node = (int *) NCI_Calloc(fd->num_nodes, sizeof(int)); + nprocs_per_node = (int *) NCI_Calloc(fd->node_ids.num_nodes, sizeof(int)); - for (i=0; inode_ids[i]]++; + for (i=0; inode_ids.ids[i]]++; /* construct rank IDs of MPI processes running on each node */ - ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->num_nodes); + ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->node_ids.num_nodes); ranks_per_node[0] = (int *) NCI_Malloc(sizeof(int) * nprocs); - for (i=1; inum_nodes; i++) + for (i=1; inode_ids.num_nodes; i++) ranks_per_node[i] = ranks_per_node[i - 1] + nprocs_per_node[i - 1]; - for (i=0; inum_nodes; i++) nprocs_per_node[i] = 0; + for (i=0; inode_ids.num_nodes; i++) nprocs_per_node[i] = 0; /* Populate ranks_per_node[], list of MPI ranks running on each node. * Populate nprocs_per_node[], number of MPI processes on each node. */ for (i=0; inode_ids[i]; + k = fd->node_ids.ids[i]; ranks_per_node[k][nprocs_per_node[k]] = i; nprocs_per_node[k]++; } @@ -540,9 +540,10 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) /* To save a call to MPI_Bcast(), all processes run the same codes below to * calculate num_aggr, the number of aggregators (later becomes cb_nodes). * - * The calculation is based on the number of compute nodes, fd->num_nodes, - * and processes per node, nprocs_per_node. At this moment, all processes - * should have obtained the Lustre file striping settings. + * The calculation is based on the number of compute nodes, + * fd->node_ids.num_nodes, and processes per node, nprocs_per_node. At this + * moment, all processes should have obtained the Lustre file striping + * settings. */ striping_factor = fd->hints->striping_factor; @@ -581,11 +582,14 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) */ if (fd->hints->cb_nodes == 0) { /* User did not set hint "cb_nodes" */ - if (nprocs >= striping_factor * 8 && nprocs/fd->num_nodes >= 8) + if (nprocs >= striping_factor * 8 && + nprocs/fd->node_ids.num_nodes >= 8) num_aggr = striping_factor * 8; - else if (nprocs >= striping_factor * 4 && nprocs/fd->num_nodes >= 4) + else if (nprocs >= striping_factor * 4 && + nprocs/fd->node_ids.num_nodes >= 4) num_aggr = striping_factor * 4; - else if (nprocs >= striping_factor * 2 && nprocs/fd->num_nodes >= 2) + else if (nprocs >= striping_factor * 2 && + nprocs/fd->node_ids.num_nodes >= 2) num_aggr = striping_factor * 2; else num_aggr = striping_factor; @@ -646,10 +650,10 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) * Aggregator 3, running on node 0, access OST 7. */ int max_nprocs_node = 0; - for (i=0; inum_nodes; i++) + for (i=0; inode_ids.num_nodes; i++) max_nprocs_node = MAX(max_nprocs_node, nprocs_per_node[i]); - int max_naggr_node = striping_factor / fd->num_nodes; - if (striping_factor % fd->num_nodes) max_naggr_node++; + int max_naggr_node = striping_factor / fd->node_ids.num_nodes; + if (striping_factor % fd->node_ids.num_nodes) max_naggr_node++; /* max_naggr_node is the max number of processes per node to be picked * as aggregator in each round. */ @@ -684,7 +688,7 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) } #endif - if (striping_factor <= fd->num_nodes) { + if (striping_factor <= fd->node_ids.num_nodes) { /* When number of OSTs is less than number of compute nodes, first * select number of nodes equal to the number of OSTs by spread the * selection evenly across all compute nodes (i.e. with a stride @@ -700,9 +704,9 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) if (block_assignment) { int n=0; int remain = num_aggr % striping_factor; - int node_stride = fd->num_nodes / striping_factor; + int node_stride = fd->node_ids.num_nodes / striping_factor; /* walk through each node and pick aggregators */ - for (j=0; jnum_nodes; j+=node_stride) { + for (j=0; jnode_ids.num_nodes; j+=node_stride) { /* Selecting node IDs with a stride. j is the node ID */ int nranks_per_node = num_aggr / striping_factor; /* front nodes may have 1 more to pick */ @@ -712,7 +716,7 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) /* Selecting rank IDs within node j with a stride */ fd->hints->ranklist[n] = ranks_per_node[j][k*rank_stride]; if (++n == num_aggr) { - j = fd->num_nodes; /* break loop j */ + j = fd->node_ids.num_nodes; /* break loop j */ break; /* loop k */ } } @@ -720,7 +724,7 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) } else { int avg = num_aggr / striping_factor; - int stride = fd->num_nodes / striping_factor; + int stride = fd->node_ids.num_nodes / striping_factor; if (num_aggr % striping_factor) avg++; for (i = 0; i < num_aggr; i++) { /* j is the selected node ID. This selection is round-robin @@ -733,23 +737,23 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) } } } - else { /* striping_factor > fd->num_nodes */ + else { /* striping_factor > fd->node_ids.num_nodes */ /* When number of OSTs is more than number of compute nodes, I/O * aggregators are selected from all nodes. Within each node, * aggregators are spread evenly instead of the first few ranks. */ int *naggr_per_node, *idx_per_node, avg; - idx_per_node = (int*) NCI_Calloc(fd->num_nodes, sizeof(int)); - naggr_per_node = (int*) NCI_Malloc(fd->num_nodes * sizeof(int)); - for (i = 0; i < striping_factor % fd->num_nodes; i++) - naggr_per_node[i] = striping_factor / fd->num_nodes + 1; - for (; i < fd->num_nodes; i++) - naggr_per_node[i] = striping_factor / fd->num_nodes; + idx_per_node = (int*) NCI_Calloc(fd->node_ids.num_nodes, sizeof(int)); + naggr_per_node = (int*) NCI_Malloc(fd->node_ids.num_nodes * sizeof(int)); + for (i = 0; i < striping_factor % fd->node_ids.num_nodes; i++) + naggr_per_node[i] = striping_factor / fd->node_ids.num_nodes + 1; + for (; i < fd->node_ids.num_nodes; i++) + naggr_per_node[i] = striping_factor / fd->node_ids.num_nodes; avg = num_aggr / striping_factor; if (avg > 0) - for (i = 0; i < fd->num_nodes; i++) + for (i = 0; i < fd->node_ids.num_nodes; i++) naggr_per_node[i] *= avg; - for (i = 0; i < fd->num_nodes; i++) + for (i = 0; i < fd->node_ids.num_nodes; i++) naggr_per_node[i] = MIN(naggr_per_node[i], nprocs_per_node[i]); /* naggr_per_node[] is the number of aggregators that can be * selected as I/O aggregators @@ -757,14 +761,14 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) if (block_assignment) { int n = 0; - for (j=0; jnum_nodes; j++) { + for (j=0; jnode_ids.num_nodes; j++) { /* j is the node ID */ int rank_stride = nprocs_per_node[j] / naggr_per_node[j]; /* try stride==1 seems no effect, rank_stride = 1; */ for (k=0; khints->ranklist[n] = ranks_per_node[j][k*rank_stride]; if (++n == num_aggr) { - j = fd->num_nodes; /* break loop j */ + j = fd->node_ids.num_nodes; /* break loop j */ break; /* loop k */ } } @@ -773,7 +777,7 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) else { for (i = 0; i < num_aggr; i++) { int stripe_i = i % striping_factor; - j = stripe_i % fd->num_nodes; /* to select from node j */ + j = stripe_i % fd->node_ids.num_nodes; /* select from node j */ k = nprocs_per_node[j] / naggr_per_node[j]; k *= idx_per_node[j]; /* try stride==1 seems no effect, k = idx_per_node[j]; */ @@ -816,7 +820,7 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) * 2. root sets and obtains striping info * 3. root broadcasts striping info * 4. non-root processes receive striping info from root - * 5. non-root processes opens the fie + * 5. non-root processes opens the file */ int PNCIO_Lustre_create(PNCIO_File *fd, @@ -956,7 +960,7 @@ assert(mpi_io_mode & MPI_MODE_CREATE); */ if (str_factor == 0 && (stripe_count == LLAPI_LAYOUT_DEFAULT || stripe_count == LLAPI_LAYOUT_WIDE)) { - stripe_count = MIN(fd->num_nodes, total_num_OSTs); + stripe_count = MIN(fd->node_ids.num_nodes, total_num_OSTs); if (overstriping_ratio > 1) stripe_count *= overstriping_ratio; } else if (str_factor > 0) diff --git a/src/drivers/pncio/pncio_open.c b/src/drivers/pncio/pncio_open.c index a8d1c7d6a..c7ec8539b 100644 --- a/src/drivers/pncio/pncio_open.c +++ b/src/drivers/pncio/pncio_open.c @@ -43,7 +43,7 @@ int GEN_set_cb_node_list(PNCIO_File *fd) /* If hint cb_nodes is not set by user, select one rank per node to be * an I/O aggregator */ - fd->hints->cb_nodes = fd->num_nodes; + fd->hints->cb_nodes = fd->node_ids.num_nodes; else if (fd->hints->cb_nodes > nprocs) /* cb_nodes must be <= nprocs */ fd->hints->cb_nodes = nprocs; @@ -53,23 +53,23 @@ int GEN_set_cb_node_list(PNCIO_File *fd) return NC_ENOMEM; /* number of MPI processes running on each node */ - nprocs_per_node = (int *) NCI_Calloc(fd->num_nodes, sizeof(int)); + nprocs_per_node = (int *) NCI_Calloc(fd->node_ids.num_nodes, sizeof(int)); - for (i=0; inode_ids[i]]++; + for (i=0; inode_ids.ids[i]]++; /* construct rank IDs of MPI processes running on each node */ - ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->num_nodes); + ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->node_ids.num_nodes); ranks_per_node[0] = (int *) NCI_Malloc(sizeof(int) * nprocs); - for (i=1; inum_nodes; i++) + for (i=1; inode_ids.num_nodes; i++) ranks_per_node[i] = ranks_per_node[i - 1] + nprocs_per_node[i - 1]; - for (i=0; inum_nodes; i++) nprocs_per_node[i] = 0; + for (i=0; inode_ids.num_nodes; i++) nprocs_per_node[i] = 0; /* Populate ranks_per_node[], list of MPI ranks running on each node. * Populate nprocs_per_node[], number of MPI processes on each node. */ for (i=0; inode_ids[i]; + k = fd->node_ids.ids[i]; ranks_per_node[k][nprocs_per_node[k]] = i; nprocs_per_node[k]++; } @@ -81,7 +81,7 @@ int GEN_set_cb_node_list(PNCIO_File *fd) for (i=0; ihints->cb_nodes; i++) { if (j >= nprocs_per_node[k]) { /* if run out of ranks in this node k */ k++; - if (k == fd->num_nodes) { /* round-robin to first node */ + if (k == fd->node_ids.num_nodes) { /* round-robin to first node */ k = 0; j++; } @@ -92,7 +92,7 @@ int GEN_set_cb_node_list(PNCIO_File *fd) fd->is_agg = 1; fd->my_cb_nodes_index = i; } - if (k == fd->num_nodes) { /* round-robin to first node */ + if (k == fd->node_ids.num_nodes) { /* round-robin to first node */ k = 0; j++; } diff --git a/src/include/dispatch.h b/src/include/dispatch.h index 2075ac455..5251b2690 100644 --- a/src/include/dispatch.h +++ b/src/include/dispatch.h @@ -44,10 +44,18 @@ typedef enum { API_VARM } NC_api; +typedef struct { + int ref_count ; /* reference count */ + int num_nodes; /* number of unique compute nodes */ + int *ids; /* [nprocs] node ID of each MPI process */ +} PNCIO_node_ids; + struct PNC_driver { /* APIs manipulate files */ - int (*create)(MPI_Comm, const char*, int, int, int, MPI_Info, void**); - int (*open)(MPI_Comm, const char*, int, int, int, MPI_Info, void**); + int (*create)(MPI_Comm, const char*, int, int, int, MPI_Info, + PNCIO_node_ids, void**); + int (*open)(MPI_Comm, const char*, int, int, int, MPI_Info, + PNCIO_node_ids, void**); int (*close)(void*); int (*enddef)(void*); int (*_enddef)(void*,MPI_Offset,MPI_Offset,MPI_Offset,MPI_Offset);