diff --git a/.github/workflows/local_ci.yml b/.github/workflows/local_ci.yml index 05e47ceef..7de0dd3ac 100644 --- a/.github/workflows/local_ci.yml +++ b/.github/workflows/local_ci.yml @@ -32,7 +32,7 @@ jobs: - name: Checkout repo uses: actions/checkout@v3 with: - ref: OP2_refactor + token: ${{ github.token }} # 2. Debug info --------------------------------------- - name: Print runner info diff --git a/CHANGELOG.md b/CHANGELOG.md index 845303233..c8b2c61fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,7 +43,7 @@ Major changes since `v1.1.0` (high-level). - `op_get_global_set_offset` return type changed from `int` to `idx_g_t`. - `SafeLong` debug wrapper (`op2/include/SafeLong.h`, `op2/src/core/SafeLong.cpp`): optional arithmetic overflow/underflow checker for `idx_g_t`, enabled via `-DUSE_SAFELONG`. - `op_arg_idx` / `op_arg_info` support in C (previously Fortran only); 2-dim map variant added for Fortran. -- `op_timing2`: improved timing and instrumentation API (`op2/include/op_timing2.h`). +- **`op_profile`**: tree-based timing and instrumentation API (`op2/include/op_profile.h`), replacing the interim `op_timing2` name. Functions: `op_profile_start`, `op_profile_enter`, `op_profile_enter_kernel`, `op_profile_next`, `op_profile_exit`, `op_profile_end`, `op_profile_output`, `op_profile_output_json`. Controlled by `OP_PROFILE_LEVEL` (0–3) and `OP_PROFILE_JSON_OUTPUT` environment variables. Fortran bindings provided with identical names. - `op_mpi_probe_halo_index`, `op_force_part`: new MPI utility routines. - `op_reset_data_ptr` `real(4)` variants added (Fortran). - `op_get_global_set_offset` and Fortran bindings for `op_mpi_get_data`. 
diff --git a/CODEBASE_OVERVIEW.md b/CODEBASE_OVERVIEW.md index 2bd3d31b5..15a5ee77f 100644 --- a/CODEBASE_OVERVIEW.md +++ b/CODEBASE_OVERVIEW.md @@ -110,7 +110,7 @@ OP2-Common/ | `op_mpi_core.h` | MPI halo data structures: `halo_list`, import/export lists, MPI comms | | `op_lib_mpi.h` | MPI runtime state: exec/non-exec halo lists, partition tables | | `op_hdf5.h` | Parallel HDF5 I/O API | -| `op_timing2.h` | Tree-based timing instrumentation (JSON output, 4 detail levels) | +| `op_profile.h` | Tree-based timing instrumentation (JSON output, 4 detail levels) | | `op_util.h` | Utility functions | | `SafeLong.h` | Debug wrapper type `SafeLong` for `idx_g_t` — detects integer overflow/underflow at runtime (enabled via `-DUSE_SAFELONG`) | | `fortran/` | Fortran C-interop headers | diff --git a/apps/c/aero/aero_hdf5/aero.cpp b/apps/c/aero/aero_hdf5/aero.cpp index db75acfcc..f8db3639c 100644 --- a/apps/c/aero/aero_hdf5/aero.cpp +++ b/apps/c/aero/aero_hdf5/aero.cpp @@ -46,6 +46,7 @@ double gm1, gm1i, wtg1[2], xi1[2], Ng1[4], Ng1_xi[4], wtg2[4], Ng2[16], // #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -202,8 +203,7 @@ int main(int argc, char **argv) { ncell = op_get_size(cells); nbnodes = op_get_size(bnodes); - double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Aero"); // main time-marching loop @@ -324,8 +324,7 @@ int main(int argc, char **argv) { } } - op_timing_output(); - op_timers(&cpu_t2, &wall_t2); - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_end(); + op_profile_output(); op_exit(); } diff --git a/apps/c/aero/aero_plain/aero.cpp b/apps/c/aero/aero_plain/aero.cpp index 7404121c9..7e46ce7cb 100644 --- a/apps/c/aero/aero_plain/aero.cpp +++ b/apps/c/aero/aero_plain/aero.cpp @@ -46,6 +46,7 @@ double gm1, gm1i, wtg1[2], xi1[2], Ng1[4], Ng1_xi[4], wtg2[4], Ng2[16], // #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -263,8 +264,7 @@ int main(int argc, 
char **argv) { op_diagnostic_output(); - double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Aero"); // main fixpoint iteration loop @@ -387,8 +387,7 @@ int main(int argc, char **argv) { } } - op_timers(&cpu_t2, &wall_t2); - op_timing_output(); - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_end(); + op_profile_output(); op_exit(); } diff --git a/apps/c/aero/aero_plain/aero_mpi.cpp b/apps/c/aero/aero_plain/aero_mpi.cpp index af8fbddb1..1de71bffd 100644 --- a/apps/c/aero/aero_plain/aero_mpi.cpp +++ b/apps/c/aero/aero_plain/aero_mpi.cpp @@ -53,6 +53,7 @@ double gm1, gm1i, wtg1[2], xi1[2], Ng1[4], Ng1_xi[4], wtg2[4], Ng2[16], #include "op_lib_mpi.h" #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -147,7 +148,6 @@ int main(int argc, char **argv) { MPI_Comm_size(MPI_COMM_WORLD, &comm_size); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; int *bnode, *cell, *g_bnode, *g_cell; double *xm, *g_xm; @@ -369,7 +369,7 @@ int main(int argc, char **argv) { niter = 20; // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Aero"); for (int iter = 1; iter <= niter; iter++) { op_par_loop(res_calc, "res_calc", cells, @@ -486,8 +486,7 @@ int main(int argc, char **argv) { } } } - op_timers(&cpu_t2, &wall_t2); - op_timing_output(); - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_end(); + op_profile_output(); op_exit(); } diff --git a/apps/c/airfoil/airfoil_hdf5/dp/airfoil.cpp b/apps/c/airfoil/airfoil_hdf5/dp/airfoil.cpp index 161526bcf..182bad42a 100644 --- a/apps/c/airfoil/airfoil_hdf5/dp/airfoil.cpp +++ b/apps/c/airfoil/airfoil_hdf5/dp/airfoil.cpp @@ -55,6 +55,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4]; // #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -83,7 +84,6 @@ int main(int argc, char **argv) { double rms; // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; // set constants 
and initialise flow field and residual op_printf("initialising flow field \n"); @@ -158,7 +158,7 @@ int main(int argc, char **argv) { int g_ncell = op_get_size(cells); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); // main time-marching loop @@ -240,7 +240,7 @@ int main(int argc, char **argv) { } } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); // write given op_dat's indicated segment of data to a memory block in the // order it was originally @@ -262,7 +262,6 @@ int main(int argc, char **argv) { // compress using // ~/hdf5/bin/h5repack -f GZIP=9 new_grid.h5 new_grid_pack.h5 - op_timing_output(); - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_output(); op_exit(); } diff --git a/apps/c/airfoil/airfoil_hdf5/sp/airfoil.cpp b/apps/c/airfoil/airfoil_hdf5/sp/airfoil.cpp index bad85a2a5..e8350d030 100644 --- a/apps/c/airfoil/airfoil_hdf5/sp/airfoil.cpp +++ b/apps/c/airfoil/airfoil_hdf5/sp/airfoil.cpp @@ -55,6 +55,7 @@ float gam, gm1, cfl, eps, mach, alpha, qinf[4]; // #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -76,7 +77,6 @@ int main(int argc, char **argv) { float rms; // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; // set constants and initialise flow field and residual op_printf("initialising flow field \n"); @@ -129,7 +129,7 @@ int main(int argc, char **argv) { int g_ncell = op_get_size(cells); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); // main time-marching loop @@ -210,9 +210,8 @@ int main(int argc, char **argv) { } } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); - op_timing_output(); - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_output(); op_exit(); } diff --git a/apps/c/airfoil/airfoil_plain/dp/airfoil.cpp b/apps/c/airfoil/airfoil_plain/dp/airfoil.cpp index 6353a942c..1ff154caa 100644 --- 
a/apps/c/airfoil/airfoil_plain/dp/airfoil.cpp +++ b/apps/c/airfoil/airfoil_plain/dp/airfoil.cpp @@ -55,6 +55,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4]; // #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -79,7 +80,6 @@ int main(int argc, char **argv) { double rms, maxerr; // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; // read in grid @@ -212,7 +212,7 @@ int main(int argc, char **argv) { op_diagnostic_output(); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); // main time-marching loop @@ -296,7 +296,7 @@ int main(int argc, char **argv) { } } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); // output the result dat array to files op_print_dat_to_txtfile(p_q, "out_grid_seq.dat"); // ASCI @@ -309,8 +309,7 @@ int main(int argc, char **argv) { op_fetch_data_idx(p_q, q_part, 0, op_get_size(cells) - 1); free(q_part); - op_timing_output(); - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_output(); op_exit(); } diff --git a/apps/c/airfoil/airfoil_plain/dp/airfoil_mpi.cpp b/apps/c/airfoil/airfoil_plain/dp/airfoil_mpi.cpp index 5564a18eb..102259a6c 100644 --- a/apps/c/airfoil/airfoil_plain/dp/airfoil_mpi.cpp +++ b/apps/c/airfoil/airfoil_plain/dp/airfoil_mpi.cpp @@ -63,6 +63,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4]; #include "op_lib_mpi.h" #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -154,7 +155,6 @@ int main(int argc, char **argv) { MPI_Comm_size(MPI_COMM_WORLD, &comm_size); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; int *becell, *ecell, *bound, *bedge, *edge, *cell; double *x, *q, *qold, *adt, *res; @@ -164,7 +164,7 @@ int main(int argc, char **argv) { /**------------------------BEGIN I/O and PARTITIONING -------------------**/ - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); /* read in grid from disk on root processor */ FILE *fp; @@ -310,9 +310,6 @@ int main(int argc, char 
**argv) { free(g_res); } - op_timers(&cpu_t2, &wall_t2); - op_printf("Max total file read time = %f\n", wall_t2 - wall_t1); - /**------------------------END I/O and PARTITIONING -----------------------**/ // declare sets, pointers, datasets and global constants @@ -361,7 +358,6 @@ int main(int argc, char **argv) { op_partition("PARMETIS", "KWAY", cells, pecell, p_x); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); niter = 1000; for (int iter = 1; iter <= niter; iter++) { @@ -440,7 +436,7 @@ int main(int argc, char **argv) { } } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); // output the result dat array to files op_print_dat_to_txtfile(p_q, "out_grid_mpi.dat"); // ASCI @@ -453,8 +449,7 @@ int main(int argc, char **argv) { op_fetch_data_idx(p_q, q_part, 0, op_get_size(cells) - 1); free(q_part); - op_timing_output(); - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_output(); op_exit(); } diff --git a/apps/c/airfoil/airfoil_plain/sp/airfoil.cpp b/apps/c/airfoil/airfoil_plain/sp/airfoil.cpp index 67f287ee3..d96dbe43f 100644 --- a/apps/c/airfoil/airfoil_plain/sp/airfoil.cpp +++ b/apps/c/airfoil/airfoil_plain/sp/airfoil.cpp @@ -55,6 +55,7 @@ float gam, gm1, cfl, eps, mach, alpha, qinf[4]; // #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -79,7 +80,6 @@ int main(int argc, char **argv) { float rms; // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; // read in grid @@ -212,7 +212,7 @@ int main(int argc, char **argv) { op_diagnostic_output(); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); // main time-marching loop @@ -291,9 +291,8 @@ int main(int argc, char **argv) { } } - op_timers(&cpu_t2, &wall_t2); - op_timing_output(); - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_end(); + op_profile_output(); op_exit(); } diff --git a/apps/c/airfoil/airfoil_plain/sp/airfoil_mpi.cpp 
b/apps/c/airfoil/airfoil_plain/sp/airfoil_mpi.cpp index 3561a8196..8af10ea67 100644 --- a/apps/c/airfoil/airfoil_plain/sp/airfoil_mpi.cpp +++ b/apps/c/airfoil/airfoil_plain/sp/airfoil_mpi.cpp @@ -63,6 +63,7 @@ float gam, gm1, cfl, eps, mach, alpha, qinf[4]; #include "op_lib_mpi.h" #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -153,7 +154,6 @@ int main(int argc, char **argv) { MPI_Comm_size(MPI_COMM_WORLD, &comm_size); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; int *becell, *ecell, *bound, *bedge, *edge, *cell; float *x, *q, *qold, *adt, *res; @@ -163,7 +163,7 @@ int main(int argc, char **argv) { /**------------------------BEGIN I/O and PARTITIONING -------------------**/ - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); /* read in grid from disk on root processor */ FILE *fp; @@ -309,9 +309,6 @@ int main(int argc, char **argv) { free(g_res); } - op_timers(&cpu_t2, &wall_t2); - op_printf("Max total file read time = %f\n", wall_t2 - wall_t1); - /**------------------------END I/O and PARTITIONING -----------------------**/ // declare sets, pointers, datasets and global constants @@ -359,7 +356,6 @@ int main(int argc, char **argv) { op_partition("PTSCOTCH", "KWAY", NULL, pecell, p_x); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); niter = 1000; for (int iter = 1; iter <= niter; iter++) { @@ -432,7 +428,7 @@ int main(int argc, char **argv) { } } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); // get results data array - perhaps can be later handled by a remporary dat // op_dat temp = op_mpi_get_data(p_q); @@ -441,9 +437,8 @@ int main(int argc, char **argv) { // print_dat_tofile(temp, "out_grid.dat"); //ASCI // print_dat_tobinfile(temp, "out_grid.bin"); //Binary - op_timing_output(); + op_profile_output(); // print total time for niter interations - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); op_exit(); } diff --git a/apps/c/airfoil/airfoil_tempdats/dp/airfoil.cpp 
b/apps/c/airfoil/airfoil_tempdats/dp/airfoil.cpp index f4d7f39c3..e1e970fc9 100644 --- a/apps/c/airfoil/airfoil_tempdats/dp/airfoil.cpp +++ b/apps/c/airfoil/airfoil_tempdats/dp/airfoil.cpp @@ -55,6 +55,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4]; // #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -79,7 +80,6 @@ int main(int argc, char **argv) { double rms; // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; // read in grid @@ -214,7 +214,7 @@ int main(int argc, char **argv) { double g_ncell = op_get_size(cells); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); // main time-marching loop @@ -306,9 +306,8 @@ int main(int argc, char **argv) { op_printf("Error: temporary op_dat %s cannot be removed\n", p_qold->name); } - op_timers(&cpu_t2, &wall_t2); - op_timing_output(); - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_end(); + op_profile_output(); op_exit(); } diff --git a/apps/c/airfoil/airfoil_tempdats/dp/airfoil_mpi.cpp b/apps/c/airfoil/airfoil_tempdats/dp/airfoil_mpi.cpp index 50558a8e0..1c16b9152 100644 --- a/apps/c/airfoil/airfoil_tempdats/dp/airfoil_mpi.cpp +++ b/apps/c/airfoil/airfoil_tempdats/dp/airfoil_mpi.cpp @@ -63,6 +63,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4]; #include "op_lib_mpi.h" #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -154,7 +155,6 @@ int main(int argc, char **argv) { MPI_Comm_size(MPI_COMM_WORLD, &comm_size); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; int *becell, *ecell, *bound, *bedge, *edge, *cell; double *x, *q, *qold, *adt, *res; @@ -164,7 +164,7 @@ int main(int argc, char **argv) { /**------------------------BEGIN I/O and PARTITIONING -------------------**/ - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); /* read in grid from disk on root processor */ FILE *fp; @@ -310,9 +310,6 @@ int main(int argc, char **argv) { free(g_res); } - op_timers(&cpu_t2, 
&wall_t2); - op_printf("Max total file read time = %f\n", wall_t2 - wall_t1); - /**------------------------END I/O and PARTITIONING -----------------------**/ // declare sets, pointers, datasets and global constants @@ -360,7 +357,6 @@ int main(int argc, char **argv) { op_partition("PTSCOTCH", "KWAY", cells, pecell, p_x); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); niter = 1000; for (int iter = 1; iter <= niter; iter++) { @@ -446,10 +442,9 @@ int main(int argc, char **argv) { op_printf("Error: temporary op_dat %s cannot be removed\n", p_qold->name); } - op_timers(&cpu_t2, &wall_t2); - op_timing_output(); + op_profile_end(); + op_profile_output(); // print total time for niter interations - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); op_exit(); } diff --git a/apps/c/airfoil/airfoil_tutorial/final/airfoil.cpp b/apps/c/airfoil/airfoil_tutorial/final/airfoil.cpp index fa8e02762..4fb3e1022 100644 --- a/apps/c/airfoil/airfoil_tutorial/final/airfoil.cpp +++ b/apps/c/airfoil/airfoil_tutorial/final/airfoil.cpp @@ -4,6 +4,7 @@ #include #include "op_seq.h" +#include /* Problem mesh and iterations */ #define FILE_NAME_PATH "new_grid.h5" @@ -33,7 +34,6 @@ int main(int argc, char **argv) { double rms; // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; // Load unstructured mesh op_printf("***** Load mesh and initialization *****\n"); @@ -86,7 +86,7 @@ int main(int argc, char **argv) { op_partition("BLOCK", "ANY", edges, pecell, p_x); //start timer - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); // main time-marching loop op_printf("***** Start Main iteration *************\n"); @@ -158,12 +158,8 @@ int main(int argc, char **argv) { } //end timer - op_timers(&cpu_t2, &wall_t2); - - // compute and print wall time - double walltime = wall_t2 - wall_t1; - - op_printf(" Wall time %lf \n", walltime); + op_profile_end(); + op_profile_output(); //Finalising the OP2 library op_exit(); diff --git 
a/apps/c/airfoil/airfoil_tutorial/step6/airfoil_step6.cpp b/apps/c/airfoil/airfoil_tutorial/step6/airfoil_step6.cpp index e7e64af1e..4cbdbb17f 100644 --- a/apps/c/airfoil/airfoil_tutorial/step6/airfoil_step6.cpp +++ b/apps/c/airfoil/airfoil_tutorial/step6/airfoil_step6.cpp @@ -4,6 +4,7 @@ #include #include "op_seq.h" +#include /* Problem mesh and iterations */ #define FILE_NAME_PATH "new_grid.h5" @@ -14,7 +15,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4]; /* wall timer routine */ -// Now done using OP2's internal op_timers() call +// Now done using OP2's internal op_profile API //outlined elemental kernel - save_soln inline void save_soln(const double *q, double *qold) { @@ -150,7 +151,6 @@ int main(int argc, char **argv) { double rms; // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; // Load unstructured mesh op_printf("***** Load mesh and initialization *****\n"); @@ -203,7 +203,7 @@ int main(int argc, char **argv) { op_partition("BLOCK", "ANY", edges, pecell, p_x); //start timer - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); // main time-marching loop op_printf("***** Start Main iteration *************\n"); @@ -275,12 +275,8 @@ int main(int argc, char **argv) { } //end timer - op_timers(&cpu_t2, &wall_t2); - - // compute and print wall time - double walltime = wall_t2 - wall_t1; - - op_printf(" Wall time %lf \n", walltime); + op_profile_end(); + op_profile_output(); //Finalising the OP2 library op_exit(); diff --git a/apps/c/airfoil/airfoil_tutorial/step7/airfoil_step7.cpp b/apps/c/airfoil/airfoil_tutorial/step7/airfoil_step7.cpp index fa8e02762..4fb3e1022 100644 --- a/apps/c/airfoil/airfoil_tutorial/step7/airfoil_step7.cpp +++ b/apps/c/airfoil/airfoil_tutorial/step7/airfoil_step7.cpp @@ -4,6 +4,7 @@ #include #include "op_seq.h" +#include /* Problem mesh and iterations */ #define FILE_NAME_PATH "new_grid.h5" @@ -33,7 +34,6 @@ int main(int argc, char **argv) { double rms; // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; // Load 
unstructured mesh op_printf("***** Load mesh and initialization *****\n"); @@ -86,7 +86,7 @@ int main(int argc, char **argv) { op_partition("BLOCK", "ANY", edges, pecell, p_x); //start timer - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Airfoil"); // main time-marching loop op_printf("***** Start Main iteration *************\n"); @@ -158,12 +158,8 @@ int main(int argc, char **argv) { } //end timer - op_timers(&cpu_t2, &wall_t2); - - // compute and print wall time - double walltime = wall_t2 - wall_t1; - - op_printf(" Wall time %lf \n", walltime); + op_profile_end(); + op_profile_output(); //Finalising the OP2 library op_exit(); diff --git a/apps/c/jac1/dp/jac.cpp b/apps/c/jac1/dp/jac.cpp index 8f5c52057..af84817cc 100644 --- a/apps/c/jac1/dp/jac.cpp +++ b/apps/c/jac1/dp/jac.cpp @@ -56,6 +56,7 @@ double alpha; // #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -80,7 +81,6 @@ int main(int argc, char **argv) { op_init(argc, argv, 5); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; int nnode, nedge, n, e; @@ -153,7 +153,7 @@ int main(int argc, char **argv) { op_diagnostic_output(); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("JAC"); // main iteration loop @@ -181,7 +181,7 @@ int main(int argc, char **argv) { op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / nnode)); } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); // print out results op_printf("\n Results after %d iterations:\n\n", NITER); @@ -203,10 +203,9 @@ int main(int argc, char **argv) { op_printf("\n"); } - op_timing_output(); + op_profile_output(); // print total time for niter interations - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); int result = check_result(u, NN, TOLERANCE); op_exit(); diff --git a/apps/c/jac1/dp/jac_mpi.cpp b/apps/c/jac1/dp/jac_mpi.cpp index 9fea6854a..77cd9e620 100644 --- a/apps/c/jac1/dp/jac_mpi.cpp +++ b/apps/c/jac1/dp/jac_mpi.cpp @@ -63,6 +63,7 @@ double alpha; 
#include "op_lib_mpi.h" #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -151,7 +152,6 @@ int main(int argc, char **argv) { MPI_Comm_size(MPI_COMM_WORLD, &comm_size); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; int *pp; double *A, *r, *u, *du; @@ -274,7 +274,7 @@ int main(int argc, char **argv) { op_partition("PARMETIS", "KWAY", edges, ppedge, NULL); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("JAC"); // main iteration loop @@ -304,7 +304,7 @@ int main(int argc, char **argv) { op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / g_nnode)); } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); // get results data array op_fetch_data(p_u, u); @@ -321,10 +321,9 @@ int main(int argc, char **argv) { printf("\n"); // print each mpi process's timing info for each kernel - op_timing_output(); + op_profile_output(); // print total time for niter interations - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); // gather results from all ranks and check double *ug = (double *)malloc(sizeof(double) * op_get_size(nodes)); diff --git a/apps/c/jac1/longint/jac_mpi.cpp b/apps/c/jac1/longint/jac_mpi.cpp index eae5cd361..8cd312f46 100644 --- a/apps/c/jac1/longint/jac_mpi.cpp +++ b/apps/c/jac1/longint/jac_mpi.cpp @@ -59,6 +59,7 @@ double alpha; #include "op_lib_mpi.h" #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -185,7 +186,6 @@ int main(int argc, char **argv) { MPI_Comm_size(MPI_COMM_WORLD, &comm_size); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; idx_g_t *pp; double *A, *r, *u, *du; @@ -330,7 +330,7 @@ remains consistent. */ op_partition("PARMETIS", "KWAY", edges, ppedge, p_coords); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("JAC"); // main iteration loop @@ -360,7 +360,7 @@ remains consistent. 
*/ op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / (double)(size_t)g_nnode)); } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); // output the result dat array to files // op_print_dat_to_txtfile(p_u, "out_grid_mpi.dat"); // ASCI @@ -374,10 +374,9 @@ remains consistent. */ // printf("\n"); // print each mpi process's timing info for each kernel - op_timing_output(); + op_profile_output(); // print total time for niter interations - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); // fetch local results op_fetch_data(p_u, u); diff --git a/apps/c/jac1/sp/jac.cpp b/apps/c/jac1/sp/jac.cpp index 9551fb290..6deb4b7a2 100644 --- a/apps/c/jac1/sp/jac.cpp +++ b/apps/c/jac1/sp/jac.cpp @@ -56,6 +56,7 @@ float alpha; // #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -80,7 +81,6 @@ int main(int argc, char **argv) { op_init(argc, argv, 5); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; int nnode, nedge, n, e; @@ -153,7 +153,7 @@ int main(int argc, char **argv) { op_diagnostic_output(); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("JAC"); // main iteration loop @@ -177,7 +177,7 @@ int main(int argc, char **argv) { op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / nnode)); } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); // print out results @@ -200,10 +200,9 @@ int main(int argc, char **argv) { op_printf("\n"); } - op_timing_output(); + op_profile_output(); // print total time for niter interations - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); int result = check_result(u, NN, TOLERANCE); op_exit(); diff --git a/apps/c/jac1/sp/jac_mpi.cpp b/apps/c/jac1/sp/jac_mpi.cpp index 575daade7..34e76b9cf 100644 --- a/apps/c/jac1/sp/jac_mpi.cpp +++ b/apps/c/jac1/sp/jac_mpi.cpp @@ -63,6 +63,7 @@ float alpha; #include "op_lib_mpi.h" #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -150,7 +151,6 @@ int main(int argc, char **argv) { 
MPI_Comm_size(MPI_COMM_WORLD, &comm_size); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; int *pp; float *A, *r, *u, *du; @@ -273,7 +273,7 @@ int main(int argc, char **argv) { op_partition("PTSCOTCH", "KWAY", NULL, NULL, NULL); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("JAC"); // main iteration loop @@ -298,7 +298,7 @@ int main(int argc, char **argv) { op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / g_nnode)); } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); // get results data array op_fetch_data(p_u, u); @@ -315,10 +315,9 @@ int main(int argc, char **argv) { printf("\n"); // print each mpi process's timing info for each kernel - op_timing_output(); + op_profile_output(); // print total time for niter interations - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); // gather results from all ranks and check float *ug = (float *)malloc(sizeof(float) * op_get_size(nodes)); diff --git a/apps/c/jac2/jac.cpp b/apps/c/jac2/jac.cpp index f7a2cc918..aeed884a8 100644 --- a/apps/c/jac2/jac.cpp +++ b/apps/c/jac2/jac.cpp @@ -52,6 +52,7 @@ float alpha; // #include "op_seq.h" +#include // jac header file @@ -81,7 +82,6 @@ int main(int argc, char **argv) { op_init(argc, argv, 5); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; int nnode, nedge, n, e; float dx; @@ -171,7 +171,7 @@ int main(int argc, char **argv) { op_diagnostic_output(); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("JAC2"); // main iteration loop @@ -195,7 +195,7 @@ int main(int argc, char **argv) { op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / nnode)); } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); // print out results op_printf("\n Results after %d iterations:\n\n", NITER); @@ -218,10 +218,9 @@ int main(int argc, char **argv) { op_printf("\n"); } - op_timing_output(); + op_profile_output(); // print total time for niter interations - 
op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); int result = check_result(u, NN, TOLERANCE); op_exit(); diff --git a/apps/c/jac2/jac_mpi.cpp b/apps/c/jac2/jac_mpi.cpp index d8941a611..21c9431e4 100644 --- a/apps/c/jac2/jac_mpi.cpp +++ b/apps/c/jac2/jac_mpi.cpp @@ -57,6 +57,7 @@ float alpha; #include "op_lib_mpi.h" #include "op_seq.h" +#include // jac header file @@ -171,7 +172,6 @@ int main(int argc, char **argv) { MPI_Comm_size(MPI_COMM_WORLD, &comm_size); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; /**------------------------BEGIN I/O and PARTITIONING ---------------------**/ int g_nnode, g_nedge, g_n, g_e; @@ -298,7 +298,7 @@ int main(int argc, char **argv) { op_partition("PTSCOTCH", "KWAY", NULL, NULL, NULL); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + op_profile_start("JAC2"); // main iteration loop @@ -322,15 +322,14 @@ int main(int argc, char **argv) { op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / nnode)); } - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); op_fetch_data(p_u, u); op_print_dat_to_txtfile(p_u, "out_grid_mpi.dat"); - op_timing_output(); + op_profile_output(); // print total time for niter interations - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); float *ug = (float *)malloc(sizeof(float) * op_get_size(nodes) * 2); op_fetch_data_idx(p_u, ug, 0, op_get_size(nodes) - 1); diff --git a/apps/c/reduction/reduction.cpp b/apps/c/reduction/reduction.cpp index f3d3f804f..5eebfe9fe 100644 --- a/apps/c/reduction/reduction.cpp +++ b/apps/c/reduction/reduction.cpp @@ -48,6 +48,7 @@ // #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -68,7 +69,6 @@ int main(int argc, char **argv) { int nnode, ncell, nedge, nbedge; // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; // read in airfoil grid @@ -145,7 +145,7 @@ int main(int argc, char **argv) { op_diagnostic_output(); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); + 
op_profile_start("Reduction"); // indirect reduction count1 = 0; @@ -173,10 +173,8 @@ int main(int argc, char **argv) { else op_printf("Reduction application FAILED\n"); - op_timers(&cpu_t2, &wall_t2); - op_timing_output(); - - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_end(); + op_profile_output(); op_exit(); diff --git a/apps/c/reduction/reduction_mpi.cpp b/apps/c/reduction/reduction_mpi.cpp index 7918bca4d..eeb2ba44f 100644 --- a/apps/c/reduction/reduction_mpi.cpp +++ b/apps/c/reduction/reduction_mpi.cpp @@ -51,6 +51,7 @@ #include "op_lib_mpi.h" #include "op_seq.h" +#include // // kernel routines for parallel loops @@ -139,7 +140,6 @@ int main(int argc, char **argv) { MPI_Comm_size(MPI_COMM_WORLD, &comm_size); // timer - double cpu_t1, cpu_t2, wall_t1, wall_t2; int *becell, *ecell, *bound, *bedge, *edge, *cell; double *x, *q, *qold, *adt, *res; @@ -148,7 +148,7 @@ int main(int argc, char **argv) { /**------------------------BEGIN I/O and PARTITIONING -------------------**/ - op_timers(&cpu_t1, &wall_t1); + op_profile_start("Reduction"); /* read in grid from disk on root processor */ FILE *fp; @@ -267,9 +267,6 @@ int main(int argc, char **argv) { free(g_res); } - op_timers(&cpu_t2, &wall_t2); - op_printf("Max total file read time = %f\n", wall_t2 - wall_t1); - /**------------------------END I/O and PARTITIONING -----------------------**/ op_set edges = op_decl_set(nedge, "edges"); @@ -286,7 +283,6 @@ int main(int argc, char **argv) { op_diagnostic_output(); // initialise timers for total execution wall time - op_timers(&cpu_t1, &wall_t1); // indirect reduction count = 0; @@ -309,10 +305,9 @@ int main(int argc, char **argv) { else op_printf("direct reduction PASSED\n"); - op_timers(&cpu_t2, &wall_t2); + op_profile_end(); - op_timing_output(); - op_printf("Max total runtime = %f\n", wall_t2 - wall_t1); + op_profile_output(); op_exit(); diff --git a/apps/fortran/airfoil/airfoil.F90 b/apps/fortran/airfoil/airfoil.F90 index 
db2d47fba..d7d160fae 100644 --- a/apps/fortran/airfoil/airfoil.F90 +++ b/apps/fortran/airfoil/airfoil.F90 @@ -63,7 +63,7 @@ program airfoil #endif call op_init_base(0, 0) - call op_timing2_start("Airfoil") + call op_profile_start("Airfoil") #ifdef HDF5 call op_print("Declaring OP2 sets (HDF5)") @@ -121,7 +121,7 @@ program airfoil call op_decl_const(qinf, 4, "real(8)") call op_partition("PARMETIS", "KWAY", edges, pecell, p_x) - call op_timing2_enter("Main computation") + call op_profile_enter("Main computation") call op_decl_dat_temp(cells, 4, "real(8)", p_res, "p_res") @@ -184,10 +184,10 @@ program airfoil iter = op_free_dat_temp(p_res) - call op_timing2_finish() + call op_profile_end() if (op_is_root() == 1) print * - call op_timing2_output() + call op_profile_output() if (op_is_root() == 1 .and. niter == 1000 .and. ncell_total == 720000) then diff = abs((100.0_8 * (rms(2) / 0.0001060114637578_8)) - 100.0_8) diff --git a/apps/fortran/jac1/jac.F90 b/apps/fortran/jac1/jac.F90 index 5ff5f4985..33fe0b7ac 100644 --- a/apps/fortran/jac1/jac.F90 +++ b/apps/fortran/jac1/jac.F90 @@ -58,7 +58,7 @@ program jac alpha = 1.0 call op_decl_const(alpha, 1, "real(8)") - call op_timing2_start("JAC") + call op_profile_start("JAC") beta = 1.0 @@ -82,10 +82,10 @@ program jac write (*, "(1X, A, F7.4, A, F10.8)") "u max = ", u_max, "; u rms = ", sqrt(u_sum / nnode) end do - call op_timing2_finish() + call op_profile_end() print * - call op_timing2_output() + call op_profile_output() allocate(u(nnode)) call op_fetch_data(p_u, u) diff --git a/apps/fortran/jac1_long/jac1_mpi.F90 b/apps/fortran/jac1_long/jac1_mpi.F90 index 3182e53dc..9c19bb380 100644 --- a/apps/fortran/jac1_long/jac1_mpi.F90 +++ b/apps/fortran/jac1_long/jac1_mpi.F90 @@ -191,8 +191,8 @@ program jac_distributed !-------------------------------------------------------------------------- ! 5. 
Main Iteration Loop !-------------------------------------------------------------------------- - call op_timing2_start("Jacobi") - call op_timing2_enter("Main computation") ! Start timing after setup/partitioning + call op_profile_start("Jacobi") + call op_profile_enter("Main computation") ! Start timing after setup/partitioning beta = 1.0_8 @@ -227,12 +227,12 @@ program jac_distributed end if end do - call op_timing2_finish() ! Stop timing + call op_profile_end() ! Stop timing !-------------------------------------------------------------------------- ! 6. Output Timings and Fetch Results !-------------------------------------------------------------------------- - call op_timing2_output() + call op_profile_output() ! Re-allocate u if it was deallocated earlier, or just use the existing one ! Ensure 'u' is allocated with the correct *local* size 'nnode' diff --git a/apps/fortran/reduction/reduction.F90 b/apps/fortran/reduction/reduction.F90 index 32fac0cec..3709e641b 100644 --- a/apps/fortran/reduction/reduction.F90 +++ b/apps/fortran/reduction/reduction.F90 @@ -29,8 +29,6 @@ program reduction type(op_map) :: pecell type(op_dat) :: p_res, p_dummy - real(kind = c_double) :: start_time, end_time - integer(4) :: i, cell_count_result, edge_count_result #ifndef HDF5 @@ -41,7 +39,7 @@ program reduction #endif call op_init_base(0, 0) - call op_timing2_start("Reduction") + call op_profile_start("Reduction") #ifndef HDF5 open(file_id, file = file_name) @@ -90,7 +88,6 @@ program reduction #endif call op_partition("PTSCOTCH", "KWAY", edges, pecell, p_dummy) - call op_timers(start_time) ncell_total = op_get_size(cells) nedge_total = op_get_size(edges) @@ -106,8 +103,6 @@ program reduction op_arg_dat(p_res, 1, pecell, 4, "real(8)", OP_RW), & op_arg_gbl(edge_count_result, 1, "integer(4)", OP_INC)) - call op_timers(end_time) - call op_timing_output() if (op_is_root() == 1) then print * @@ -122,13 +117,12 @@ program reduction end if print * - print *, 'Time = ', end_time - 
start_time, 'seconds' end if - call op_timing2_finish() + call op_profile_end() if (op_is_root() == 1) print * - call op_timing2_output() + call op_profile_output() call op_exit() diff --git a/docs/api.rst b/docs/api.rst index 96958cdaa..95e20e1b4 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -445,6 +445,67 @@ Other I/O and Utilities :param file_name: The name of the CSV file to write. +Profiling API +^^^^^^^^^^^^^ + +The ``op_profile`` API (``op2/include/op_profile.h``) provides a tree-based timing and instrumentation facility for OP2 applications. It replaces the older ``op_timers`` / ``op_timing_output`` pattern with a structured, hierarchical timing tree that can be printed to stdout or exported to JSON. + +The translator-generated kernel code automatically inserts ``op_profile_enter_kernel`` calls; application code uses the simpler ``op_profile_start`` / ``op_profile_enter`` / ``op_profile_end`` functions to time outer sections. + +Typical usage:: + + op_profile_start("MyApp"); // initialise, name the root + op_profile_enter("Setup"); // begin a named section + /* ... setup work ... */ + op_profile_next("Computation"); // exit "Setup", enter "Computation" + /* ... main loop ... */ + op_profile_end(); // close all open sections + op_profile_output(); // print summary; write JSON if OP_PROFILE_JSON_OUTPUT is set + +.. c:function:: void op_profile_start(const char *name) + + Initialise the profiling system and set the root application name. Must be called before any other ``op_profile_*`` function. Reads ``OP_PROFILE_LEVEL`` from the environment at this point. + + :param name: A descriptive name for the application (used as the root label in the timing tree). + +.. c:function:: void op_profile_enter(const char *name) + + Begin a named timing section, pushing it onto the timing stack. Device synchronisation is performed before the timer starts (for accurate GPU timings). + + :param name: A descriptive name for the section. + +.. 
c:function:: void op_profile_enter_kernel(const char *name, const char *target, const char *variant) + + Enter a kernel-level timing section. Called automatically by translator-generated kernel code; do not call manually in application code. + + :param name: Kernel name. + :param target: Backend target string (e.g. ``"cuda"``, ``"openmp"``). + :param variant: Kernel variant string. + +.. c:function:: void op_profile_next(const char *name) + + Convenience helper that closes the current section (as :c:func:`op_profile_exit` does) and immediately opens a new section with the given name. Unlike a separate :c:func:`op_profile_enter` call, no additional device synchronisation is performed when the new section is entered. + + :param name: The name of the next section to enter. + +.. c:function:: void op_profile_exit() + + Close the innermost open timing section, recording elapsed time. + +.. c:function:: void op_profile_end() + + End profiling, closing all remaining open sections. + +.. c:function:: void op_profile_output() + + Pretty-print the timing tree to stdout. If the ``OP_PROFILE_JSON_OUTPUT`` environment variable is set, also writes the tree to that file in JSON format. For MPI builds, timing trees are collected from all ranks and combined before printing. + +.. c:function:: void op_profile_output_json(const char *filename) + + Write the timing tree to *filename* in JSON format. For MPI builds, trees from all ranks are combined first. + + :param filename: Path to the output JSON file. + .. c:function:: void op_diagnostic_output() This routine prints diagnostics relating to sets, mappings and datasets. @@ -468,6 +529,10 @@ The following environment variables can be set at run time to control OP2 behavi - Integer. Caps the maximum number of OpenMP threads used inside JIT-compiled (``c_cuda`` / ``c_hip``) host loops. Useful for tuning thread counts independently of ``OMP_NUM_THREADS``. * - ``OP_FALLBACK_MODE`` - Set to ``warn`` or ``error``. 
When the translator emits a fallback sequential kernel for a loop it cannot fully parallelise, ``warn`` prints a warning at runtime when the fallback executes; ``error`` aborts. Unset by default (fallback runs silently). + * - ``OP_PROFILE_LEVEL`` + - Controls the detail level of the ``op_profile`` timing tree. ``0`` disables profiling entirely. ``1`` times only user-defined outer sections (no kernel timing). ``2`` adds whole-kernel timings (default). ``3`` includes detailed in-kernel section timings generated by the translator. + * - ``OP_PROFILE_JSON_OUTPUT`` + - If set to a file path, :c:func:`op_profile_output` also writes the timing tree to that file in JSON format, in addition to printing the summary (:c:func:`op_profile_output_json` writes to its explicit *filename* argument instead). ---- @@ -629,8 +694,8 @@ Fortran application variants are prefixed with ``f_`` in the Make build system: For the translator invocation for Fortran sources, see :doc:`translator`. -Timers -^^^^^^ +Timers and Profiling +^^^^^^^^^^^^^^^^^^^^ .. code-block:: fortran @@ -643,3 +708,17 @@ Timers .. note:: Unlike the C/C++ :c:func:`op_timers`, the Fortran version takes only one argument (the wall-clock time ``et``); the ``cpu`` argument is omitted. + +The ``op_profile`` profiling API is also available in Fortran with identical semantics to the C/C++ interface: + +.. code-block:: fortran + + call op_profile_start("MyApp") + call op_profile_enter("Setup") + ! ... setup work ... + call op_profile_next("Computation") + ! ... main loop ... + call op_profile_end() + call op_profile_output() + +The available Fortran subroutines mirror the C API: ``op_profile_start``, ``op_profile_enter``, ``op_profile_enter_kernel``, ``op_profile_next``, ``op_profile_exit``, ``op_profile_end``, ``op_profile_output``, and ``op_profile_output_json``. The ``OP_PROFILE_LEVEL`` and ``OP_PROFILE_JSON_OUTPUT`` environment variables apply in Fortran builds identically to C/C++ builds. 
diff --git a/op2/Makefile b/op2/Makefile index 648197902..9f670d7a5 100644 --- a/op2/Makefile +++ b/op2/Makefile @@ -111,7 +111,7 @@ OP2_FOR_HDF5 := $(OP2_HDF5) $(addprefix $(OBJ)/fortran/,\ OP2_SEQ := $(OP2_BASE) $(addprefix $(OBJ)/,\ core/op_dummy_singlenode.o \ sequential/op_seq.o \ - externlib/op_timing2.o) + externlib/op_profile.o) OP2_FOR_SEQ := $(OP2_SEQ) $(OP2_FOR_BASE) $(addprefix $(OBJ)/fortran/,\ op_dummy_wrappers.o) @@ -120,13 +120,13 @@ OP2_CUDA := $(OP2_BASE) $(addprefix $(OBJ)/,\ cuda/op_cuda_decl+cuda.o \ cuda/op_cuda_rt_support+cuda.o \ cuda/op2_cuda_rt_wrappers+cuda.o \ - externlib/op_timing2.o) + externlib/op_profile.o) OP2_HIP := $(OP2_BASE) $(addprefix $(OBJ)/,\ cuda/op_cuda_decl+hip.o \ cuda/op_cuda_rt_support+hip.o \ cuda/op2_cuda_rt_wrappers+hip.o \ - externlib/op_timing2.o) + externlib/op_profile.o) OP2_FOR_CUDA := $(OP2_CUDA) $(OP2_FOR_BASE_CUDA) $(addprefix $(OBJ)/fortran/,\ cudaConfigurationParams.o) @@ -137,7 +137,7 @@ OP2_FOR_HIP := $(OP2_HIP) $(OP2_FOR_BASE) $(addprefix $(OBJ)/fortran/,\ OP2_OPENMP := $(OP2_BASE) $(addprefix $(OBJ)/,\ core/op_dummy_singlenode.o \ openmp/op_openmp_decl.o \ - externlib/op_timing2.o) + externlib/op_profile.o) OP2_FOR_OPENMP := $(OP2_OPENMP) $(OP2_FOR_BASE) $(addprefix $(OBJ)/fortran/,\ op_dummy_wrappers.o) @@ -145,7 +145,7 @@ OP2_FOR_OPENMP := $(OP2_OPENMP) $(OP2_FOR_BASE) $(addprefix $(OBJ)/fortran/,\ OP2_OPENMP4 := $(OP2_BASE) $(addprefix $(OBJ)/,\ openmp4/op_openmp4_decl.o \ openmp4/op_openmp4_rt_support.o \ - externlib/op_timing2.o) + externlib/op_profile.o) OP2_FOR_OPENMP4 := $(OP2_OPENMP4) $(OP2_FOR_BASE) $(addprefix $(OBJ)/fortran/,\ op_dummy_wrappers.o) @@ -159,7 +159,7 @@ OP2_MPI := $(OP2_BASE) $(addprefix $(OBJ)/,\ mpi/op_mpi_util.o \ externlib/op_util.o \ externlib/op_renumber.o \ - externlib/op_timing2+mpi.o) + externlib/op_profile+mpi.o) OP2_FOR_MPI := $(OP2_MPI) $(OP2_FOR_BASE_MPI) $(addprefix $(OBJ)/fortran/,\ op_dummy_wrappers+mpi.o) @@ -176,7 +176,7 @@ OP2_MPI_CUDA := $(OP2_BASE) 
$(addprefix $(OBJ)/,\ mpi/op_mpi_util.o \ externlib/op_util.o \ externlib/op_renumber.o \ - externlib/op_timing2+mpi.o) + externlib/op_profile+mpi.o) OP2_MPI_HIP := $(OP2_BASE) $(addprefix $(OBJ)/,\ cuda/op_cuda_rt_support+mpi+hip.o \ @@ -190,7 +190,7 @@ OP2_MPI_HIP := $(OP2_BASE) $(addprefix $(OBJ)/,\ mpi/op_mpi_util.o \ externlib/op_util.o \ externlib/op_renumber.o \ - externlib/op_timing2+mpi.o) + externlib/op_profile+mpi.o) OP2_FOR_MPI_CUDA := $(OP2_MPI_CUDA) $(OP2_FOR_BASE_MPI_CUDA) $(addprefix $(OBJ)/fortran/,\ cudaConfigurationParams.o) diff --git a/op2/include/op_f2c_helpers.h b/op2/include/op_f2c_helpers.h index 077597826..fbf8888f8 100644 --- a/op2/include/op_f2c_helpers.h +++ b/op2/include/op_f2c_helpers.h @@ -1,7 +1,7 @@ #pragma once #include -// #include +#include #include #include @@ -528,13 +528,13 @@ class KernelInfo { void invoke(JitKernel *kernel, int num_blocks, int block_size, void **args, void **args_jit) { if (kernel == nullptr) { - // op_timing2_next("Offline Kernel"); + op_profile_next("Offline Kernel"); invoke_offline(num_blocks, block_size, args); return; } - // op_timing2_next("JIT Kernel"); + op_profile_next("JIT Kernel"); kernel->invoke(num_blocks, block_size, args_jit); } }; diff --git a/op2/include/op_timing2.h b/op2/include/op_profile.h similarity index 66% rename from op2/include/op_timing2.h rename to op2/include/op_profile.h index b9be64cdf..14ba32a3c 100644 --- a/op2/include/op_timing2.h +++ b/op2/include/op_profile.h @@ -1,5 +1,5 @@ -#ifndef __OP_TIMING2_H -#define __OP_TIMING2_H +#ifndef __OP_PROFILE_H +#define __OP_PROFILE_H #include @@ -13,34 +13,34 @@ using json = nlohmann::json; /* - * Tree-based timing code for instrumentation of OP2 applications. See the op_timing2 class (and its C API) for the + * Tree-based timing code for instrumentation of OP2 applications. See the op_profile class (and its C API) for the * public API methods to use. 
* * The OP2 code-generator will generate calls in kernel code for _enter_kernel() and sections inside. * * Expected usage from application: - * - op_timing2_start(name) - * - op_timing2_enter(name) (Optional - use to time application sections and separate out kernels) - * - op_timing2_exit() + * - op_profile_start(name) + * - op_profile_enter(name) (Optional - use to time application sections and separate out kernels) + * - op_profile_exit() * - ... - * - op_timing2_finish() - * - op_timing2_output() + * - op_profile_end() + * - op_profile_output() * * Two environment variables can be used: - * - OP_TIMING2_LEVEL={0,1,2,3} - Set the timing detail level + * - OP_PROFILE_LEVEL={0,1,2,3} - Set the timing detail level * - 0: Disabled * - 1: Only time outer sections from _enter() in the application code (no kernel timing) * - 2: Time outer sections from application code and overall kernel timings (no in-kernel timing) * - 3: Time outer sections and detailed kernel timings (from code-generated _enter() sections) * - * - OP_TIMING2_JSON_OUTPUT= - Output the timing tree to the specified file in JSON format during the - * call to op_timing2_output(). + * - OP_PROFILE_JSON_OUTPUT= - Output the timing tree to the specified file in JSON format during the + * call to op_profile_output(). 
*/ -/* ----------------------------------------- op_timing2_clock ----------------------------------------- */ +/* ----------------------------------------- op_profile_clock ----------------------------------------- */ /* Helper struct to hold the timing information for each tree node */ -struct op_timing2_clock { +struct op_profile_clock { using clock = std::chrono::high_resolution_clock; std::size_t n = 0; @@ -50,75 +50,75 @@ struct op_timing2_clock { clock::duration max; void submit(clock::duration duration); - op_timing2_clock& operator+=(const op_timing2_clock& other); + op_profile_clock& operator+=(const op_profile_clock& other); clock::duration average() const { return total / n; } }; /* Format a duration auto-selecting units (s, ms, us, ns) into the specified string width */ -std::string format_duration(const op_timing2_clock::clock::duration& d, unsigned width = 7); +std::string format_duration(const op_profile_clock::clock::duration& d, unsigned width = 7); -/* Convert a op_timing2_clock to a string, optionally with a parent clock reference that is used to calculate a +/* Convert a op_profile_clock to a string, optionally with a parent clock reference that is used to calculate a * time percentage. The parent needs not be the actual parent of the node in the timing tree. 
*/ -std::string to_string(const op_timing2_clock& clock, - const std::optional> parent = std::nullopt); +std::string to_string(const op_profile_clock& clock, + const std::optional> parent = std::nullopt); /* JSON conversion implementations */ -void to_json(json& j, const op_timing2_clock& clock); -void from_json(const json& j, op_timing2_clock& clock); +void to_json(json& j, const op_profile_clock& clock); +void from_json(const json& j, op_profile_clock& clock); -/* ----------------------------------------- op_timing2_node ----------------------------------------- */ +/* ----------------------------------------- op_profile_node ----------------------------------------- */ /* Node type for the timing tree nodes */ -enum class op_timing2_node_type { standard, kernel }; +enum class op_profile_node_type { standard, kernel }; /* Timing tree node */ -struct op_timing2_node { +struct op_profile_node { std::string name; - op_timing2_node_type type = op_timing2_node_type::standard; - op_timing2_clock clock; + op_profile_node_type type = op_profile_node_type::standard; + op_profile_clock clock; std::size_t num_ranks = 1; - std::vector children; + std::vector children; - op_timing2_node(): name{"unknown"} {} - op_timing2_node(std::string_view name): name{name} {} + op_profile_node(): name{"unknown"} {} + op_profile_node(std::string_view name): name{name} {} /* Check if the node has an immediate child of the given name (and type) */ bool has_child(std::string_view name, - std::optional child_type = std::nullopt); + std::optional child_type = std::nullopt); /* Get an immediate child of the node with the given name (and type). This will create a new node if a match is * not found. 
Child names must be unique, regardless of type */ - op_timing2_node& get_child(std::string_view name, - std::optional child_type = std::nullopt); + op_profile_node& get_child(std::string_view name, + std::optional child_type = std::nullopt); /* Get a child of the node with the given scope path (and type). The scope must contain at least one name, and * children will be created on demand if they do not exist. */ - op_timing2_node& get_child(std::vector scope, - std::optional child_type = std::nullopt); + op_profile_node& get_child(std::vector scope, + std::optional child_type = std::nullopt); /* Combine with another node, probably from another MPI rank, adding the timing statistics together. Children * present in the second node will be created and combined into this node */ - op_timing2_node& operator+=(const op_timing2_node& other); + op_profile_node& operator+=(const op_profile_node& other); /* Pretty-print the node to stdout, with an optional parent for time percentage output from the clocks. 
The parent * needs not be the node's actual parent */ void output(unsigned indent = 0, - const std::optional> parent = std::nullopt); + const std::optional> parent = std::nullopt); }; /* JSON conversion implementations */ -void to_json(json& j, const op_timing2_node& node); -void from_json(const json& j, op_timing2_node& node); +void to_json(json& j, const op_profile_node& node); +void from_json(const json& j, op_profile_node& node); -/* ----------------------------------------- op_timing2 ----------------------------------------- */ +/* ----------------------------------------- op_profile ----------------------------------------- */ /* Timing detail level, set before timing init/start */ -enum class op_timing2_level { +enum class op_profile_level { disabled = 0, // No timing simple, // Only user-defined outer sections, doesn't time kernels kernel, // Includes whole-kernel timing, no in-kernel sections (default) @@ -126,15 +126,15 @@ enum class op_timing2_level { }; /* The timing class, instantiated as a singleton "timing" - interact through that */ -class op_timing2 { +class op_profile { private: - op_timing2_level level = op_timing2_level::simple; + op_profile_level level = op_profile_level::simple; - std::vector> current_scope; - std::vector current_starts; + std::vector> current_scope; + std::vector current_starts; unsigned extra_depth = 0; // Depth into sections not enabled by the current level - op_timing2_node root; + op_profile_node root; bool started = false; @@ -143,10 +143,10 @@ class op_timing2 { public: /* Returns the singleton timing instance */ - static op_timing2& instance(); + static op_profile& instance(); - /* Sets the timing level (only before calling start()) - see op_timing2_level */ - void set_level(const op_timing2_level new_level); + /* Sets the timing level (only before calling start()) - see op_profile_level */ + void set_level(const op_profile_level new_level); /* Initialises timing (with application name), required unless level set to 
disabled */ void start(std::string_view name); @@ -166,7 +166,7 @@ class op_timing2 { /* End timing, closing all remaining open sections. */ void finish(); - /* Pretty-print timing statistics, and output JSON to OP_TIMING2_JSON_OUTPUT if it's defined. The timing trees + /* Pretty-print timing statistics, and output JSON to OP_PROFILE_JSON_OUTPUT if it's defined. The timing trees * will be combined across MPI ranks the first time this is called if needed. */ void output(); @@ -184,7 +184,7 @@ class op_timing2 { void print_summary(); /* Pretty-prints the non-kernel nodes, accumulating a list of nodes that have immediate kernel children */ - void print_walk_non_kernel(const op_timing2_node& node, + void print_walk_non_kernel(const op_profile_node& node, const std::vector& parent_path, std::vector>& nodes_with_kernels, unsigned indent = 0); @@ -193,21 +193,21 @@ class op_timing2 { void print_kernel_summary(const std::vector& path, unsigned longest_name); }; -/* C/Fortran timing API functions wrapping the public op_timing2 methods */ +/* C/Fortran timing API functions wrapping the public op_profile methods */ extern "C" { -void op_timing2_start(const char* name); +void op_profile_start(const char* name); -void op_timing2_enter(const char* name); -void op_timing2_enter_kernel(const char* name, const char* target, const char* variant); +void op_profile_enter(const char* name); +void op_profile_enter_kernel(const char* name, const char* target, const char* variant); -void op_timing2_next(const char* name); +void op_profile_next(const char* name); -void op_timing2_exit(); -void op_timing2_finish(); +void op_profile_exit(); +void op_profile_end(); -void op_timing2_output(); -void op_timing2_output_json(const char* filename); +void op_profile_output(); +void op_profile_output_json(const char* filename); } diff --git a/op2/src/externlib/op_timing2.cpp b/op2/src/externlib/op_profile.cpp similarity index 74% rename from op2/src/externlib/op_timing2.cpp rename to 
op2/src/externlib/op_profile.cpp index 4d4e4a2cc..66a954781 100644 --- a/op2/src/externlib/op_timing2.cpp +++ b/op2/src/externlib/op_profile.cpp @@ -1,4 +1,4 @@ -#include +#include #include #ifdef OPMPI @@ -26,9 +26,9 @@ namespace nlohmann { }; } -/* ----------------------------------------- op_timing2_clock ----------------------------------------- */ +/* ----------------------------------------- op_profile_clock ----------------------------------------- */ -void op_timing2_clock::submit(op_timing2_clock::clock::duration duration) { +void op_profile_clock::submit(op_profile_clock::clock::duration duration) { ++n; if (n == 1) { @@ -45,7 +45,7 @@ void op_timing2_clock::submit(op_timing2_clock::clock::duration duration) { if (duration > max) max = duration; } -op_timing2_clock& op_timing2_clock::operator+=(const op_timing2_clock& other) { +op_profile_clock& op_profile_clock::operator+=(const op_profile_clock& other) { n += other.n; total += other.total; @@ -56,7 +56,7 @@ op_timing2_clock& op_timing2_clock::operator+=(const op_timing2_clock& other) { return *this; } -std::string format_duration(const op_timing2_clock::clock::duration& d, unsigned width) { +std::string format_duration(const op_profile_clock::clock::duration& d, unsigned width) { const char *unit[4] = {"s", "ms", "us", "ns"}; const int unit_width[4] {1, 2, 2, 2}; @@ -96,8 +96,8 @@ std::string format_duration(const op_timing2_clock::clock::duration& d, unsigned return std::string(output); } -std::string to_string(const op_timing2_clock& clock, - const std::optional> parent) { +std::string to_string(const op_profile_clock& clock, + const std::optional> parent) { std::ostringstream oss; oss << "*" << clock.n << " total: " << format_duration(clock.total); @@ -120,7 +120,7 @@ std::string to_string(const op_timing2_clock& clock, return oss.str(); } -void to_json(json& j, const op_timing2_clock& clock) { +void to_json(json& j, const op_profile_clock& clock) { j = json{ {"n", clock.n}, {"total", clock.total}, 
@@ -129,16 +129,16 @@ void to_json(json& j, const op_timing2_clock& clock) { }; } -void from_json(const json& j, op_timing2_clock& clock) { +void from_json(const json& j, op_profile_clock& clock) { j.at("n").get_to(clock.n); j.at("total").get_to(clock.total); j.at("min").get_to(clock.min); j.at("max").get_to(clock.max); } -/* ----------------------------------------- op_timing2_node ----------------------------------------- */ +/* ----------------------------------------- op_profile_node ----------------------------------------- */ -bool op_timing2_node::has_child(std::string_view name, std::optional child_type) { +bool op_profile_node::has_child(std::string_view name, std::optional child_type) { for (auto& child: children) { if (child.name == name) { if (child_type.has_value()) assert(child.type == *child_type); @@ -149,7 +149,7 @@ bool op_timing2_node::has_child(std::string_view name, std::optional child_type) { +op_profile_node& op_profile_node::get_child(std::string_view name, std::optional child_type) { for (auto& child: children) { if (child.name == name) { if (child_type.has_value()) assert(child.type == *child_type); @@ -158,14 +158,14 @@ op_timing2_node& op_timing2_node::get_child(std::string_view name, std::optional } // Create the child if we don't have one already, setting the type if it was provided - children.push_back(op_timing2_node(name)); + children.push_back(op_profile_node(name)); if (child_type.has_value()) children.back().type = *child_type; return children.back(); } -op_timing2_node& op_timing2_node::get_child(std::vector scope, - std::optional child_type) { +op_profile_node& op_profile_node::get_child(std::vector scope, + std::optional child_type) { assert(scope.size() >= 1); // Recursively call get_child popping the first element off the scope @@ -173,7 +173,7 @@ op_timing2_node& op_timing2_node::get_child(std::vector scope, return get_child(scope[0]).get_child(std::vector(scope.begin() + 1, scope.end()), child_type); } -op_timing2_node& 
op_timing2_node::operator+=(const op_timing2_node& other) { +op_profile_node& op_profile_node::operator+=(const op_profile_node& other) { clock += other.clock; num_ranks += other.num_ranks; @@ -184,8 +184,8 @@ op_timing2_node& op_timing2_node::operator+=(const op_timing2_node& other) { return *this; } -void op_timing2_node::output(unsigned indent, - const std::optional> parent) { +void op_profile_node::output(unsigned indent, + const std::optional> parent) { std::printf("%*s%s %s\n", indent, "", name.c_str(), to_string(clock, parent.has_value() ? std::optional(parent->get().clock) : std::nullopt).c_str()); @@ -193,7 +193,7 @@ void op_timing2_node::output(unsigned indent, child.output(indent + 4, parent.has_value() ? parent : *this); } -void to_json(json& j, const op_timing2_node& node) { +void to_json(json& j, const op_profile_node& node) { j = json{ {"name", node.name}, {"type", node.type}, @@ -203,7 +203,7 @@ void to_json(json& j, const op_timing2_node& node) { }; } -void from_json(const json& j, op_timing2_node& node) { +void from_json(const json& j, op_profile_node& node) { j.at("name").get_to(node.name); j.at("type").get_to(node.type); j.at("clock").get_to(node.clock); @@ -211,23 +211,23 @@ void from_json(const json& j, op_timing2_node& node) { j.at("children").get_to(node.children); } -/* ----------------------------------------- op_timing2 ----------------------------------------- */ +/* ----------------------------------------- op_profile ----------------------------------------- */ -op_timing2& op_timing2::instance() { - static auto timing = op_timing2{}; +op_profile& op_profile::instance() { + static auto timing = op_profile{}; return timing; } -void op_timing2::set_level(op_timing2_level new_level) { +void op_profile::set_level(op_profile_level new_level) { assert(!started); level = new_level; } -void op_timing2::start(std::string_view name) { +void op_profile::start(std::string_view name) { assert(!started); assert(current_scope.size() == 0); - char 
*level_str = getenv("OP_TIMING2_LEVEL"); + char *level_str = getenv("OP_PROFILE_LEVEL"); if (level_str != nullptr) { int level_int = -1; @@ -235,33 +235,33 @@ void op_timing2::start(std::string_view name) { level_int = std::stoi(level_str); } catch (...) {}; - if (level_int < 0 || level_int > static_cast(op_timing2_level::kernel_detailed)) - std::printf("warning: OP_TIMING2_LEVEL set to unsupported value: %s\n", level_str); + if (level_int < 0 || level_int > static_cast(op_profile_level::kernel_detailed)) + std::printf("warning: OP_PROFILE_LEVEL set to unsupported value: %s\n", level_str); else - level = static_cast(level_int); + level = static_cast(level_int); } - if (level == op_timing2_level::disabled) return; + if (level == op_profile_level::disabled) return; started = true; deviceSync(); - root = op_timing2_node(name); + root = op_profile_node(name); current_scope.push_back(root); - current_starts.push_back(op_timing2_clock::clock::now()); + current_starts.push_back(op_profile_clock::clock::now()); } -void op_timing2::enter(std::string_view name, bool sync) { - if (level == op_timing2_level::disabled) return; +void op_profile::enter(std::string_view name, bool sync) { + if (level == op_profile_level::disabled) return; assert(started && !finished); assert(current_scope.size() > 0); // Check if we should actually start a timer if (extra_depth > 0 || - (level < op_timing2_level::kernel_detailed && - current_scope.back().get().type == op_timing2_node_type::kernel)) { + (level < op_profile_level::kernel_detailed && + current_scope.back().get().type == op_profile_node_type::kernel)) { extra_depth++; return; } @@ -272,16 +272,16 @@ void op_timing2::enter(std::string_view name, bool sync) { if (sync) deviceSync(); current_scope.push_back(node); - current_starts.push_back(op_timing2_clock::clock::now()); + current_starts.push_back(op_profile_clock::clock::now()); } -void op_timing2::enter_kernel(std::string_view name, std::string_view target, std::string_view variant) 
{ - if (level == op_timing2_level::disabled) return; +void op_profile::enter_kernel(std::string_view name, std::string_view target, std::string_view variant) { + if (level == op_profile_level::disabled) return; assert(started && !finished); assert(current_scope.size() > 0); - if (level < op_timing2_level::kernel) { + if (level < op_profile_level::kernel) { extra_depth++; return; } @@ -293,16 +293,16 @@ void op_timing2::enter_kernel(std::string_view name, std::string_view target, st deviceSync(); enter(full_name); - current_scope.back().get().type = op_timing2_node_type::kernel; + current_scope.back().get().type = op_profile_node_type::kernel; } -void op_timing2::next(std::string_view name) { +void op_profile::next(std::string_view name) { exit(); enter(name, false); } -void op_timing2::exit(bool sync) { - if (level == op_timing2_level::disabled) return; +void op_profile::exit(bool sync) { + if (level == op_profile_level::disabled) return; assert(started && !finished); assert(current_scope.size() > 0); @@ -315,14 +315,14 @@ void op_timing2::exit(bool sync) { if (sync) deviceSync(); auto& node = current_scope.back().get(); - node.clock.submit(op_timing2_clock::clock::now() - current_starts.back()); + node.clock.submit(op_profile_clock::clock::now() - current_starts.back()); current_scope.pop_back(); current_starts.pop_back(); } -void op_timing2::finish() { - if (level == op_timing2_level::disabled) return; +void op_profile::finish() { + if (level == op_profile_level::disabled) return; assert(started && !finished); assert(current_scope.size() > 0); @@ -334,8 +334,8 @@ void op_timing2::finish() { finished = true; } -void op_timing2::combine() { - if (level == op_timing2_level::disabled) return; +void op_profile::combine() { + if (level == op_profile_level::disabled) return; assert(finished); if (combined) return; @@ -355,7 +355,7 @@ void op_timing2::combine() { MPI_Recv(msg.data(), size, MPI_BYTE, rank, 0, OP_MPI_WORLD, MPI_STATUS_IGNORE); json other_root_json = 
json::from_msgpack(msg); - auto other_root = other_root_json.template get(); + auto other_root = other_root_json.template get(); root += other_root; } @@ -373,8 +373,8 @@ void op_timing2::combine() { combined = true; } -void op_timing2::output() { - if (level == op_timing2_level::disabled) return; +void op_profile::output() { + if (level == op_profile_level::disabled) return; assert(finished); combine(); @@ -385,13 +385,13 @@ void op_timing2::output() { print_summary(); - char *json_filename = getenv("OP_TIMING2_JSON_OUTPUT"); + char *json_filename = getenv("OP_PROFILE_JSON_OUTPUT"); if (json_filename != NULL) output_json(json_filename); } -void op_timing2::output_json(std::string_view filename) { - if (level == op_timing2_level::disabled) return; +void op_profile::output_json(std::string_view filename) { + if (level == op_profile_level::disabled) return; assert(finished); combine(); @@ -414,7 +414,7 @@ void op_timing2::output_json(std::string_view filename) { output << root_json; } -void op_timing2::print_summary() { +void op_profile::print_summary() { // Output the non-kernel sections, and simultaneously gather a list of nodes which have // immediate kernel children std::vector> nodes_with_kernels; @@ -442,13 +442,13 @@ void op_timing2::print_summary() { print_kernel_summary(path, longest_name); } -void op_timing2::print_walk_non_kernel(const op_timing2_node& node, +void op_profile::print_walk_non_kernel(const op_profile_node& node, const std::vector& parent_path, std::vector>& nodes_with_kernels, unsigned indent) { bool has_kernel_child = false; for (auto& child: node.children) - if (child.type == op_timing2_node_type::kernel) has_kernel_child = true; + if (child.type == op_profile_node_type::kernel) has_kernel_child = true; std::vector current_path = parent_path; current_path.push_back(node.name); @@ -459,12 +459,12 @@ void op_timing2::print_walk_non_kernel(const op_timing2_node& node, std::printf("%*s%s %s\n", indent, "", node.name.c_str(), 
to_string(node.clock).c_str()); for (auto& child: node.children) { - if (child.type == op_timing2_node_type::kernel) continue; + if (child.type == op_profile_node_type::kernel) continue; print_walk_non_kernel(child, current_path, nodes_with_kernels, indent + 4); } } -void op_timing2::print_kernel_summary(const std::vector& path, unsigned longest_name) { +void op_profile::print_kernel_summary(const std::vector& path, unsigned longest_name) { // Print the header, starting with the node path int path_len = 0; for (size_t i = 0; i < path.size(); ++i) { @@ -479,7 +479,7 @@ void op_timing2::print_kernel_summary(const std::vector& path, unsi // And then the column headers for the table std::printf("%*s num total avg min max", longest_name + 4 - path_len, ""); - if (level >= op_timing2_level::kernel_detailed) std::printf(" %%kern"); + if (level >= op_profile_level::kernel_detailed) std::printf(" %%kern"); std::printf("\n"); // Fetch the node so we can print its children @@ -487,9 +487,9 @@ void op_timing2::print_kernel_summary(const std::vector& path, unsi auto& node = scope.size() == 0 ? root : root.get_child(scope); // Gather all kernel children - std::vector> kernel_nodes; + std::vector> kernel_nodes; for (auto& child: node.children) { - if (child.type != op_timing2_node_type::kernel) continue; + if (child.type != op_profile_node_type::kernel) continue; kernel_nodes.push_back(child); } @@ -509,7 +509,7 @@ void op_timing2::print_kernel_summary(const std::vector& path, unsi // Print each kernel child row, with kernel % if level = kernel_detailed auto kern_pct = std::string(""); - if (level >= op_timing2_level::kernel_detailed) { + if (level >= op_profile_level::kernel_detailed) { auto computation_node = child.get().get_child("Computation"); auto kernel_node = computation_node.has_child("Kernel") ? 
computation_node.get_child("Kernel") : computation_node; @@ -536,7 +536,7 @@ void op_timing2::print_kernel_summary(const std::vector& path, unsi std::printf("\n"); - if (level < op_timing2_level::kernel_detailed) return; + if (level < op_profile_level::kernel_detailed) return; // Print the full tree for the top detailed_limit kernels const auto detailed_limit = 4; @@ -554,19 +554,19 @@ void op_timing2::print_kernel_summary(const std::vector& path, unsi extern "C" { -void op_timing2_start(const char* name) { op_timing2::instance().start(name); } +void op_profile_start(const char* name) { op_profile::instance().start(name); } -void op_timing2_enter(const char* name) { op_timing2::instance().enter(name); } -void op_timing2_enter_kernel(const char* name, const char* target, const char* variant) { - op_timing2::instance().enter_kernel(name, target, variant); +void op_profile_enter(const char* name) { op_profile::instance().enter(name); } +void op_profile_enter_kernel(const char* name, const char* target, const char* variant) { + op_profile::instance().enter_kernel(name, target, variant); } -void op_timing2_next(const char* name) { op_timing2::instance().next(name); } +void op_profile_next(const char* name) { op_profile::instance().next(name); } -void op_timing2_exit() { op_timing2::instance().exit(); } -void op_timing2_finish() { op_timing2::instance().finish(); } +void op_profile_exit() { op_profile::instance().exit(); } +void op_profile_end() { op_profile::instance().finish(); } -void op_timing2_output() { op_timing2::instance().output(); } -void op_timing2_output_json(const char* filename) { op_timing2::instance().output_json(filename); } +void op_profile_output() { op_profile::instance().output(); } +void op_profile_output_json(const char* filename) { op_profile::instance().output_json(filename); } } diff --git a/op2/src/fortran/op2_for_declarations.F90 b/op2/src/fortran/op2_for_declarations.F90 index fa72fecf3..42fe17741 100644 --- 
a/op2/src/fortran/op2_for_declarations.F90 +++ b/op2/src/fortran/op2_for_declarations.F90 @@ -747,39 +747,39 @@ end function isCNullPointer_c subroutine op_timing_output () BIND(C,name='op_timing_output') end subroutine op_timing_output - subroutine op_timing2_start_c(name) BIND(C,name='op_timing2_start') + subroutine op_profile_start_c(name) BIND(C,name='op_profile_start') use ISO_C_BINDING character(kind=c_char) :: name(*) - end subroutine op_timing2_start_c + end subroutine op_profile_start_c - subroutine op_timing2_enter_c(name) BIND(C,name='op_timing2_enter') + subroutine op_profile_enter_c(name) BIND(C,name='op_profile_enter') use ISO_C_BINDING character(kind=c_char) :: name(*) - end subroutine op_timing2_enter_c + end subroutine op_profile_enter_c - subroutine op_timing2_enter_kernel_c(name, target, variant) BIND(C,name='op_timing2_enter_kernel') + subroutine op_profile_enter_kernel_c(name, target, variant) BIND(C,name='op_profile_enter_kernel') use ISO_C_BINDING character(kind=c_char) :: name(*), target(*), variant(*) - end subroutine op_timing2_enter_kernel_c + end subroutine op_profile_enter_kernel_c - subroutine op_timing2_next_c(name) BIND(C,name='op_timing2_next') + subroutine op_profile_next_c(name) BIND(C,name='op_profile_next') use ISO_C_BINDING character(kind=c_char) :: name(*) - end subroutine op_timing2_next_c + end subroutine op_profile_next_c - subroutine op_timing2_exit() BIND(C,name='op_timing2_exit') - end subroutine op_timing2_exit + subroutine op_profile_exit() BIND(C,name='op_profile_exit') + end subroutine op_profile_exit - subroutine op_timing2_finish() BIND(C,name='op_timing2_finish') - end subroutine op_timing2_finish + subroutine op_profile_end() BIND(C,name='op_profile_end') + end subroutine op_profile_end - subroutine op_timing2_output() BIND(C,name='op_timing2_output') - end subroutine op_timing2_output + subroutine op_profile_output() BIND(C,name='op_profile_output') + end subroutine op_profile_output - subroutine 
op_timing2_output_json_c(filename) BIND(C,name='op_timing2_output_json') + subroutine op_profile_output_json_c(filename) BIND(C,name='op_profile_output_json') use ISO_C_BINDING character(kind=c_char) :: filename(*) - end subroutine op_timing2_output_json_c + end subroutine op_profile_output_json_c subroutine op_print_c (line) BIND(C,name='op_print') use ISO_C_BINDING @@ -2152,53 +2152,53 @@ function get_associated_set_size ( dat ) end function - subroutine op_timing2_start(name) + subroutine op_profile_start(name) use, intrinsic :: ISO_C_BINDING implicit none character(kind=c_char, len=*) :: name - call op_timing2_start_c(name /@/ C_NULL_CHAR) + call op_profile_start_c(name /@/ C_NULL_CHAR) end subroutine - subroutine op_timing2_enter(name) + subroutine op_profile_enter(name) use, intrinsic :: ISO_C_BINDING implicit none character(kind=c_char, len=*) :: name - call op_timing2_enter_c(name /@/ C_NULL_CHAR) + call op_profile_enter_c(name /@/ C_NULL_CHAR) end subroutine - subroutine op_timing2_enter_kernel(name, target, variant) + subroutine op_profile_enter_kernel(name, target, variant) use, intrinsic :: ISO_C_BINDING implicit none character(kind=c_char, len=*) :: name, target, variant - call op_timing2_enter_kernel_c(name /@/ C_NULL_CHAR, target /@/ C_NULL_CHAR, variant /@/ C_NULL_CHAR) + call op_profile_enter_kernel_c(name /@/ C_NULL_CHAR, target /@/ C_NULL_CHAR, variant /@/ C_NULL_CHAR) end subroutine - subroutine op_timing2_next(name) + subroutine op_profile_next(name) use, intrinsic :: ISO_C_BINDING implicit none character(kind=c_char, len=*) :: name - call op_timing2_next_c(name /@/ C_NULL_CHAR) + call op_profile_next_c(name /@/ C_NULL_CHAR) end subroutine - subroutine op_timing2_output_json(filename) + subroutine op_profile_output_json(filename) use, intrinsic :: ISO_C_BINDING implicit none character(kind=c_char, len=*) :: filename - call op_timing2_output_json_c(filename /@/ C_NULL_CHAR) + call op_profile_output_json_c(filename /@/ C_NULL_CHAR) end subroutine 
diff --git a/op2/src/mpi/op_mpi_part_core.cpp b/op2/src/mpi/op_mpi_part_core.cpp index b2d06ed35..22d9defb6 100644 --- a/op2/src/mpi/op_mpi_part_core.cpp +++ b/op2/src/mpi/op_mpi_part_core.cpp @@ -3515,6 +3515,18 @@ construct_adj_list(op_map primary_map, halo_list exp_list, halo_list imp_list, return std::make_tuple(adj, adj_i, adj_cap); } +#ifdef DEBUG +static inline void check_global_index_int32_range(idx_g_t g_index, + const op_map primary_map, + int my_rank) { + if (g_index < (idx_g_t)INT32_MIN || g_index > (idx_g_t)INT32_MAX) { + op_printf("Error: global index out of 32-bit integer range for map %s on rank %d (index=%lld)\n", + primary_map->name, my_rank, (long long)g_index); + MPI_Abort(OP_PART_WORLD, 2); + } +} +#endif + /******************************************************************************* * Setup variables for k-way partitioning *******************************************************************************/ @@ -3540,6 +3552,11 @@ setup_part_data(op_map primary_map, int my_rank, int comm_size, idx_g_t **adj, for (int i = 0; i < primary_map->to->size; i++) { idx_g_t g_index = get_global_index( i, my_rank, part_range[primary_map->to->index], comm_size); +#ifdef DEBUG + if constexpr (sizeof(T) == sizeof(int)) { + check_global_index_int32_range(g_index, primary_map, my_rank); + } +#endif op_sort(adj[i], adj_i[i]); adj_i[i] = removeDups(adj[i], adj_i[i]); diff --git a/tests/functional/const/const_tests.cpp b/tests/functional/const/const_tests.cpp index 0f783e908..a9d1889db 100644 --- a/tests/functional/const/const_tests.cpp +++ b/tests/functional/const/const_tests.cpp @@ -1,7 +1,7 @@ // Not intended to be used with OP_NO_REALLOC flag #include "op_seq.h" -#include "op_timing2.h" +#include "op_profile.h" #include #define TOL 1e-9 @@ -32,7 +32,7 @@ void consts4(double *dat) { int main(int argc, char **argv) { op_init(argc, argv, 2); - op_timing2_start("CppConstTests"); + op_profile_start("CppConstTests"); constexpr int size = 32; op_set set = 
op_decl_set(size, "my_set");; @@ -71,8 +71,8 @@ int main(int argc, char **argv) { printf("consts4 passed\n"); } - op_timing2_finish(); - op_timing2_output(); + op_profile_end(); + op_profile_output(); op_exit(); diff --git a/tests/functional/const_fortran/const_tests.F90 b/tests/functional/const_fortran/const_tests.F90 index 74a1bdac2..63d0ecf27 100644 --- a/tests/functional/const_fortran/const_tests.F90 +++ b/tests/functional/const_fortran/const_tests.F90 @@ -29,7 +29,7 @@ program const_tests_fortran integer :: i, d call op_init_base(0, 0) - call op_timing2_start("FortranConstTests") + call op_profile_start("FortranConstTests") call op_decl_set(size, set, "my_set") write(*,*) "set size =", set%setPtr%size @@ -73,10 +73,10 @@ program const_tests_fortran end do write(*,*) "consts4 passed" - call op_timing2_finish() + call op_profile_end() if (op_is_root() == 1) print * - call op_timing2_output() + call op_profile_output() call op_exit() diff --git a/tests/functional/dat_reductions/reduc_tests.cpp b/tests/functional/dat_reductions/reduc_tests.cpp index 5a1b1c03e..6621c1f69 100644 --- a/tests/functional/dat_reductions/reduc_tests.cpp +++ b/tests/functional/dat_reductions/reduc_tests.cpp @@ -5,7 +5,7 @@ #endif #include "op_seq.h" -#include "op_timing2.h" +#include "op_profile.h" #include "../utility.h" @@ -45,7 +45,7 @@ void indirect_dat3_inc(float *n0i, float *n1i, const float *er) { int main(int argc, char **argv) { op_init(argc, argv, 2); - op_timing2_start("CppReductionTests"); + op_profile_start("CppReductionTests"); int my_rank = 0; int comm_size = 1; @@ -192,8 +192,8 @@ int main(int argc, char **argv) { printf("direct_dat4_inc passed [rank %d]\n", my_rank); } - op_timing2_finish(); - op_timing2_output(); + op_profile_end(); + op_profile_output(); op_exit(); diff --git a/tests/functional/dat_reductions_fortran/reduc_tests.F90 b/tests/functional/dat_reductions_fortran/reduc_tests.F90 index 7b1a83172..28f6d5f41 100644 --- 
a/tests/functional/dat_reductions_fortran/reduc_tests.F90 +++ b/tests/functional/dat_reductions_fortran/reduc_tests.F90 @@ -45,7 +45,7 @@ program reduc_tests_fortran real(4), dimension(:), allocatable :: expected call op_init_base(0, 0) - call op_timing2_start("FortranReductionTests") + call op_profile_start("FortranReductionTests") call get_rank_and_size(my_rank, comm_size) @@ -187,10 +187,10 @@ program reduc_tests_fortran deallocate(fetched) - call op_timing2_finish() + call op_profile_end() if (op_is_root() == 1) print * - call op_timing2_output() + call op_profile_output() call op_exit() diff --git a/tests/functional/gbl/gbl_tests.cpp b/tests/functional/gbl/gbl_tests.cpp index 3054683ba..a43573246 100644 --- a/tests/functional/gbl/gbl_tests.cpp +++ b/tests/functional/gbl/gbl_tests.cpp @@ -5,7 +5,7 @@ #endif #include "op_seq.h" -#include "op_timing2.h" +#include "op_profile.h" #include "../utility.h" @@ -57,7 +57,7 @@ void max5(const double *dat, double *g) { int main(int argc, char **argv) { op_init(argc, argv, 2); - op_timing2_start("CppGblArgTests"); + op_profile_start("CppGblArgTests"); int my_rank = 0; int comm_size = 1; @@ -219,8 +219,8 @@ int main(int argc, char **argv) { printf("max5 passed\n"); } - op_timing2_finish(); - op_timing2_output(); + op_profile_end(); + op_profile_output(); op_exit(); diff --git a/tests/functional/gbl_fortran/gbl_tests.F90 b/tests/functional/gbl_fortran/gbl_tests.F90 index b1bd23fa1..88f2eb177 100644 --- a/tests/functional/gbl_fortran/gbl_tests.F90 +++ b/tests/functional/gbl_fortran/gbl_tests.F90 @@ -49,7 +49,7 @@ program gbl_tests_fortran #endif call op_init_base(0, 0) - call op_timing2_start("FortranGblArgTests") + call op_profile_start("FortranGblArgTests") call get_rank_and_size(my_rank, comm_size) @@ -182,10 +182,10 @@ program gbl_tests_fortran end do write(*,*) "max5 passed" - call op_timing2_finish() + call op_profile_end() if (op_is_root() == 1) print * - call op_timing2_output() + call op_profile_output() call 
op_exit() diff --git a/tests/functional/idx/idx_tests.cpp b/tests/functional/idx/idx_tests.cpp index 7ef12a24b..7caedb069 100644 --- a/tests/functional/idx/idx_tests.cpp +++ b/tests/functional/idx/idx_tests.cpp @@ -5,6 +5,7 @@ #endif #include "op_seq.h" +#include "op_profile.h" #include "../utility.h" @@ -43,6 +44,7 @@ void write_mixed_idx(double *dat, const int *direct_idx, const int *idx0, int main(int argc, char **argv) { op_init(argc, argv, 2); + op_profile_start("CppIdxTests"); int my_rank = 0; int comm_size = 1; @@ -157,6 +159,9 @@ int main(int argc, char **argv) { printf("mixed direct and indirect idx passed [rank %d]\n", my_rank); } + op_profile_end(); + op_profile_output(); + op_exit(); return 0; diff --git a/tests/functional/idx_fortran/idx_tests.F90 b/tests/functional/idx_fortran/idx_tests.F90 index d1e1bf557..34cd023a8 100644 --- a/tests/functional/idx_fortran/idx_tests.F90 +++ b/tests/functional/idx_fortran/idx_tests.F90 @@ -42,7 +42,7 @@ program idx_tests_fortran real(8) :: expected call op_init_base(0, 0) - call op_timing2_start("FortranIdxTests") + call op_profile_start("FortranIdxTests") call get_rank_and_size(my_rank, comm_size) @@ -157,10 +157,10 @@ program idx_tests_fortran deallocate(fetched) - call op_timing2_finish() + call op_profile_end() if (op_is_root() == 1) print * - call op_timing2_output() + call op_profile_output() call op_exit() diff --git a/tests/functional/strides/stride_tests.cpp b/tests/functional/strides/stride_tests.cpp index 28ede30f6..ab1e92f4c 100644 --- a/tests/functional/strides/stride_tests.cpp +++ b/tests/functional/strides/stride_tests.cpp @@ -5,7 +5,7 @@ #endif #include "op_seq.h" -#include "op_timing2.h" +#include "op_profile.h" #include "../utility.h" @@ -41,7 +41,7 @@ void write5_within_kernel(double *dat0, double *dat1, const double *read) { int main(int argc, char **argv) { op_init(argc, argv, 2); - op_timing2_start("CppStrideTests"); + op_profile_start("CppStrideTests"); int my_rank = 0; int comm_size = 1; @@ 
-112,8 +112,8 @@ int main(int argc, char **argv) { printf("write5_within_kernel passed\n"); } - op_timing2_finish(); - op_timing2_output(); + op_profile_end(); + op_profile_output(); op_exit(); diff --git a/tests/functional/strides_fortran/stride_tests.F90 b/tests/functional/strides_fortran/stride_tests.F90 index 4e191bfb1..d63cbfd33 100644 --- a/tests/functional/strides_fortran/stride_tests.F90 +++ b/tests/functional/strides_fortran/stride_tests.F90 @@ -40,7 +40,7 @@ program stride_tests_fortran dim2 = 5 call op_init_base(0, 0) - call op_timing2_start("FortranStrideTests") + call op_profile_start("FortranStrideTests") call get_rank_and_size(my_rank, comm_size) @@ -106,10 +106,10 @@ program stride_tests_fortran end do write(*,*) "write5_within_kernel passed" - call op_timing2_finish() + call op_profile_end() if (op_is_root() == 1) print * - call op_timing2_output() + call op_profile_output() call op_exit() diff --git a/translator-v2/resources/templates/cpp/cuda/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/cuda/loop_host.hpp.jinja index e68105789..fe77087b5 100644 --- a/translator-v2/resources/templates/cpp/cuda/loop_host.hpp.jinja +++ b/translator-v2/resources/templates/cpp/cuda/loop_host.hpp.jinja @@ -340,8 +340,11 @@ op_cuda_{{lh.name}}<< 0 else "2"}}; ++round ) { - if (round == 1) + if (round == 1) { + op_profile_next("MPI Wait"); op_mpi_wait_all_grouped(num_args_expanded, args_expanded, 2); + op_profile_next("Computation"); + } {% if lh.args|gbl|reduction|length > 0 %} int start = round == 0 ? 0 : (round == 1 ? 
set->core_size : set->size); @@ -366,8 +369,11 @@ op_cuda_{{lh.name}}<<ncolors; ++col) { - if (col == plan->ncolors_core) + if (col == plan->ncolors_core) { + op_profile_next("MPI Wait"); op_mpi_wait_all_grouped(num_args_expanded, args_expanded, 2); + op_profile_next("Computation"); + } int start = plan->col_offsets[0][col]; int end = plan->col_offsets[0][col + 1]; @@ -408,6 +414,9 @@ op_cuda_{{lh.name}}<< 0 %} + op_profile_next("MPI Reduce"); + {% for arg in lh.args|gbl|reduction %} {% call opt_if(arg) %} arg{{arg.id}}.data = (char *)arg{{arg.id}}_host_data; @@ -415,8 +424,12 @@ op_cuda_{{lh.name}}<< 1 %}[{{const.dim}}]{ {{super()}} #include "op_cuda_rt_support.h" #include "op_cuda_reduction.h" +#include {% endblock %} {% block const_decl_func %} diff --git a/translator-v2/resources/templates/cpp/hip/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/hip/loop_host.hpp.jinja index b633a452c..c7aee237f 100644 --- a/translator-v2/resources/templates/cpp/hip/loop_host.hpp.jinja +++ b/translator-v2/resources/templates/cpp/hip/loop_host.hpp.jinja @@ -338,8 +338,11 @@ op_hip_{{lh.name}}<< 0 else "2"}}; ++round ) { - if (round == 1) + if (round == 1) { + op_profile_next("MPI Wait"); op_mpi_wait_all_grouped(num_args_expanded, args_expanded, 2); + op_profile_next("Computation"); + } {% if lh.args|gbl|reduction|length > 0 %} int start = round == 0 ? 0 : (round == 1 ? 
set->core_size : set->size); @@ -364,8 +367,11 @@ op_hip_{{lh.name}}<<ncolors; ++col) { - if (col == plan->ncolors_core) + if (col == plan->ncolors_core) { + op_profile_next("MPI Wait"); op_mpi_wait_all_grouped(num_args_expanded, args_expanded, 2); + op_profile_next("Computation"); + } int start = plan->col_offsets[0][col]; int end = plan->col_offsets[0][col + 1]; @@ -406,6 +412,9 @@ op_hip_{{lh.name}}<< 0 %} + op_profile_next("MPI Reduce"); + {% for arg in lh.args|gbl|reduction %} {% call opt_if(arg) %} arg{{arg.id}}.data = (char *)arg{{arg.id}}_host_data; @@ -413,8 +422,12 @@ op_hip_{{lh.name}}<< 0 %} unsigned opt_flags = 0; @@ -534,7 +534,7 @@ void op_par_loop_{{lh.name}}( {% endfor %} {% endif %} - // op_timing2_enter("Prepare GBLs"); + op_profile_enter("Prepare GBLs"); prepareDeviceGbls(args, n_args, block_size * max_blocks); bool exit_sync = false; @@ -542,7 +542,7 @@ void op_par_loop_{{lh.name}}( arg{{arg.id}} = args[{{loop.index0}}]; {% endfor %} - // op_timing2_next("Update GBL Refs"); + op_profile_next("Update GBL Refs"); {% for arg in args_gbl_per_thread|select2("min", "max", "work") %} if (gbl{{arg.id}}_ref_d == nullptr{{" && arg%s.opt == 1" % arg.id if arg is opt}}) { CUDA_SAFE_CALL({{api_prefix}}Malloc(&gbl{{arg.id}}_ref_d, {{gbl_dim(arg)}} * sizeof({{arg.typ.c()}}))); @@ -553,36 +553,36 @@ void op_par_loop_{{lh.name}}( {% endfor %} {% if args_gbl_per_thread|length > 0 %} - // op_timing2_next("Init GBLs"); + op_profile_next("Init GBLs"); int stride_gbl = block_size * max_blocks; {{init_gbls()|indent}} {% endif %} - // op_timing2_exit(); - // op_timing2_next("Computation"); + op_profile_exit(); + op_profile_next("Computation"); {% if lh is direct %} int start = 0; int end = set->size; - // op_timing2_enter("Kernel"); + op_profile_enter("Kernel"); int size = f2c::round32(set->size); {{kernel_call()|indent}} - // op_timing2_next("Process GBLs"); + op_profile_next("Process GBLs"); {{process_gbls()|indent}} - // op_timing2_exit(); + op_profile_exit(); {% 
elif config.atomics %} - // op_timing2_enter("Kernel"); + op_profile_enter("Kernel"); for (int round = 1; round < sections.size(); ++round) { if (round == 2) { - // op_timing2_next("MPI Wait"); + op_profile_next("MPI Wait"); op_mpi_wait_all_grouped(n_args, args, 2); - // op_timing2_next("Kernel"); + op_profile_next("Kernel"); } int start = sections[round - 1]; @@ -598,22 +598,22 @@ void op_par_loop_{{lh.name}}( {% if lh.args|gbl|reject("read")|list|length > 0 %} if (round == 2) { - // op_timing2_next("Process GBLs"); + op_profile_next("Process GBLs"); {{process_gbls()|indent(12)}} - // op_timing2_next("Kernel"); + op_profile_next("Kernel"); } {% endif %} } - // op_timing2_exit(); + op_profile_exit(); {% else %} - // op_timing2_enter("Kernel"); + op_profile_enter("Kernel"); for (int col = 0; col < plan->ncolors; ++col) { if (col == plan->ncolors_core) { - // op_timing2_next("MPI Wait"); + op_profile_next("MPI Wait"); op_mpi_wait_all_grouped(n_args, args, 2); - // op_timing2_next("Kernel"); + op_profile_next("Kernel"); } int start = plan->col_offsets[0][col]; @@ -627,19 +627,19 @@ void op_par_loop_{{lh.name}}( {% if lh.args|gbl|reject("read")|list|length > 0 %} if (col == plan->ncolors_owned - 1) { - // op_timing2_next("Process GBLs"); + op_profile_next("Process GBLs"); {{process_gbls()|indent(12)}} - // op_timing2_next("Kernel"); + op_profile_next("Kernel"); } {% endif %} } - // op_timing2_exit(); + op_profile_exit(); {% endif %} - // op_timing2_exit(); + op_profile_exit(); - // op_timing2_enter("Finalise"); + op_profile_enter("Finalise"); {% for arg in lh.args|gbl|reduction %} op_mpi_reduce(&arg{{arg.id}}, ({{arg.typ.c()}} *)arg{{arg.id}}.data); {% endfor %} @@ -647,6 +647,6 @@ void op_par_loop_{{lh.name}}( op_mpi_set_dirtybit_cuda(n_args, args); if (exit_sync) CUDA_SAFE_CALL({{api_prefix}}StreamSynchronize(0)); - // op_timing2_exit(); - // op_timing2_exit(); + op_profile_exit(); + op_profile_exit(); } diff --git 
a/translator-v2/resources/templates/cpp/jit_cuda/master_kernel.cu.jinja b/translator-v2/resources/templates/cpp/jit_cuda/master_kernel.cu.jinja index 174e7a470..a962fe886 100644 --- a/translator-v2/resources/templates/cpp/jit_cuda/master_kernel.cu.jinja +++ b/translator-v2/resources/templates/cpp/jit_cuda/master_kernel.cu.jinja @@ -31,7 +31,7 @@ INCTXT(OP_F2C_PRELUDE, "op_f2c_prelude.h"); #include #include -#include +#include #include #include diff --git a/translator-v2/resources/templates/cpp/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/loop_host.hpp.jinja index 94c64aaed..f4aa14906 100644 --- a/translator-v2/resources/templates/cpp/loop_host.hpp.jinja +++ b/translator-v2/resources/templates/cpp/loop_host.hpp.jinja @@ -21,6 +21,7 @@ int dats_indirect[{{lh.args_expanded|length}}] = { {% endif %} {% endmacro %} {% block prologue %} +#include {% endblock %} {% block kernel %} namespace op2_k{{kernel_idx}} { @@ -53,11 +54,16 @@ void op_par_loop_{{lh.name}}( op_timers_core(&cpu_start, &wall_start); + op_profile_enter_kernel(name, "{{config.target}}", "{{"Direct" if lh is direct else "Indirect"}}"); + op_profile_enter("MPI Exchanges"); + if (OP_diags > 2) printf(" kernel routine ({{"direct" if lh is direct else "indirect"}}): {{lh.name}}\n"); int set_size = op_mpi_halo_exchanges{{"_grouped" if config.grouped-}} (set, num_args_expanded, args_expanded{{(", %d" % config.device) if config.grouped}}); + + op_profile_next("Computation"); {% endblock %} {% block host_loop required %} diff --git a/translator-v2/resources/templates/cpp/openmp/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/openmp/loop_host.hpp.jinja index 68438082f..1f2b86e9c 100644 --- a/translator-v2/resources/templates/cpp/openmp/loop_host.hpp.jinja +++ b/translator-v2/resources/templates/cpp/openmp/loop_host.hpp.jinja @@ -246,8 +246,11 @@ void {{lh.name}}_wrapper( {% else %} int block_offset = 0; for (int col = 0; col < plan->ncolors; ++col) { - if (col == plan->ncolors_core) + if 
(col == plan->ncolors_core) { + op_profile_next("MPI Wait"); op_mpi_wait_all(num_args_expanded, args_expanded); + op_profile_next("Computation"); + } int num_blocks = plan->ncolblk[col]; @@ -312,6 +315,7 @@ void {{lh.name}}_wrapper( {% block host_epilogue %} {% if lh is indirect -%} {# TODO: is this indirect check necessary? #} + op_profile_next("MPI Wait"); if (set_size == set->core_size) op_mpi_wait_all(num_args_expanded, args_expanded); @@ -338,10 +342,17 @@ void {{lh.name}}_wrapper( {% endfor %} {% endif %} + {% if lh.args|gbl|reduction|length > 0 %} + op_profile_next("MPI Reduce"); + {% for arg in lh.args|gbl|reduction %} op_mpi_reduce(&arg{{arg.id}}, gbl{{arg.id}}); {% endfor %} + {% endif %} + op_profile_exit(); + op_mpi_set_dirtybit(num_args_expanded, args_expanded); + op_profile_exit(); {{super()}} {% endblock %} diff --git a/translator-v2/resources/templates/cpp/seq/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/seq/loop_host.hpp.jinja index a61fc2f2b..6ce103a0e 100644 --- a/translator-v2/resources/templates/cpp/seq/loop_host.hpp.jinja +++ b/translator-v2/resources/templates/cpp/seq/loop_host.hpp.jinja @@ -65,6 +65,7 @@ info{{arg.id}}_temp {%- endmacro -%} #include +#include #include #include @@ -89,8 +90,13 @@ void op_par_loop_{{lh.name}}( args[{{loop.index0}}] = arg{{arg.id}}; {% endfor %} + op_profile_enter_kernel("{{lh.name}}", "seq", "{{variant_str()}}"); + + op_profile_enter("MPI Exchanges"); int n_exec = op_mpi_halo_exchanges(set, n_args, args); + op_profile_next("Computation"); + {% for arg in lh.args|gbl|reject("read") if lh is indirect %} {{arg.typ.c()}} gbl{{arg.id}}_temp[{{arg_dim(arg)}}]; {% endfor %} @@ -108,7 +114,9 @@ void op_par_loop_{{lh.name}}( for (int n = 0; n < n_exec; ++n) { {% if lh is indirect %} if (n == set->core_size) { + op_profile_next("MPI Wait"); op_mpi_wait_all(n_args, args); + op_profile_next("Computation"); } {% for map in lh.maps %} @@ -152,14 +160,20 @@ void op_par_loop_{{lh.name}}( } {% endif %} + 
op_profile_next("MPI Wait"); if (n_exec == 0 || n_exec == set->core_size) op_mpi_wait_all(n_args, args); {% if lh.args|gbl|reduction|length > 0 %} + op_profile_next("MPI Reduce"); + {% for arg in lh.args|gbl|reduction %} op_mpi_reduce(&arg{{arg.id}}, ({{arg.typ.c()}} *)arg{{arg.id}}.data); {% endfor %} {% endif %} + op_profile_exit(); + op_mpi_set_dirtybit(n_args, args); + op_profile_exit(); } diff --git a/translator-v2/resources/templates/fortran/c_cuda/loop_host.cuh.jinja b/translator-v2/resources/templates/fortran/c_cuda/loop_host.cuh.jinja index 6925b4ae6..0b34a3674 100644 --- a/translator-v2/resources/templates/fortran/c_cuda/loop_host.cuh.jinja +++ b/translator-v2/resources/templates/fortran/c_cuda/loop_host.cuh.jinja @@ -319,6 +319,8 @@ void op2_k_{{lh.name}}{{variant}}_wrapper( const char op2_k_{{lh.name}}{{variant}}_src[] = R"_op2_k( namespace op2_m_{{lh.name}}{{variant}} { +using int64_t = long long int; + {{kernel_func}}} {{kernel_wrapper(jit=true)}} @@ -374,10 +376,10 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( int n_args = {{lh.args|length}}; op_arg args[{{lh.args|length}}]; - op_timing2_enter_kernel("{{lh.name}}", "c_CUDA", "{{variant_str()}}"); - op_timing2_enter("Init"); + op_profile_enter_kernel("{{lh.name}}", "c_CUDA", "{{variant_str()}}"); + op_profile_enter("Init"); - op_timing2_enter("Kernel Info Setup"); + op_profile_enter("Kernel Info Setup"); static bool first_invocation = true; static op::f2c::KernelInfo info("op2_k_{{lh.name}}{{variant}}_wrapper", @@ -409,12 +411,12 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( args[{{loop.index0}}] = arg{{arg.id}}; {% endfor %} - op_timing2_next("MPI Exchanges"); + op_profile_next("MPI Exchanges"); int n_exec = op_mpi_halo_exchanges_grouped(set, n_args, args, 2); if (n_exec == 0) { - op_timing2_exit(); - op_timing2_exit(); + op_profile_exit(); + op_profile_exit(); op_mpi_wait_all_grouped(n_args, args, 2); @@ -423,7 +425,7 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( {% endfor %} 
op_mpi_set_dirtybit_cuda(n_args, args); - op_timing2_exit(); + op_profile_exit(); return; } @@ -444,9 +446,9 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( static {{arg.typ.c()}}* gbl{{arg.id}}_ref_d = nullptr; {% endfor %} - op_timing2_next("Get Kernel"); + op_profile_next("Get Kernel"); auto *kernel_inst = info.get_kernel(); - op_timing2_exit(); + op_profile_exit(); {% if lh is direct %} auto [block_limit, block_size] = info.get_launch_config(kernel_inst, set->size); @@ -479,7 +481,7 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( {%- endfor -%} }; - op_timing2_enter("Plan"); + op_profile_enter("Plan"); #ifdef OP_PART_SIZE_{{kernel_idx}} int part_size = OP_PART_SIZE_{{kernel_idx}}; @@ -511,7 +513,7 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( } max_blocks = std::min(max_blocks, block_limit); - op_timing2_exit(); + op_profile_exit(); {% endif %} {% if lh.args|opt|length > 0 %} @@ -521,7 +523,7 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( {% endfor %} {% endif %} - op_timing2_enter("Prepare GBLs"); + op_profile_enter("Prepare GBLs"); prepareDeviceGbls(args, n_args, block_size * max_blocks); bool exit_sync = false; @@ -529,7 +531,7 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( arg{{arg.id}} = args[{{loop.index0}}]; {% endfor %} - op_timing2_next("Update GBL Refs"); + op_profile_next("Update GBL Refs"); {% for arg in args_gbl_per_thread|select2("min", "max", "work") %} if (gbl{{arg.id}}_ref_d == nullptr{{" && arg%s.opt == 1" % arg.id if arg is opt}}) { CUDA_SAFE_CALL({{api_prefix}}Malloc(&gbl{{arg.id}}_ref_d, {{gbl_dim(arg)}} * sizeof({{arg.typ.c()}}))); @@ -540,36 +542,36 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( {% endfor %} {% if args_gbl_per_thread|length > 0 %} - op_timing2_next("Init GBLs"); + op_profile_next("Init GBLs"); int stride_gbl = block_size * max_blocks; {{init_gbls()|indent}} {% endif %} - op_timing2_exit(); - op_timing2_next("Computation"); + op_profile_exit(); + op_profile_next("Computation"); {% if lh is direct %} 
int start = 0; int end = set->size; - op_timing2_enter("Kernel"); + op_profile_enter("Kernel"); int size = f2c::round32(set->size); {{kernel_call()|indent}} - op_timing2_next("Process GBLs"); + op_profile_next("Process GBLs"); {{process_gbls()|indent}} - op_timing2_exit(); + op_profile_exit(); {% elif config.atomics %} - op_timing2_enter("Kernel"); + op_profile_enter("Kernel"); for (int round = 1; round < sections.size(); ++round) { if (round == 2) { - op_timing2_next("MPI Wait"); + op_profile_next("MPI Wait"); op_mpi_wait_all_grouped(n_args, args, 2); - op_timing2_next("Kernel"); + op_profile_next("Kernel"); } int start = sections[round - 1]; @@ -585,22 +587,22 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( {% if lh.args|gbl|reject("read")|list|length > 0 %} if (round == 2) { - op_timing2_next("Process GBLs"); + op_profile_next("Process GBLs"); {{process_gbls()|indent(12)}} - op_timing2_next("Kernel"); + op_profile_next("Kernel"); } {% endif %} } - op_timing2_exit(); + op_profile_exit(); {% else %} - op_timing2_enter("Kernel"); + op_profile_enter("Kernel"); for (int col = 0; col < plan->ncolors; ++col) { if (col == plan->ncolors_core) { - op_timing2_next("MPI Wait"); + op_profile_next("MPI Wait"); op_mpi_wait_all_grouped(n_args, args, 2); - op_timing2_next("Kernel"); + op_profile_next("Kernel"); } int start = plan->col_offsets[0][col]; @@ -614,19 +616,19 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( {% if lh.args|gbl|reject("read")|list|length > 0 %} if (col == plan->ncolors_owned - 1) { - op_timing2_next("Process GBLs"); + op_profile_next("Process GBLs"); {{process_gbls()|indent(12)}} - op_timing2_next("Kernel"); + op_profile_next("Kernel"); } {% endif %} } - op_timing2_exit(); + op_profile_exit(); {% endif %} - op_timing2_exit(); + op_profile_exit(); - op_timing2_enter("Finalise"); + op_profile_enter("Finalise"); {% for arg in lh.args|gbl|reduction %} op_mpi_reduce(&arg{{arg.id}}, ({{arg.typ.c()}} *)arg{{arg.id}}.data); {% endfor %} @@ -634,6 +636,6 @@ 
extern "C" void op2_k_{{lh.name}}{{variant}}_c( op_mpi_set_dirtybit_cuda(n_args, args); if (exit_sync) CUDA_SAFE_CALL({{api_prefix}}StreamSynchronize(0)); - op_timing2_exit(); - op_timing2_exit(); + op_profile_exit(); + op_profile_exit(); } diff --git a/translator-v2/resources/templates/fortran/c_cuda/master_kernel.cu.jinja b/translator-v2/resources/templates/fortran/c_cuda/master_kernel.cu.jinja index e71dbcdcc..9b2cbdbb4 100644 --- a/translator-v2/resources/templates/fortran/c_cuda/master_kernel.cu.jinja +++ b/translator-v2/resources/templates/fortran/c_cuda/master_kernel.cu.jinja @@ -52,7 +52,7 @@ INCTXT(OP_F2C_PRELUDE, "op_f2c_prelude.h"); #include #include -#include +#include #include #include diff --git a/translator-v2/resources/templates/fortran/c_seq/loop_host.cpp.jinja b/translator-v2/resources/templates/fortran/c_seq/loop_host.cpp.jinja index 62bb3ce7f..9dc01581d 100644 --- a/translator-v2/resources/templates/fortran/c_seq/loop_host.cpp.jinja +++ b/translator-v2/resources/templates/fortran/c_seq/loop_host.cpp.jinja @@ -83,7 +83,7 @@ info{{arg.id}}_temp #include #include -#include +#include #include #include @@ -109,12 +109,12 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( args[{{loop.index0}}] = arg{{arg.id}}; {% endfor %} - op_timing2_enter_kernel("{{lh.name}}", "c_seq", "{{variant_str()}}"); + op_profile_enter_kernel("{{lh.name}}", "c_seq", "{{variant_str()}}"); - op_timing2_enter("MPI Exchanges"); + op_profile_enter("MPI Exchanges"); int n_exec = op_mpi_halo_exchanges(set, n_args, args); - op_timing2_next("Computation"); + op_profile_next("Computation"); {% for arg in lh.args|gbl|reject("read") if lh is indirect %} {{arg.typ.c()}} gbl{{arg.id}}_temp[{{arg_dim(arg)}}]; @@ -138,9 +138,9 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( for (int n = 0; n < n_exec; ++n) { {% if lh is indirect %} if (n == set->core_size) { - op_timing2_next("MPI Wait"); + op_profile_next("MPI Wait"); op_mpi_wait_all(n_args, args); - op_timing2_next("Computation"); + 
op_profile_next("Computation"); } {% for map in lh.maps %} @@ -184,19 +184,19 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c( } {% endif %} - op_timing2_next("MPI Wait"); + op_profile_next("MPI Wait"); if (n_exec == 0 || n_exec == set->core_size) op_mpi_wait_all(n_args, args); {% if lh.args|gbl|reduction|length > 0 %} - op_timing2_next("MPI Reduce"); + op_profile_next("MPI Reduce"); {% for arg in lh.args|gbl|reduction %} op_mpi_reduce(&arg{{arg.id}}, ({{arg.typ.c()}} *)arg{{arg.id}}.data); {% endfor %} {% endif %} - op_timing2_exit(); + op_profile_exit(); op_mpi_set_dirtybit(n_args, args); - op_timing2_exit(); + op_profile_exit(); } diff --git a/translator-v2/resources/templates/fortran/cuda/loop_host.CUF.jinja b/translator-v2/resources/templates/fortran/cuda/loop_host.CUF.jinja index a212a027d..0165efde1 100644 --- a/translator-v2/resources/templates/fortran/cuda/loop_host.CUF.jinja +++ b/translator-v2/resources/templates/fortran/cuda/loop_host.CUF.jinja @@ -386,15 +386,15 @@ subroutine op2_k_{{lh.name}}{{variant}}( & args({{loop.index}}) = arg{{arg.id}} {% endfor %} - call op_timing2_enter_kernel("{{lh.name}}", "CUDA", "{{variant_str()}}") - call op_timing2_enter("Init") + call op_profile_enter_kernel("{{lh.name}}", "CUDA", "{{variant_str()}}") + call op_profile_enter("Init") - call op_timing2_enter("MPI Exchanges") + call op_profile_enter("MPI Exchanges") n_exec = op_mpi_halo_exchanges_grouped(set%setcptr, size(args), args, 2) if (n_exec == 0) then - call op_timing2_exit() - call op_timing2_exit() + call op_profile_exit() + call op_profile_exit() call op_mpi_wait_all_grouped(size(args), args, 2) {% for arg in lh.args|gbl|reduction %} @@ -407,15 +407,15 @@ subroutine op2_k_{{lh.name}}{{variant}}( & print *, cudaGetErrorString(err) end if - call op_timing2_exit() + call op_profile_exit() return end if - call op_timing2_next("Update consts") + call op_profile_next("Update consts") {% for const in lh.consts %} call op_update_const_cuda_{{const}}(){{"\n" if 
loop.last}} {% endfor %} - call op_timing2_exit() + call op_profile_exit() call setGblIncAtomic(logical({{".true." if config.gbl_inc_atomic else ".false."}}, c_bool)) block_size = getBlockSize(name // c_null_char, set%setptr%size) @@ -438,7 +438,7 @@ subroutine op2_k_{{lh.name}}{{variant}}( & {%- endfor -%} /) - call op_timing2_enter("Plan") + call op_profile_enter("Plan") part_size = getpartitionsize(name // c_null_char, set%setptr%size) plan => fortranplancaller( & @@ -466,12 +466,12 @@ subroutine op2_k_{{lh.name}}{{variant}}( & max_blocks = max(max_blocks, num_blocks) end do - call op_timing2_exit() + call op_profile_exit() {% endif %} - call op_timing2_enter("Prepare GBLs") + call op_profile_enter("Prepare GBLs") call prepareDeviceGbls(args, size(args), block_size * max_blocks) - call op_timing2_exit() + call op_profile_exit() {% for arg in lh.args %} arg{{arg.id}} = args({{loop.index}}) @@ -524,23 +524,23 @@ subroutine op2_k_{{lh.name}}{{variant}}( & {% endfor %} {% if args_gbl_per_thread|length > 0 %} - call op_timing2_enter("Init GBLs") + call op_profile_enter("Init GBLs") {{init_gbls()|indent}} - call op_timing2_exit() + call op_profile_exit() {% endif %} - call op_timing2_next("Computation") + call op_profile_next("Computation") {% if lh is direct %} start = 0 end = set%setptr%size - call op_timing2_enter("Kernel") + call op_profile_enter("Kernel") {{kernel_call("set%setptr%size")|indent}}{{"\n" if lh.args|gbl|reject("read")|list|length > 0}} - call op_timing2_next("Process GBLs") + call op_profile_next("Process GBLs") {{process_gbls()|indent}} - call op_timing2_exit() + call op_profile_exit() {% elif config.atomics %} {% if lh.args|gbl|reduction|length == 0 %} sections = (/0, set%setptr%core_size, set%setptr%size + set%setptr%exec_size, 0/) @@ -548,12 +548,12 @@ subroutine op2_k_{{lh.name}}{{variant}}( & sections = (/0, set%setptr%core_size, set%setptr%size, set%setptr%size + set%setptr%exec_size/) {% endif %} - call op_timing2_enter("Kernel") + call 
op_profile_enter("Kernel") do round = 1, {{"3" if lh.args|gbl|reduction|length > 0 else "2"}} if (round == 2) then - call op_timing2_next("MPI Wait") + call op_profile_next("MPI Wait") call op_mpi_wait_all_grouped(size(args), args, 2) - call op_timing2_next("Kernel") + call op_profile_next("Kernel") end if start = sections(round) @@ -568,21 +568,21 @@ subroutine op2_k_{{lh.name}}{{variant}}( & {% if lh.args|gbl|reject("read")|list|length > 0 %} if (round == 2) then - call op_timing2_next("Process GBLs") + call op_profile_next("Process GBLs") {{process_gbls()|indent(12)}} - call op_timing2_next("Kernel") + call op_profile_next("Kernel") end if {% endif %} end do - call op_timing2_exit() + call op_profile_exit() {% else %} - call op_timing2_enter("Kernel") + call op_profile_enter("Kernel") do col = 1, plan%ncolors if (col == plan%ncolors_core + 1) then - call op_timing2_next("MPI Wait") + call op_profile_next("MPI Wait") call op_mpi_wait_all_grouped(size(args), args, 2) - call op_timing2_next("Kernel") + call op_profile_next("Kernel") end if start = plan_color2_offsets(col) @@ -595,18 +595,18 @@ subroutine op2_k_{{lh.name}}{{variant}}( & {% if lh.args|gbl|reject("read")|list|length > 0 %} if (col == plan%ncolors_owned) then - call op_timing2_next("Process GBLs") + call op_profile_next("Process GBLs") {{process_gbls()|indent(12)}} - call op_timing2_next("Kernel") + call op_profile_next("Kernel") end if {% endif %} end do - call op_timing2_exit() + call op_profile_exit() {% endif %} - call op_timing2_exit() + call op_profile_exit() - call op_timing2_enter("Finalise") + call op_profile_enter("Finalise") {% for arg in lh.args|gbl|reduction %} call op_mpi_reduce_{{type_c(arg)}}(arg{{arg.id}}, arg{{arg.id}}%data) {% endfor %} @@ -618,8 +618,8 @@ subroutine op2_k_{{lh.name}}{{variant}}( & print *, cudaGetErrorString(err) end if - call op_timing2_exit() - call op_timing2_exit() + call op_profile_exit() + call op_profile_exit() end subroutine end module diff --git 
a/translator-v2/resources/templates/fortran/openmp/loop_host.inc.jinja b/translator-v2/resources/templates/fortran/openmp/loop_host.inc.jinja index 0b9d7d786..2acc85dbc 100644 --- a/translator-v2/resources/templates/fortran/openmp/loop_host.inc.jinja +++ b/translator-v2/resources/templates/fortran/openmp/loop_host.inc.jinja @@ -109,9 +109,6 @@ subroutine {{lh.kernel}}_wrapper2( & {{arg.typ}}, dimension({{arg.dim}}) :: info{{arg.id}}{{"\n" if loop.last}} {% endfor %} integer(4) :: start, end -{% for dat in lh.dats if dat.dim is none %} - integer(4) :: dat{{dat.arg_id}}_dim -{% endfor %} ! locals integer(4) :: n @@ -461,8 +458,6 @@ subroutine op2_k_{{lh.name}}{{variant}}( & {% for arg in lh.args|info %} {{arg.typ}}, pointer, dimension(:) :: info{{arg.id}}{{"\n" if loop.last}} {% endfor %} - real(8) :: start_time, end_time - real(4) :: transfer {% for dat in lh.dats if dat.dim is none %} dat{{dat.arg_id}}_dim = arg{{dat.arg_id}}%dim @@ -480,9 +475,13 @@ subroutine op2_k_{{lh.name}}{{variant}}( & {%- endfor -%} /) - call op_timers_core(start_time) + call op_profile_enter_kernel("{{lh.name}}", "openmp", "{{"Direct" if lh is direct else "Indirect"}}") + + call op_profile_enter("MPI Exchanges") set_size = op_mpi_halo_exchanges(set%setcptr, size(args), args) + call op_profile_next("Computation") + {% for dat in lh.dats %} call c_f_pointer(arg{{dat.arg_id}}%data, dat{{dat.id}}, (/{{dat_dim_w(dat)}}, getsetsizefromoparg(arg{{dat.arg_id}})/)) {{-"\n" if loop.last}} @@ -520,6 +519,7 @@ subroutine op2_k_{{lh.name}}{{variant}}( & dats_indirect & ) + call op_profile_next("MPI Wait") if ((set_size .eq. 0) .or. (set_size .eq. 
set%setptr%core_size)) then call op_mpi_wait_all(size(args), args) end if @@ -527,13 +527,10 @@ subroutine op2_k_{{lh.name}}{{variant}}( & {% for arg in lh.args|gbl|reduction %} call op_mpi_reduce_{{arg.typ.__repr__()}}(arg{{arg.id}}, arg{{arg.id}}%data){{"\n" if loop.last}} {% endfor %} - call op_mpi_set_dirtybit(size(args), args) - call op_timers_core(end_time) + call op_profile_exit() - ! todo: review kernel transfer calculation - transfer = 0.0 - - call setkerneltime({{kernel_idx}}, name // c_null_char, end_time - start_time, transfer, 0.0, 1) + call op_mpi_set_dirtybit(size(args), args) + call op_profile_exit() end subroutine end module diff --git a/translator-v2/resources/templates/fortran/seq/loop_host.F90.jinja b/translator-v2/resources/templates/fortran/seq/loop_host.F90.jinja index 7e4aadfca..c7fbea805 100644 --- a/translator-v2/resources/templates/fortran/seq/loop_host.F90.jinja +++ b/translator-v2/resources/templates/fortran/seq/loop_host.F90.jinja @@ -128,9 +128,9 @@ subroutine op2_k_{{lh.name}}_wr( & do n = 1, n_exec {% if lh is indirect %} if (n == set%setptr%core_size + 1) then - call op_timing2_next("MPI Wait") + call op_profile_next("MPI Wait") call op_mpi_wait_all(size(args), args) - call op_timing2_next("Computation") + call op_profile_next("Computation") end if {% endif %} @@ -204,12 +204,12 @@ subroutine op2_k_{{lh.name}}{{variant}}( & args({{arg.id + 1}}) = arg{{arg.id}} {% endfor %} - call op_timing2_enter_kernel("{{lh.name}}", "seq", "{{variant_str()}}") + call op_profile_enter_kernel("{{lh.name}}", "seq", "{{variant_str()}}") - call op_timing2_enter("MPI Exchanges") + call op_profile_enter("MPI Exchanges") n_exec = op_mpi_halo_exchanges(set%setcptr, size(args), args) - call op_timing2_next("Computation") + call op_profile_next("Computation") {% for dat in lh.dats %} call c_f_pointer(arg{{dat.arg_id}}%data, dat{{dat.id}}, (/{{dat_dim(dat)}}, getsetsizefromoparg(arg{{dat.arg_id}})/)) @@ -246,22 +246,22 @@ subroutine 
op2_k_{{lh.name}}{{variant}}( & args & ) - call op_timing2_next("MPI Wait") + call op_profile_next("MPI Wait") if ((n_exec == 0) .or. (n_exec == set%setptr%core_size)) then call op_mpi_wait_all(size(args), args) end if {% if lh.args|gbl|reduction|length > 0 %} - call op_timing2_next("MPI Reduce") + call op_profile_next("MPI Reduce") {% for arg in lh.args|gbl|reduction %} call op_mpi_reduce_{{arg.typ.__repr__()}}(arg{{arg.id}}, arg{{arg.id}}%data){{"\n" if loop.last}} {% endfor %} {% endif %} - call op_timing2_exit() + call op_profile_exit() call op_mpi_set_dirtybit(size(args), args) - call op_timing2_exit() + call op_profile_exit() end subroutine end module