diff --git a/.github/workflows/local_ci.yml b/.github/workflows/local_ci.yml
index 05e47ceef..7de0dd3ac 100644
--- a/.github/workflows/local_ci.yml
+++ b/.github/workflows/local_ci.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Checkout repo
         uses: actions/checkout@v3
         with:
-          ref: OP2_refactor
+          token: ${{ github.token }}
 
       # 2. Debug info ---------------------------------------
       - name: Print runner info
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 845303233..c8b2c61fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -43,7 +43,7 @@ Major changes since `v1.1.0` (high-level).
 - `op_get_global_set_offset` return type changed from `int` to `idx_g_t`.
 - `SafeLong` debug wrapper (`op2/include/SafeLong.h`, `op2/src/core/SafeLong.cpp`): optional arithmetic overflow/underflow checker for `idx_g_t`, enabled via `-DUSE_SAFELONG`.
 - `op_arg_idx` / `op_arg_info` support in C (previously Fortran only); 2-dim map variant added for Fortran.
-- `op_timing2`: improved timing and instrumentation API (`op2/include/op_timing2.h`).
+- **`op_profile`**: tree-based timing and instrumentation API (`op2/include/op_profile.h`), replacing the interim `op_timing2` name. Functions: `op_profile_start`, `op_profile_enter`, `op_profile_enter_kernel`, `op_profile_next`, `op_profile_exit`, `op_profile_end`, `op_profile_output`, `op_profile_output_json`. Controlled by the `OP_PROFILE_LEVEL` (0–3) and `OP_PROFILE_JSON_OUTPUT` environment variables. Fortran bindings are provided with identical names.
 - `op_mpi_probe_halo_index`, `op_force_part`: new MPI utility routines.
 - `op_reset_data_ptr` `real(4)` variants added (Fortran).
 - `op_get_global_set_offset` and Fortran bindings for `op_mpi_get_data`.
diff --git a/CODEBASE_OVERVIEW.md b/CODEBASE_OVERVIEW.md
index 2bd3d31b5..15a5ee77f 100644
--- a/CODEBASE_OVERVIEW.md
+++ b/CODEBASE_OVERVIEW.md
@@ -110,7 +110,7 @@ OP2-Common/
 | `op_mpi_core.h` | MPI halo data structures: `halo_list`, import/export lists, MPI comms |
 | `op_lib_mpi.h` | MPI runtime state: exec/non-exec halo lists, partition tables |
 | `op_hdf5.h` | Parallel HDF5 I/O API |
-| `op_timing2.h` | Tree-based timing instrumentation (JSON output, 4 detail levels) |
+| `op_profile.h` | Tree-based timing instrumentation (JSON output, 4 detail levels) |
 | `op_util.h` | Utility functions |
 | `SafeLong.h` | Debug wrapper type `SafeLong` for `idx_g_t` — detects integer overflow/underflow at runtime (enabled via `-DUSE_SAFELONG`) |
 | `fortran/` | Fortran C-interop headers |
diff --git a/apps/c/aero/aero_hdf5/aero.cpp b/apps/c/aero/aero_hdf5/aero.cpp
index db75acfcc..f8db3639c 100644
--- a/apps/c/aero/aero_hdf5/aero.cpp
+++ b/apps/c/aero/aero_hdf5/aero.cpp
@@ -46,6 +46,7 @@ double gm1, gm1i, wtg1[2], xi1[2], Ng1[4], Ng1_xi[4], wtg2[4], Ng2[16],
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -202,8 +203,7 @@ int main(int argc, char **argv) {
   ncell = op_get_size(cells);
   nbnodes = op_get_size(bnodes);
 
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Aero");
 
   // main time-marching loop
 
@@ -324,8 +324,7 @@ int main(int argc, char **argv) {
     }
   }
 
-  op_timing_output();
-  op_timers(&cpu_t2, &wall_t2);
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_end();
+  op_profile_output();
   op_exit();
 }
diff --git a/apps/c/aero/aero_plain/aero.cpp b/apps/c/aero/aero_plain/aero.cpp
index 7404121c9..7e46ce7cb 100644
--- a/apps/c/aero/aero_plain/aero.cpp
+++ b/apps/c/aero/aero_plain/aero.cpp
@@ -46,6 +46,7 @@ double gm1, gm1i, wtg1[2], xi1[2], Ng1[4], Ng1_xi[4], wtg2[4], Ng2[16],
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -263,8 +264,7 @@ int main(int argc, char **argv) {
 
   op_diagnostic_output();
 
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Aero");
 
   // main fixpoint iteration loop
 
@@ -387,8 +387,7 @@ int main(int argc, char **argv) {
     }
   }
 
-  op_timers(&cpu_t2, &wall_t2);
-  op_timing_output();
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_end();
+  op_profile_output();
   op_exit();
 }
diff --git a/apps/c/aero/aero_plain/aero_mpi.cpp b/apps/c/aero/aero_plain/aero_mpi.cpp
index af8fbddb1..1de71bffd 100644
--- a/apps/c/aero/aero_plain/aero_mpi.cpp
+++ b/apps/c/aero/aero_plain/aero_mpi.cpp
@@ -53,6 +53,7 @@ double gm1, gm1i, wtg1[2], xi1[2], Ng1[4], Ng1_xi[4], wtg2[4], Ng2[16],
 
 #include "op_lib_mpi.h"
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -147,7 +148,6 @@ int main(int argc, char **argv) {
   MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   int *bnode, *cell, *g_bnode, *g_cell;
   double *xm, *g_xm;
@@ -369,7 +369,7 @@ int main(int argc, char **argv) {
 
   niter = 20;
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Aero");
   for (int iter = 1; iter <= niter; iter++) {
 
     op_par_loop(res_calc, "res_calc", cells,
@@ -486,8 +486,7 @@ int main(int argc, char **argv) {
       }
     }
   }
-  op_timers(&cpu_t2, &wall_t2);
-  op_timing_output();
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_end();
+  op_profile_output();
   op_exit();
 }
diff --git a/apps/c/airfoil/airfoil_hdf5/dp/airfoil.cpp b/apps/c/airfoil/airfoil_hdf5/dp/airfoil.cpp
index 161526bcf..182bad42a 100644
--- a/apps/c/airfoil/airfoil_hdf5/dp/airfoil.cpp
+++ b/apps/c/airfoil/airfoil_hdf5/dp/airfoil.cpp
@@ -55,6 +55,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4];
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -83,7 +84,6 @@ int main(int argc, char **argv) {
   double rms;
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   // set constants and initialise flow field and residual
   op_printf("initialising flow field \n");
@@ -158,7 +158,7 @@ int main(int argc, char **argv) {
   int g_ncell = op_get_size(cells);
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   // main time-marching loop
 
@@ -240,7 +240,7 @@ int main(int argc, char **argv) {
     }
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   // write given op_dat's indicated segment of data to a memory block in the
   // order it was originally
@@ -262,7 +262,6 @@ int main(int argc, char **argv) {
   // compress using
   // ~/hdf5/bin/h5repack -f GZIP=9 new_grid.h5 new_grid_pack.h5
 
-  op_timing_output();
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_output();
   op_exit();
 }
diff --git a/apps/c/airfoil/airfoil_hdf5/sp/airfoil.cpp b/apps/c/airfoil/airfoil_hdf5/sp/airfoil.cpp
index bad85a2a5..e8350d030 100644
--- a/apps/c/airfoil/airfoil_hdf5/sp/airfoil.cpp
+++ b/apps/c/airfoil/airfoil_hdf5/sp/airfoil.cpp
@@ -55,6 +55,7 @@ float gam, gm1, cfl, eps, mach, alpha, qinf[4];
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -76,7 +77,6 @@ int main(int argc, char **argv) {
   float rms;
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   // set constants and initialise flow field and residual
   op_printf("initialising flow field \n");
@@ -129,7 +129,7 @@ int main(int argc, char **argv) {
   int g_ncell = op_get_size(cells);
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   // main time-marching loop
 
@@ -210,9 +210,8 @@ int main(int argc, char **argv) {
     }
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
-  op_timing_output();
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_output();
   op_exit();
 }
diff --git a/apps/c/airfoil/airfoil_plain/dp/airfoil.cpp b/apps/c/airfoil/airfoil_plain/dp/airfoil.cpp
index 6353a942c..1ff154caa 100644
--- a/apps/c/airfoil/airfoil_plain/dp/airfoil.cpp
+++ b/apps/c/airfoil/airfoil_plain/dp/airfoil.cpp
@@ -55,6 +55,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4];
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -79,7 +80,6 @@ int main(int argc, char **argv) {
   double rms, maxerr;
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   // read in grid
 
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
   op_diagnostic_output();
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   // main time-marching loop
 
@@ -296,7 +296,7 @@ int main(int argc, char **argv) {
     }
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   // output the result dat array to files
   op_print_dat_to_txtfile(p_q, "out_grid_seq.dat"); // ASCI
@@ -309,8 +309,7 @@ int main(int argc, char **argv) {
   op_fetch_data_idx(p_q, q_part, 0, op_get_size(cells) - 1);
   free(q_part);
 
-  op_timing_output();
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_output();
 
   op_exit();
 }
diff --git a/apps/c/airfoil/airfoil_plain/dp/airfoil_mpi.cpp b/apps/c/airfoil/airfoil_plain/dp/airfoil_mpi.cpp
index 5564a18eb..102259a6c 100644
--- a/apps/c/airfoil/airfoil_plain/dp/airfoil_mpi.cpp
+++ b/apps/c/airfoil/airfoil_plain/dp/airfoil_mpi.cpp
@@ -63,6 +63,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4];
 
 #include "op_lib_mpi.h"
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -154,7 +155,6 @@ int main(int argc, char **argv) {
   MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   int *becell, *ecell, *bound, *bedge, *edge, *cell;
   double *x, *q, *qold, *adt, *res;
@@ -164,7 +164,7 @@ int main(int argc, char **argv) {
 
   /**------------------------BEGIN I/O and PARTITIONING -------------------**/
 
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   /* read in grid from disk on root processor */
   FILE *fp;
@@ -310,9 +310,6 @@ int main(int argc, char **argv) {
     free(g_res);
   }
 
-  op_timers(&cpu_t2, &wall_t2);
-  op_printf("Max total file read time = %f\n", wall_t2 - wall_t1);
-
   /**------------------------END I/O and PARTITIONING -----------------------**/
 
   // declare sets, pointers, datasets and global constants
@@ -361,7 +358,6 @@ int main(int argc, char **argv) {
   op_partition("PARMETIS", "KWAY", cells, pecell, p_x);
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
 
   niter = 1000;
   for (int iter = 1; iter <= niter; iter++) {
@@ -440,7 +436,7 @@ int main(int argc, char **argv) {
     }
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   // output the result dat array to files
   op_print_dat_to_txtfile(p_q, "out_grid_mpi.dat"); // ASCI
@@ -453,8 +449,7 @@ int main(int argc, char **argv) {
   op_fetch_data_idx(p_q, q_part, 0, op_get_size(cells) - 1);
   free(q_part);
 
-  op_timing_output();
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_output();
 
   op_exit();
 }
diff --git a/apps/c/airfoil/airfoil_plain/sp/airfoil.cpp b/apps/c/airfoil/airfoil_plain/sp/airfoil.cpp
index 67f287ee3..d96dbe43f 100644
--- a/apps/c/airfoil/airfoil_plain/sp/airfoil.cpp
+++ b/apps/c/airfoil/airfoil_plain/sp/airfoil.cpp
@@ -55,6 +55,7 @@ float gam, gm1, cfl, eps, mach, alpha, qinf[4];
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -79,7 +80,6 @@ int main(int argc, char **argv) {
   float rms;
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   // read in grid
 
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
   op_diagnostic_output();
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   // main time-marching loop
 
@@ -291,9 +291,8 @@ int main(int argc, char **argv) {
     }
   }
 
-  op_timers(&cpu_t2, &wall_t2);
-  op_timing_output();
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_end();
+  op_profile_output();
 
   op_exit();
 }
diff --git a/apps/c/airfoil/airfoil_plain/sp/airfoil_mpi.cpp b/apps/c/airfoil/airfoil_plain/sp/airfoil_mpi.cpp
index 3561a8196..8af10ea67 100644
--- a/apps/c/airfoil/airfoil_plain/sp/airfoil_mpi.cpp
+++ b/apps/c/airfoil/airfoil_plain/sp/airfoil_mpi.cpp
@@ -63,6 +63,7 @@ float gam, gm1, cfl, eps, mach, alpha, qinf[4];
 
 #include "op_lib_mpi.h"
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -153,7 +154,6 @@ int main(int argc, char **argv) {
   MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   int *becell, *ecell, *bound, *bedge, *edge, *cell;
   float *x, *q, *qold, *adt, *res;
@@ -163,7 +163,7 @@ int main(int argc, char **argv) {
 
   /**------------------------BEGIN I/O and PARTITIONING -------------------**/
 
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   /* read in grid from disk on root processor */
   FILE *fp;
@@ -309,9 +309,6 @@ int main(int argc, char **argv) {
     free(g_res);
   }
 
-  op_timers(&cpu_t2, &wall_t2);
-  op_printf("Max total file read time = %f\n", wall_t2 - wall_t1);
-
   /**------------------------END I/O and PARTITIONING -----------------------**/
 
   // declare sets, pointers, datasets and global constants
@@ -359,7 +356,6 @@ int main(int argc, char **argv) {
   op_partition("PTSCOTCH", "KWAY", NULL, pecell, p_x);
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
 
   niter = 1000;
   for (int iter = 1; iter <= niter; iter++) {
@@ -432,7 +428,7 @@ int main(int argc, char **argv) {
     }
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   // get results data array - perhaps can be later handled by a remporary dat
   // op_dat temp = op_mpi_get_data(p_q);
@@ -441,9 +437,8 @@ int main(int argc, char **argv) {
   // print_dat_tofile(temp, "out_grid.dat"); //ASCI
   // print_dat_tobinfile(temp, "out_grid.bin"); //Binary
 
-  op_timing_output();
+  op_profile_output();
 
   // print total time for niter interations
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
   op_exit();
 }
diff --git a/apps/c/airfoil/airfoil_tempdats/dp/airfoil.cpp b/apps/c/airfoil/airfoil_tempdats/dp/airfoil.cpp
index f4d7f39c3..e1e970fc9 100644
--- a/apps/c/airfoil/airfoil_tempdats/dp/airfoil.cpp
+++ b/apps/c/airfoil/airfoil_tempdats/dp/airfoil.cpp
@@ -55,6 +55,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4];
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -79,7 +80,6 @@ int main(int argc, char **argv) {
   double rms;
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   // read in grid
 
@@ -214,7 +214,7 @@ int main(int argc, char **argv) {
   double g_ncell = op_get_size(cells);
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   // main time-marching loop
 
@@ -306,9 +306,8 @@ int main(int argc, char **argv) {
       op_printf("Error: temporary op_dat %s cannot be removed\n", p_qold->name);
   }
 
-  op_timers(&cpu_t2, &wall_t2);
-  op_timing_output();
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_end();
+  op_profile_output();
 
   op_exit();
 }
diff --git a/apps/c/airfoil/airfoil_tempdats/dp/airfoil_mpi.cpp b/apps/c/airfoil/airfoil_tempdats/dp/airfoil_mpi.cpp
index 50558a8e0..1c16b9152 100644
--- a/apps/c/airfoil/airfoil_tempdats/dp/airfoil_mpi.cpp
+++ b/apps/c/airfoil/airfoil_tempdats/dp/airfoil_mpi.cpp
@@ -63,6 +63,7 @@ double gam, gm1, cfl, eps, mach, alpha, qinf[4];
 
 #include "op_lib_mpi.h"
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -154,7 +155,6 @@ int main(int argc, char **argv) {
   MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   int *becell, *ecell, *bound, *bedge, *edge, *cell;
   double *x, *q, *qold, *adt, *res;
@@ -164,7 +164,7 @@ int main(int argc, char **argv) {
 
   /**------------------------BEGIN I/O and PARTITIONING -------------------**/
 
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   /* read in grid from disk on root processor */
   FILE *fp;
@@ -310,9 +310,6 @@ int main(int argc, char **argv) {
     free(g_res);
   }
 
-  op_timers(&cpu_t2, &wall_t2);
-  op_printf("Max total file read time = %f\n", wall_t2 - wall_t1);
-
   /**------------------------END I/O and PARTITIONING -----------------------**/
 
   // declare sets, pointers, datasets and global constants
@@ -360,7 +357,6 @@ int main(int argc, char **argv) {
   op_partition("PTSCOTCH", "KWAY", cells, pecell, p_x);
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
 
   niter = 1000;
   for (int iter = 1; iter <= niter; iter++) {
@@ -446,10 +442,9 @@ int main(int argc, char **argv) {
       op_printf("Error: temporary op_dat %s cannot be removed\n", p_qold->name);
   }
 
-  op_timers(&cpu_t2, &wall_t2);
-  op_timing_output();
+  op_profile_end();
+  op_profile_output();
 
   // print total time for niter interations
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
   op_exit();
 }
diff --git a/apps/c/airfoil/airfoil_tutorial/final/airfoil.cpp b/apps/c/airfoil/airfoil_tutorial/final/airfoil.cpp
index fa8e02762..4fb3e1022 100644
--- a/apps/c/airfoil/airfoil_tutorial/final/airfoil.cpp
+++ b/apps/c/airfoil/airfoil_tutorial/final/airfoil.cpp
@@ -4,6 +4,7 @@
 #include <sys/time.h>
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 /* Problem mesh and iterations */
 #define FILE_NAME_PATH "new_grid.h5"
@@ -33,7 +34,6 @@ int main(int argc, char **argv) {
   double rms;
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   // Load unstructured mesh
   op_printf("***** Load mesh and initialization *****\n");
@@ -86,7 +86,7 @@ int main(int argc, char **argv) {
   op_partition("BLOCK", "ANY", edges, pecell, p_x);
 
   //start timer
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   // main time-marching loop
   op_printf("***** Start Main iteration *************\n");
@@ -158,12 +158,8 @@ int main(int argc, char **argv) {
   }
 
   //end timer
-  op_timers(&cpu_t2, &wall_t2);
-
-  // compute and print wall time
-  double walltime = wall_t2 - wall_t1;
-
-  op_printf(" Wall time %lf \n", walltime);
+  op_profile_end();
+  op_profile_output();
 
   //Finalising the OP2 library
   op_exit();
diff --git a/apps/c/airfoil/airfoil_tutorial/step6/airfoil_step6.cpp b/apps/c/airfoil/airfoil_tutorial/step6/airfoil_step6.cpp
index e7e64af1e..4cbdbb17f 100644
--- a/apps/c/airfoil/airfoil_tutorial/step6/airfoil_step6.cpp
+++ b/apps/c/airfoil/airfoil_tutorial/step6/airfoil_step6.cpp
@@ -4,6 +4,7 @@
 #include <sys/time.h>
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 /* Problem mesh and iterations */
 #define FILE_NAME_PATH "new_grid.h5"
@@ -14,7 +15,7 @@
 double gam, gm1, cfl, eps, mach, alpha, qinf[4];
 
 /* wall timer routine */
-// Now done using OP2's internal op_timers() call
+// Now done using OP2's internal op_profile API
 
 //outlined elemental kernel - save_soln
 inline void save_soln(const double *q, double *qold) {
@@ -150,7 +151,6 @@ int main(int argc, char **argv) {
   double rms;
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   // Load unstructured mesh
   op_printf("***** Load mesh and initialization *****\n");
@@ -203,7 +203,7 @@ int main(int argc, char **argv) {
   op_partition("BLOCK", "ANY", edges, pecell, p_x);
 
   //start timer
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   // main time-marching loop
   op_printf("***** Start Main iteration *************\n");
@@ -275,12 +275,8 @@ int main(int argc, char **argv) {
   }
 
   //end timer
-  op_timers(&cpu_t2, &wall_t2);
-
-  // compute and print wall time
-  double walltime = wall_t2 - wall_t1;
-
-  op_printf(" Wall time %lf \n", walltime);
+  op_profile_end();
+  op_profile_output();
 
   //Finalising the OP2 library
   op_exit();
diff --git a/apps/c/airfoil/airfoil_tutorial/step7/airfoil_step7.cpp b/apps/c/airfoil/airfoil_tutorial/step7/airfoil_step7.cpp
index fa8e02762..4fb3e1022 100644
--- a/apps/c/airfoil/airfoil_tutorial/step7/airfoil_step7.cpp
+++ b/apps/c/airfoil/airfoil_tutorial/step7/airfoil_step7.cpp
@@ -4,6 +4,7 @@
 #include <sys/time.h>
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 /* Problem mesh and iterations */
 #define FILE_NAME_PATH "new_grid.h5"
@@ -33,7 +34,6 @@ int main(int argc, char **argv) {
   double rms;
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   // Load unstructured mesh
   op_printf("***** Load mesh and initialization *****\n");
@@ -86,7 +86,7 @@ int main(int argc, char **argv) {
   op_partition("BLOCK", "ANY", edges, pecell, p_x);
 
   //start timer
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Airfoil");
 
   // main time-marching loop
   op_printf("***** Start Main iteration *************\n");
@@ -158,12 +158,8 @@ int main(int argc, char **argv) {
   }
 
   //end timer
-  op_timers(&cpu_t2, &wall_t2);
-
-  // compute and print wall time
-  double walltime = wall_t2 - wall_t1;
-
-  op_printf(" Wall time %lf \n", walltime);
+  op_profile_end();
+  op_profile_output();
 
   //Finalising the OP2 library
   op_exit();
diff --git a/apps/c/jac1/dp/jac.cpp b/apps/c/jac1/dp/jac.cpp
index 8f5c52057..af84817cc 100644
--- a/apps/c/jac1/dp/jac.cpp
+++ b/apps/c/jac1/dp/jac.cpp
@@ -56,6 +56,7 @@ double alpha;
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -80,7 +81,6 @@ int main(int argc, char **argv) {
   op_init(argc, argv, 5);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   int nnode, nedge, n, e;
 
@@ -153,7 +153,7 @@ int main(int argc, char **argv) {
   op_diagnostic_output();
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("JAC");
 
   // main iteration loop
 
@@ -181,7 +181,7 @@ int main(int argc, char **argv) {
     op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / nnode));
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   // print out results
   op_printf("\n  Results after %d iterations:\n\n", NITER);
@@ -203,10 +203,9 @@ int main(int argc, char **argv) {
     op_printf("\n");
   }
 
-  op_timing_output();
+  op_profile_output();
 
   // print total time for niter interations
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
 
   int result = check_result<double>(u, NN, TOLERANCE);
   op_exit();
diff --git a/apps/c/jac1/dp/jac_mpi.cpp b/apps/c/jac1/dp/jac_mpi.cpp
index 9fea6854a..77cd9e620 100644
--- a/apps/c/jac1/dp/jac_mpi.cpp
+++ b/apps/c/jac1/dp/jac_mpi.cpp
@@ -63,6 +63,7 @@ double alpha;
 
 #include "op_lib_mpi.h"
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -151,7 +152,6 @@ int main(int argc, char **argv) {
   MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   int *pp;
   double *A, *r, *u, *du;
@@ -274,7 +274,7 @@ int main(int argc, char **argv) {
   op_partition("PARMETIS", "KWAY", edges, ppedge, NULL);
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("JAC");
 
   // main iteration loop
 
@@ -304,7 +304,7 @@ int main(int argc, char **argv) {
     op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / g_nnode));
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   // get results data array
   op_fetch_data(p_u, u);
@@ -321,10 +321,9 @@ int main(int argc, char **argv) {
   printf("\n");
 
   // print each mpi process's timing info for each kernel
-  op_timing_output();
+  op_profile_output();
 
   // print total time for niter interations
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
 
   // gather results from all ranks and check
   double *ug = (double *)malloc(sizeof(double) * op_get_size(nodes));
diff --git a/apps/c/jac1/longint/jac_mpi.cpp b/apps/c/jac1/longint/jac_mpi.cpp
index eae5cd361..8cd312f46 100644
--- a/apps/c/jac1/longint/jac_mpi.cpp
+++ b/apps/c/jac1/longint/jac_mpi.cpp
@@ -59,6 +59,7 @@ double alpha;
 
 #include "op_lib_mpi.h"
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -185,7 +186,6 @@ int main(int argc, char **argv) {
   MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   idx_g_t *pp;
   double *A, *r, *u, *du;
@@ -330,7 +330,7 @@ remains consistent. */
   op_partition("PARMETIS", "KWAY", edges, ppedge, p_coords);
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("JAC");
 
   // main iteration loop
 
@@ -360,7 +360,7 @@ remains consistent. */
     op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / (double)(size_t)g_nnode));
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   // output the result dat array to files
   // op_print_dat_to_txtfile(p_u, "out_grid_mpi.dat"); // ASCI
@@ -374,10 +374,9 @@ remains consistent. */
   // printf("\n");
 
   // print each mpi process's timing info for each kernel
-  op_timing_output();
+  op_profile_output();
 
   // print total time for niter interations
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
 
   // fetch local results
   op_fetch_data(p_u, u);
diff --git a/apps/c/jac1/sp/jac.cpp b/apps/c/jac1/sp/jac.cpp
index 9551fb290..6deb4b7a2 100644
--- a/apps/c/jac1/sp/jac.cpp
+++ b/apps/c/jac1/sp/jac.cpp
@@ -56,6 +56,7 @@ float alpha;
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -80,7 +81,6 @@ int main(int argc, char **argv) {
   op_init(argc, argv, 5);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   int nnode, nedge, n, e;
 
@@ -153,7 +153,7 @@ int main(int argc, char **argv) {
   op_diagnostic_output();
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("JAC");
 
   // main iteration loop
 
@@ -177,7 +177,7 @@ int main(int argc, char **argv) {
     op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / nnode));
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   // print out results
 
@@ -200,10 +200,9 @@ int main(int argc, char **argv) {
     op_printf("\n");
   }
 
-  op_timing_output();
+  op_profile_output();
 
   // print total time for niter interations
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
 
   int result = check_result<float>(u, NN, TOLERANCE);
   op_exit();
diff --git a/apps/c/jac1/sp/jac_mpi.cpp b/apps/c/jac1/sp/jac_mpi.cpp
index 575daade7..34e76b9cf 100644
--- a/apps/c/jac1/sp/jac_mpi.cpp
+++ b/apps/c/jac1/sp/jac_mpi.cpp
@@ -63,6 +63,7 @@ float alpha;
 
 #include "op_lib_mpi.h"
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -150,7 +151,6 @@ int main(int argc, char **argv) {
   MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   int *pp;
   float *A, *r, *u, *du;
@@ -273,7 +273,7 @@ int main(int argc, char **argv) {
   op_partition("PTSCOTCH", "KWAY", NULL, NULL, NULL);
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("JAC");
 
   // main iteration loop
 
@@ -298,7 +298,7 @@ int main(int argc, char **argv) {
     op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / g_nnode));
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   // get results data array
   op_fetch_data(p_u, u);
@@ -315,10 +315,9 @@ int main(int argc, char **argv) {
   printf("\n");
 
   // print each mpi process's timing info for each kernel
-  op_timing_output();
+  op_profile_output();
 
   // print total time for niter interations
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
 
   // gather results from all ranks and check
   float *ug = (float *)malloc(sizeof(float) * op_get_size(nodes));
diff --git a/apps/c/jac2/jac.cpp b/apps/c/jac2/jac.cpp
index f7a2cc918..aeed884a8 100644
--- a/apps/c/jac2/jac.cpp
+++ b/apps/c/jac2/jac.cpp
@@ -52,6 +52,7 @@ float alpha;
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 // jac header file
 
@@ -81,7 +82,6 @@ int main(int argc, char **argv) {
   op_init(argc, argv, 5);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   int nnode, nedge, n, e;
   float dx;
@@ -171,7 +171,7 @@ int main(int argc, char **argv) {
   op_diagnostic_output();
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("JAC2");
 
   // main iteration loop
 
@@ -195,7 +195,7 @@ int main(int argc, char **argv) {
     op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / nnode));
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   // print out results
   op_printf("\n  Results after %d iterations:\n\n", NITER);
@@ -218,10 +218,9 @@ int main(int argc, char **argv) {
     op_printf("\n");
   }
 
-  op_timing_output();
+  op_profile_output();
 
   // print total time for niter interations
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
 
   int result = check_result<float>(u, NN, TOLERANCE);
   op_exit();
diff --git a/apps/c/jac2/jac_mpi.cpp b/apps/c/jac2/jac_mpi.cpp
index d8941a611..21c9431e4 100644
--- a/apps/c/jac2/jac_mpi.cpp
+++ b/apps/c/jac2/jac_mpi.cpp
@@ -57,6 +57,7 @@ float alpha;
 
 #include "op_lib_mpi.h"
 #include "op_seq.h"
+#include <op_profile.h>
 
 // jac header file
 
@@ -171,7 +172,6 @@ int main(int argc, char **argv) {
   MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   /**------------------------BEGIN I/O and PARTITIONING ---------------------**/
   int g_nnode, g_nedge, g_n, g_e;
@@ -298,7 +298,7 @@ int main(int argc, char **argv) {
   op_partition("PTSCOTCH", "KWAY", NULL, NULL, NULL);
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("JAC2");
 
   // main iteration loop
 
@@ -322,15 +322,14 @@ int main(int argc, char **argv) {
     op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / nnode));
   }
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
   op_fetch_data(p_u, u);
   op_print_dat_to_txtfile(p_u, "out_grid_mpi.dat");
 
-  op_timing_output();
+  op_profile_output();
 
   // print total time for niter interations
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
 
   float *ug = (float *)malloc(sizeof(float) * op_get_size(nodes) * 2);
   op_fetch_data_idx(p_u, ug, 0, op_get_size(nodes) - 1);
diff --git a/apps/c/reduction/reduction.cpp b/apps/c/reduction/reduction.cpp
index f3d3f804f..5eebfe9fe 100644
--- a/apps/c/reduction/reduction.cpp
+++ b/apps/c/reduction/reduction.cpp
@@ -48,6 +48,7 @@
 //
 
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -68,7 +69,6 @@ int main(int argc, char **argv) {
   int nnode, ncell, nedge, nbedge;
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   // read in airfoil grid
 
@@ -145,7 +145,7 @@ int main(int argc, char **argv) {
   op_diagnostic_output();
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Reduction");
 
   // indirect reduction
   count1 = 0;
@@ -173,10 +173,8 @@ int main(int argc, char **argv) {
   else
     op_printf("Reduction application FAILED\n");
 
-  op_timers(&cpu_t2, &wall_t2);
-  op_timing_output();
-
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_end();
+  op_profile_output();
 
   op_exit();
 
diff --git a/apps/c/reduction/reduction_mpi.cpp b/apps/c/reduction/reduction_mpi.cpp
index 7918bca4d..eeb2ba44f 100644
--- a/apps/c/reduction/reduction_mpi.cpp
+++ b/apps/c/reduction/reduction_mpi.cpp
@@ -51,6 +51,7 @@
 
 #include "op_lib_mpi.h"
 #include "op_seq.h"
+#include <op_profile.h>
 
 //
 // kernel routines for parallel loops
@@ -139,7 +140,6 @@ int main(int argc, char **argv) {
   MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
 
   // timer
-  double cpu_t1, cpu_t2, wall_t1, wall_t2;
 
   int *becell, *ecell, *bound, *bedge, *edge, *cell;
   double *x, *q, *qold, *adt, *res;
@@ -148,7 +148,7 @@ int main(int argc, char **argv) {
 
   /**------------------------BEGIN I/O and PARTITIONING -------------------**/
 
-  op_timers(&cpu_t1, &wall_t1);
+  op_profile_start("Reduction");
 
   /* read in grid from disk on root processor */
   FILE *fp;
@@ -267,9 +267,6 @@ int main(int argc, char **argv) {
     free(g_res);
   }
 
-  op_timers(&cpu_t2, &wall_t2);
-  op_printf("Max total file read time = %f\n", wall_t2 - wall_t1);
-
   /**------------------------END I/O and PARTITIONING -----------------------**/
 
   op_set edges = op_decl_set(nedge, "edges");
@@ -286,7 +283,6 @@ int main(int argc, char **argv) {
   op_diagnostic_output();
 
   // initialise timers for total execution wall time
-  op_timers(&cpu_t1, &wall_t1);
 
   // indirect reduction
   count = 0;
@@ -309,10 +305,9 @@ int main(int argc, char **argv) {
   else
     op_printf("direct reduction PASSED\n");
 
-  op_timers(&cpu_t2, &wall_t2);
+  op_profile_end();
 
-  op_timing_output();
-  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
+  op_profile_output();
 
   op_exit();
 
diff --git a/apps/fortran/airfoil/airfoil.F90 b/apps/fortran/airfoil/airfoil.F90
index db2d47fba..d7d160fae 100644
--- a/apps/fortran/airfoil/airfoil.F90
+++ b/apps/fortran/airfoil/airfoil.F90
@@ -63,7 +63,7 @@ program airfoil
 #endif
 
     call op_init_base(0, 0)
-    call op_timing2_start("Airfoil")
+    call op_profile_start("Airfoil")
 
 #ifdef HDF5
     call op_print("Declaring OP2 sets (HDF5)")
@@ -121,7 +121,7 @@ program airfoil
     call op_decl_const(qinf, 4, "real(8)")
 
     call op_partition("PARMETIS", "KWAY", edges, pecell, p_x)
-    call op_timing2_enter("Main computation")
+    call op_profile_enter("Main computation")
 
     call op_decl_dat_temp(cells, 4, "real(8)", p_res, "p_res")
 
@@ -184,10 +184,10 @@ program airfoil
 
     iter = op_free_dat_temp(p_res)
 
-    call op_timing2_finish()
+    call op_profile_end()
 
     if (op_is_root() == 1) print *
-    call op_timing2_output()
+    call op_profile_output()
 
     if (op_is_root() == 1 .and. niter == 1000 .and. ncell_total == 720000) then
         diff = abs((100.0_8 * (rms(2) / 0.0001060114637578_8)) - 100.0_8)
diff --git a/apps/fortran/jac1/jac.F90 b/apps/fortran/jac1/jac.F90
index 5ff5f4985..33fe0b7ac 100644
--- a/apps/fortran/jac1/jac.F90
+++ b/apps/fortran/jac1/jac.F90
@@ -58,7 +58,7 @@ program jac
     alpha = 1.0
 
     call op_decl_const(alpha, 1, "real(8)")
-    call op_timing2_start("JAC")
+    call op_profile_start("JAC")
 
     beta = 1.0
 
@@ -82,10 +82,10 @@ program jac
         write (*, "(1X, A, F7.4, A, F10.8)") "u max = ", u_max, "; u rms = ", sqrt(u_sum / nnode)
     end do
 
-    call op_timing2_finish()
+    call op_profile_end()
 
     print *
-    call op_timing2_output()
+    call op_profile_output()
 
     allocate(u(nnode))
     call op_fetch_data(p_u, u)
diff --git a/apps/fortran/jac1_long/jac1_mpi.F90 b/apps/fortran/jac1_long/jac1_mpi.F90
index 3182e53dc..9c19bb380 100644
--- a/apps/fortran/jac1_long/jac1_mpi.F90
+++ b/apps/fortran/jac1_long/jac1_mpi.F90
@@ -191,8 +191,8 @@ program jac_distributed
   !--------------------------------------------------------------------------
   ! 5. Main Iteration Loop
   !--------------------------------------------------------------------------
-  call op_timing2_start("Jacobi")
-  call op_timing2_enter("Main computation") ! Start timing after setup/partitioning
+  call op_profile_start("Jacobi")
+  call op_profile_enter("Main computation") ! Start timing after setup/partitioning
 
   beta = 1.0_8
 
@@ -227,12 +227,12 @@ program jac_distributed
      end if
   end do
 
-  call op_timing2_finish() ! Stop timing
+  call op_profile_end() ! Stop timing
 
   !--------------------------------------------------------------------------
   ! 6. Output Timings and Fetch Results
   !--------------------------------------------------------------------------
-  call op_timing2_output()
+  call op_profile_output()
 
   ! Re-allocate u if it was deallocated earlier, or just use the existing one
   ! Ensure 'u' is allocated with the correct *local* size 'nnode'
diff --git a/apps/fortran/reduction/reduction.F90 b/apps/fortran/reduction/reduction.F90
index 32fac0cec..3709e641b 100644
--- a/apps/fortran/reduction/reduction.F90
+++ b/apps/fortran/reduction/reduction.F90
@@ -29,8 +29,6 @@ program reduction
     type(op_map) :: pecell
     type(op_dat) :: p_res, p_dummy
 
-    real(kind = c_double) :: start_time, end_time
-
     integer(4) :: i, cell_count_result, edge_count_result
 
 #ifndef HDF5
@@ -41,7 +39,7 @@ program reduction
 #endif
 
     call op_init_base(0, 0)
-    call op_timing2_start("Reduction")
+    call op_profile_start("Reduction")
 
 #ifndef HDF5
     open(file_id, file = file_name)
@@ -90,7 +88,6 @@ program reduction
 #endif
 
     call op_partition("PTSCOTCH", "KWAY", edges, pecell, p_dummy)
-    call op_timers(start_time)
 
     ncell_total = op_get_size(cells)
     nedge_total = op_get_size(edges)
@@ -106,8 +103,6 @@ program reduction
         op_arg_dat(p_res, 1, pecell, 4, "real(8)", OP_RW), &
         op_arg_gbl(edge_count_result, 1, "integer(4)", OP_INC))
 
-    call op_timers(end_time)
-    call op_timing_output()
 
     if (op_is_root() == 1) then
         print *
@@ -122,13 +117,12 @@ program reduction
         end if
 
         print *
-        print *, 'Time = ', end_time - start_time, 'seconds'
     end if
 
-    call op_timing2_finish()
+    call op_profile_end()
 
     if (op_is_root() == 1) print *
-    call op_timing2_output()
+    call op_profile_output()
 
     call op_exit()
 
diff --git a/docs/api.rst b/docs/api.rst
index 96958cdaa..95e20e1b4 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -445,6 +445,67 @@ Other I/O and Utilities
 
    :param file_name: The name of the CSV file to write.
 
+Profiling API
+^^^^^^^^^^^^^
+
+The ``op_profile`` API (``op2/include/op_profile.h``) provides a tree-based timing and instrumentation facility for OP2 applications. It replaces the older ``op_timers`` / ``op_timing_output`` pattern with a structured, hierarchical timing tree that can be printed to stdout or exported to JSON.
+
+The translator-generated kernel code automatically inserts ``op_profile_enter_kernel`` calls; application code uses the simpler ``op_profile_start`` / ``op_profile_enter`` / ``op_profile_end`` functions to time outer sections.
+
+Typical usage::
+
+   op_profile_start("MyApp");     // initialise, name the root
+   op_profile_enter("Setup");     // begin a named section
+   /* ... setup work ... */
+   op_profile_next("Computation"); // exit "Setup", enter "Computation"
+   /* ... main loop ... */
+   op_profile_end();              // close all open sections
+   op_profile_output();           // print summary; write JSON if OP_PROFILE_JSON_OUTPUT is set
+
+.. c:function:: void op_profile_start(const char *name)
+
+   Initialise the profiling system and set the root application name. Must be called before any other ``op_profile_*`` function. Reads ``OP_PROFILE_LEVEL`` from the environment at this point.
+
+   :param name: A descriptive name for the application (used as the root label in the timing tree).
+
+.. c:function:: void op_profile_enter(const char *name)
+
+   Begin a named timing section, pushing it onto the timing stack. Device synchronisation is performed before the timer starts (for accurate GPU timings).
+
+   :param name: A descriptive name for the section.
+
+.. c:function:: void op_profile_enter_kernel(const char *name, const char *target, const char *variant)
+
+   Enter a kernel-level timing section. Called automatically by translator-generated kernel code; do not call manually in application code.
+
+   :param name: Kernel name.
+   :param target: Backend target string (e.g. ``"cuda"``, ``"openmp"``).
+   :param variant: Kernel variant string.
+
+.. c:function:: void op_profile_next(const char *name)
+
+   Convenience helper equivalent to calling :c:func:`op_profile_exit` followed immediately by :c:func:`op_profile_enter` with the new name.
+
+   :param name: The name of the next section to enter.
+
+.. c:function:: void op_profile_exit()
+
+   Close the innermost open timing section, recording elapsed time.
+
+.. c:function:: void op_profile_end()
+
+   End profiling, closing all remaining open sections.
+
+.. c:function:: void op_profile_output()
+
+   Pretty-print the timing tree to stdout. If the ``OP_PROFILE_JSON_OUTPUT`` environment variable is set, also write the tree in JSON format to the file it names. For MPI builds, timing trees are collected from all ranks and combined before printing.
+
+.. c:function:: void op_profile_output_json(const char *filename)
+
+   Write the timing tree to *filename* in JSON format. For MPI builds, trees from all ranks are combined first.
+
+   :param filename: Path to the output JSON file.
+
 .. c:function:: void op_diagnostic_output()
 
    This routine prints diagnostics relating to sets, mappings and datasets.
@@ -468,6 +529,10 @@ The following environment variables can be set at run time to control OP2 behavi
      - Integer. Caps the maximum number of OpenMP threads used inside JIT-compiled (``c_cuda`` / ``c_hip``) host loops. Useful for tuning thread counts independently of ``OMP_NUM_THREADS``.
    * - ``OP_FALLBACK_MODE``
      - Set to ``warn`` or ``error``. When the translator emits a fallback sequential kernel for a loop it cannot fully parallelise, ``warn`` prints a warning at runtime when the fallback executes; ``error`` aborts. Unset by default (fallback runs silently).
+   * - ``OP_PROFILE_LEVEL``
+     - Controls the detail level of the ``op_profile`` timing tree. ``0`` disables profiling entirely. ``1`` times only user-defined outer sections (no kernel timing). ``2`` adds whole-kernel timings (default). ``3`` includes detailed in-kernel section timings generated by the translator.
+   * - ``OP_PROFILE_JSON_OUTPUT``
+     - If set to a file path, :c:func:`op_profile_output` also writes the timing tree to that file in JSON format. (:c:func:`op_profile_output_json` writes to the file name passed as its argument instead.)
 
 ----
 
@@ -629,8 +694,8 @@ Fortran application variants are prefixed with ``f_`` in the Make build system:
 
 For the translator invocation for Fortran sources, see :doc:`translator`.
 
-Timers
-^^^^^^
+Timers and Profiling
+^^^^^^^^^^^^^^^^^^^^
 
 .. code-block:: fortran
 
@@ -643,3 +708,17 @@ Timers
 
 .. note::
    Unlike the C/C++ :c:func:`op_timers`, the Fortran version takes only one argument (the wall-clock time ``et``); the ``cpu`` argument is omitted.
+
+The ``op_profile`` profiling API is also available in Fortran with identical semantics to the C/C++ interface:
+
+.. code-block:: fortran
+
+   call op_profile_start("MyApp")
+   call op_profile_enter("Setup")
+   ! ... setup work ...
+   call op_profile_next("Computation")
+   ! ... main loop ...
+   call op_profile_end()
+   call op_profile_output()
+
+The available Fortran subroutines mirror the C API: ``op_profile_start``, ``op_profile_enter``, ``op_profile_enter_kernel``, ``op_profile_next``, ``op_profile_exit``, ``op_profile_end``, ``op_profile_output``, and ``op_profile_output_json``. The ``OP_PROFILE_LEVEL`` and ``OP_PROFILE_JSON_OUTPUT`` environment variables apply in Fortran builds identically to C/C++ builds.
diff --git a/op2/Makefile b/op2/Makefile
index 648197902..9f670d7a5 100644
--- a/op2/Makefile
+++ b/op2/Makefile
@@ -111,7 +111,7 @@ OP2_FOR_HDF5 := $(OP2_HDF5) $(addprefix $(OBJ)/fortran/,\
 OP2_SEQ := $(OP2_BASE) $(addprefix $(OBJ)/,\
 	core/op_dummy_singlenode.o \
 	sequential/op_seq.o \
-	externlib/op_timing2.o)
+	externlib/op_profile.o)
 
 OP2_FOR_SEQ := $(OP2_SEQ) $(OP2_FOR_BASE) $(addprefix $(OBJ)/fortran/,\
 	op_dummy_wrappers.o)
@@ -120,13 +120,13 @@ OP2_CUDA := $(OP2_BASE) $(addprefix $(OBJ)/,\
 	cuda/op_cuda_decl+cuda.o \
 	cuda/op_cuda_rt_support+cuda.o \
 	cuda/op2_cuda_rt_wrappers+cuda.o \
-	externlib/op_timing2.o)
+	externlib/op_profile.o)
 
 OP2_HIP := $(OP2_BASE) $(addprefix $(OBJ)/,\
 	cuda/op_cuda_decl+hip.o \
 	cuda/op_cuda_rt_support+hip.o \
 	cuda/op2_cuda_rt_wrappers+hip.o \
-	externlib/op_timing2.o)
+	externlib/op_profile.o)
 
 OP2_FOR_CUDA := $(OP2_CUDA) $(OP2_FOR_BASE_CUDA) $(addprefix $(OBJ)/fortran/,\
 	cudaConfigurationParams.o)
@@ -137,7 +137,7 @@ OP2_FOR_HIP := $(OP2_HIP) $(OP2_FOR_BASE) $(addprefix $(OBJ)/fortran/,\
 OP2_OPENMP := $(OP2_BASE) $(addprefix $(OBJ)/,\
 	core/op_dummy_singlenode.o \
 	openmp/op_openmp_decl.o \
-	externlib/op_timing2.o)
+	externlib/op_profile.o)
 
 OP2_FOR_OPENMP := $(OP2_OPENMP) $(OP2_FOR_BASE) $(addprefix $(OBJ)/fortran/,\
 	op_dummy_wrappers.o)
@@ -145,7 +145,7 @@ OP2_FOR_OPENMP := $(OP2_OPENMP) $(OP2_FOR_BASE) $(addprefix $(OBJ)/fortran/,\
 OP2_OPENMP4 := $(OP2_BASE) $(addprefix $(OBJ)/,\
 	openmp4/op_openmp4_decl.o \
 	openmp4/op_openmp4_rt_support.o \
-	externlib/op_timing2.o)
+	externlib/op_profile.o)
 
 OP2_FOR_OPENMP4 := $(OP2_OPENMP4) $(OP2_FOR_BASE) $(addprefix $(OBJ)/fortran/,\
 	op_dummy_wrappers.o)
@@ -159,7 +159,7 @@ OP2_MPI := $(OP2_BASE) $(addprefix $(OBJ)/,\
 	mpi/op_mpi_util.o \
 	externlib/op_util.o \
 	externlib/op_renumber.o \
-	externlib/op_timing2+mpi.o)
+	externlib/op_profile+mpi.o)
 
 OP2_FOR_MPI := $(OP2_MPI) $(OP2_FOR_BASE_MPI) $(addprefix $(OBJ)/fortran/,\
 	op_dummy_wrappers+mpi.o)
@@ -176,7 +176,7 @@ OP2_MPI_CUDA := $(OP2_BASE) $(addprefix $(OBJ)/,\
 	mpi/op_mpi_util.o \
 	externlib/op_util.o \
 	externlib/op_renumber.o \
-	externlib/op_timing2+mpi.o)
+	externlib/op_profile+mpi.o)
 
 OP2_MPI_HIP := $(OP2_BASE) $(addprefix $(OBJ)/,\
 	cuda/op_cuda_rt_support+mpi+hip.o \
@@ -190,7 +190,7 @@ OP2_MPI_HIP := $(OP2_BASE) $(addprefix $(OBJ)/,\
 	mpi/op_mpi_util.o \
 	externlib/op_util.o \
 	externlib/op_renumber.o \
-	externlib/op_timing2+mpi.o)
+	externlib/op_profile+mpi.o)
 
 OP2_FOR_MPI_CUDA := $(OP2_MPI_CUDA) $(OP2_FOR_BASE_MPI_CUDA) $(addprefix $(OBJ)/fortran/,\
 	cudaConfigurationParams.o)
diff --git a/op2/include/op_f2c_helpers.h b/op2/include/op_f2c_helpers.h
index 077597826..fbf8888f8 100644
--- a/op2/include/op_f2c_helpers.h
+++ b/op2/include/op_f2c_helpers.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include <extern/rapidhash.h>
-// #include <op_timing2.h>
+#include <op_profile.h>
 #include <op_gpu_shims.h>
 
 #include <array>
@@ -528,13 +528,13 @@ class KernelInfo {
 
     void invoke(JitKernel *kernel, int num_blocks, int block_size, void **args, void **args_jit) {
         if (kernel == nullptr) {
-            // op_timing2_next("Offline Kernel");
+            op_profile_next("Offline Kernel");
             invoke_offline(num_blocks, block_size, args);
 
             return;
         }
 
-        // op_timing2_next("JIT Kernel");
+        op_profile_next("JIT Kernel");
         kernel->invoke(num_blocks, block_size, args_jit);
     }
 };
diff --git a/op2/include/op_timing2.h b/op2/include/op_profile.h
similarity index 66%
rename from op2/include/op_timing2.h
rename to op2/include/op_profile.h
index b9be64cdf..14ba32a3c 100644
--- a/op2/include/op_timing2.h
+++ b/op2/include/op_profile.h
@@ -1,5 +1,5 @@
-#ifndef __OP_TIMING2_H
-#define __OP_TIMING2_H
+#ifndef __OP_PROFILE_H
+#define __OP_PROFILE_H
 
 #include <extern/json.hpp>
 
@@ -13,34 +13,34 @@
 using json = nlohmann::json;
 
 /*
- * Tree-based timing code for instrumentation of OP2 applications. See the op_timing2 class (and its C API) for the
+ * Tree-based timing code for instrumentation of OP2 applications. See the op_profile class (and its C API) for the
  * public API methods to use.
  *
  * The OP2 code-generator will generate calls in kernel code for _enter_kernel() and sections inside.
  *
  * Expected usage from application:
- *   - op_timing2_start(name)
- *     - op_timing2_enter(name) (Optional - use to time application sections and separate out kernels)
- *     - op_timing2_exit()
+ *   - op_profile_start(name)
+ *     - op_profile_enter(name) (Optional - use to time application sections and separate out kernels)
+ *     - op_profile_exit()
  *     - ...
- *   - op_timing2_finish()
- *   - op_timing2_output()
+ *   - op_profile_end()
+ *   - op_profile_output()
  *
  * Two environment variables can be used:
- *   - OP_TIMING2_LEVEL={0,1,2,3} - Set the timing detail level
+ *   - OP_PROFILE_LEVEL={0,1,2,3} - Set the timing detail level
  *     - 0: Disabled
  *     - 1: Only time outer sections from _enter() in the application code (no kernel timing)
  *     - 2: Time outer sections from application code and overall kernel timings (no in-kernel timing)
  *     - 3: Time outer sections and detailed kernel timings (from code-generated _enter() sections)
  *
- *   - OP_TIMING2_JSON_OUTPUT=<filename> - Output the timing tree to the specified file in JSON format during the
- *                                         call to op_timing2_output().
+ *   - OP_PROFILE_JSON_OUTPUT=<filename> - Output the timing tree to the specified file in JSON format during the
+ *                                         call to op_profile_output().
  */
 
-/* ----------------------------------------- op_timing2_clock ----------------------------------------- */
+/* ----------------------------------------- op_profile_clock ----------------------------------------- */
 
 /* Helper struct to hold the timing information for each tree node */
-struct op_timing2_clock {
+struct op_profile_clock {
   using clock = std::chrono::high_resolution_clock;
 
   std::size_t n = 0;
@@ -50,75 +50,75 @@ struct op_timing2_clock {
   clock::duration max;
 
   void submit(clock::duration duration);
-  op_timing2_clock& operator+=(const op_timing2_clock& other);
+  op_profile_clock& operator+=(const op_profile_clock& other);
 
   clock::duration average() const { return total / n; }
 };
 
 /* Format a duration auto-selecting units (s, ms, us, ns) into the specified string width */
-std::string format_duration(const op_timing2_clock::clock::duration& d, unsigned width = 7);
+std::string format_duration(const op_profile_clock::clock::duration& d, unsigned width = 7);
 
-/* Convert a op_timing2_clock to a string, optionally with a parent clock reference that is used to calculate a 
+/* Convert an op_profile_clock to a string, optionally with a parent clock reference that is used to calculate a
  * time percentage. The parent needs not be the actual parent of the node in the timing tree. */
-std::string to_string(const op_timing2_clock& clock,
-                      const std::optional<std::reference_wrapper<const op_timing2_clock>> parent = std::nullopt);
+std::string to_string(const op_profile_clock& clock,
+                      const std::optional<std::reference_wrapper<const op_profile_clock>> parent = std::nullopt);
 
 
 /* JSON conversion implementations */
-void to_json(json& j, const op_timing2_clock& clock);
-void from_json(const json& j, op_timing2_clock& clock);
+void to_json(json& j, const op_profile_clock& clock);
+void from_json(const json& j, op_profile_clock& clock);
 
-/* ----------------------------------------- op_timing2_node ----------------------------------------- */
+/* ----------------------------------------- op_profile_node ----------------------------------------- */
 
 /* Node type for the timing tree nodes */
-enum class op_timing2_node_type { standard, kernel };
+enum class op_profile_node_type { standard, kernel };
 
 /* Timing tree node */
-struct op_timing2_node {
+struct op_profile_node {
   std::string name;
 
-  op_timing2_node_type type = op_timing2_node_type::standard;
-  op_timing2_clock clock;
+  op_profile_node_type type = op_profile_node_type::standard;
+  op_profile_clock clock;
 
   std::size_t num_ranks = 1;
 
-  std::vector<op_timing2_node> children;
+  std::vector<op_profile_node> children;
 
-  op_timing2_node(): name{"unknown"} {}
-  op_timing2_node(std::string_view name): name{name} {}
+  op_profile_node(): name{"unknown"} {}
+  op_profile_node(std::string_view name): name{name} {}
 
   /* Check if the node has an immediate child of the given name (and type) */
   bool has_child(std::string_view name,
-      std::optional<op_timing2_node_type> child_type = std::nullopt);
+      std::optional<op_profile_node_type> child_type = std::nullopt);
 
   /* Get an immediate child of the node with the given name (and type). This will create a new node if a match is
    * not found. Child names must be unique, regardless of type */
-  op_timing2_node& get_child(std::string_view name,
-                             std::optional<op_timing2_node_type> child_type = std::nullopt);
+  op_profile_node& get_child(std::string_view name,
+                             std::optional<op_profile_node_type> child_type = std::nullopt);
 
   /* Get a child of the node with the given scope path (and type). The scope must contain at least one name, and
    * children will be created on demand if they do not exist. */
-  op_timing2_node& get_child(std::vector<std::string> scope,
-                             std::optional<op_timing2_node_type> child_type = std::nullopt);
+  op_profile_node& get_child(std::vector<std::string> scope,
+                             std::optional<op_profile_node_type> child_type = std::nullopt);
 
   /* Combine with another node, probably from another MPI rank, adding the timing statistics together. Children
    * present in the second node will be created and combined into this node */
-  op_timing2_node& operator+=(const op_timing2_node& other);
+  op_profile_node& operator+=(const op_profile_node& other);
 
   /* Pretty-print the node to stdout, with an optional parent for time percentage output from the clocks. The parent
    * needs not be the node's actual parent */
   void output(unsigned indent = 0,
-              const std::optional<std::reference_wrapper<const op_timing2_node>> parent = std::nullopt);
+              const std::optional<std::reference_wrapper<const op_profile_node>> parent = std::nullopt);
 };
 
 /* JSON conversion implementations */
-void to_json(json& j, const op_timing2_node& node);
-void from_json(const json& j, op_timing2_node& node);
+void to_json(json& j, const op_profile_node& node);
+void from_json(const json& j, op_profile_node& node);
 
-/* ----------------------------------------- op_timing2 ----------------------------------------- */
+/* ----------------------------------------- op_profile ----------------------------------------- */
 
 /* Timing detail level, set before timing init/start */
-enum class op_timing2_level {
+enum class op_profile_level {
   disabled = 0,    // No timing
   simple,          // Only user-defined outer sections, doesn't time kernels
   kernel,          // Includes whole-kernel timing, no in-kernel sections (default)
@@ -126,15 +126,15 @@ enum class op_timing2_level {
 };
 
 /* The timing class, instantiated as a singleton "timing" - interact through that */
-class op_timing2 {
+class op_profile {
 private:
-  op_timing2_level level = op_timing2_level::simple;
+  op_profile_level level = op_profile_level::simple;
 
-  std::vector<std::reference_wrapper<op_timing2_node>> current_scope;
-  std::vector<op_timing2_clock::clock::time_point> current_starts;
+  std::vector<std::reference_wrapper<op_profile_node>> current_scope;
+  std::vector<op_profile_clock::clock::time_point> current_starts;
   unsigned extra_depth = 0; // Depth into sections not enabled by the current level
 
-  op_timing2_node root;
+  op_profile_node root;
 
   bool started = false;
 
@@ -143,10 +143,10 @@ class op_timing2 {
 
 public:
   /* Returns the singleton timing instance */
-  static op_timing2& instance();
+  static op_profile& instance();
 
-  /* Sets the timing level (only before calling start()) - see op_timing2_level */
-  void set_level(const op_timing2_level new_level);
+  /* Sets the timing level (only before calling start()) - see op_profile_level */
+  void set_level(const op_profile_level new_level);
 
   /* Initialises timing (with application name), required unless level set to disabled */
   void start(std::string_view name);
@@ -166,7 +166,7 @@ class op_timing2 {
   /* End timing, closing all remaining open sections. */
   void finish();
 
-  /* Pretty-print timing statistics, and output JSON to OP_TIMING2_JSON_OUTPUT if it's defined. The timing trees
+  /* Pretty-print timing statistics, and output JSON to OP_PROFILE_JSON_OUTPUT if it's defined. The timing trees
    * will be combined across MPI ranks the first time this is called if needed. */
   void output();
 
@@ -184,7 +184,7 @@ class op_timing2 {
   void print_summary();
 
   /* Pretty-prints the non-kernel nodes, accumulating a list of nodes that have immediate kernel children */
-  void print_walk_non_kernel(const op_timing2_node& node,
+  void print_walk_non_kernel(const op_profile_node& node,
                              const std::vector<std::string>& parent_path,
                              std::vector<std::vector<std::string>>& nodes_with_kernels,
                              unsigned indent = 0);
@@ -193,21 +193,21 @@ class op_timing2 {
   void print_kernel_summary(const std::vector<std::string>& path, unsigned longest_name);
 };
 
-/* C/Fortran timing API functions wrapping the public op_timing2 methods */
+/* C/Fortran timing API functions wrapping the public op_profile methods */
 extern "C" {
 
-void op_timing2_start(const char* name);
+void op_profile_start(const char* name);
 
-void op_timing2_enter(const char* name);
-void op_timing2_enter_kernel(const char* name, const char* target, const char* variant);
+void op_profile_enter(const char* name);
+void op_profile_enter_kernel(const char* name, const char* target, const char* variant);
 
-void op_timing2_next(const char* name);
+void op_profile_next(const char* name);
 
-void op_timing2_exit();
-void op_timing2_finish();
+void op_profile_exit();
+void op_profile_end();
 
-void op_timing2_output();
-void op_timing2_output_json(const char* filename);
+void op_profile_output();
+void op_profile_output_json(const char* filename);
 
 }
 
diff --git a/op2/src/externlib/op_timing2.cpp b/op2/src/externlib/op_profile.cpp
similarity index 74%
rename from op2/src/externlib/op_timing2.cpp
rename to op2/src/externlib/op_profile.cpp
index 4d4e4a2cc..66a954781 100644
--- a/op2/src/externlib/op_timing2.cpp
+++ b/op2/src/externlib/op_profile.cpp
@@ -1,4 +1,4 @@
-#include <op_timing2.h>
+#include <op_profile.h>
 #include <op_lib_core.h>
 
 #ifdef OPMPI
@@ -26,9 +26,9 @@ namespace nlohmann {
   };
 }
 
-/* ----------------------------------------- op_timing2_clock ----------------------------------------- */
+/* ----------------------------------------- op_profile_clock ----------------------------------------- */
 
-void op_timing2_clock::submit(op_timing2_clock::clock::duration duration) {
+void op_profile_clock::submit(op_profile_clock::clock::duration duration) {
   ++n;
 
   if (n == 1) {
@@ -45,7 +45,7 @@ void op_timing2_clock::submit(op_timing2_clock::clock::duration duration) {
   if (duration > max) max = duration;
 }
 
-op_timing2_clock& op_timing2_clock::operator+=(const op_timing2_clock& other) {
+op_profile_clock& op_profile_clock::operator+=(const op_profile_clock& other) {
   n += other.n;
 
   total += other.total;
@@ -56,7 +56,7 @@ op_timing2_clock& op_timing2_clock::operator+=(const op_timing2_clock& other) {
   return *this;
 }
 
-std::string format_duration(const op_timing2_clock::clock::duration& d, unsigned width) { 
+std::string format_duration(const op_profile_clock::clock::duration& d, unsigned width) { 
     const char *unit[4] = {"s", "ms", "us", "ns"};
     const int unit_width[4] {1, 2, 2, 2};
 
@@ -96,8 +96,8 @@ std::string format_duration(const op_timing2_clock::clock::duration& d, unsigned
     return std::string(output);
 }
 
-std::string to_string(const op_timing2_clock& clock,
-    const std::optional<std::reference_wrapper<const op_timing2_clock>> parent) {
+std::string to_string(const op_profile_clock& clock,
+    const std::optional<std::reference_wrapper<const op_profile_clock>> parent) {
   std::ostringstream oss;
   oss << "*" << clock.n << " total: " << format_duration(clock.total);
 
@@ -120,7 +120,7 @@ std::string to_string(const op_timing2_clock& clock,
   return oss.str();
 }
 
-void to_json(json& j, const op_timing2_clock& clock) {
+void to_json(json& j, const op_profile_clock& clock) {
   j = json{
     {"n", clock.n},
     {"total", clock.total},
@@ -129,16 +129,16 @@ void to_json(json& j, const op_timing2_clock& clock) {
   };
 }
 
-void from_json(const json& j, op_timing2_clock& clock) {
+void from_json(const json& j, op_profile_clock& clock) {
   j.at("n").get_to(clock.n);
   j.at("total").get_to(clock.total);
   j.at("min").get_to(clock.min);
   j.at("max").get_to(clock.max);
 }
 
-/* ----------------------------------------- op_timing2_node ----------------------------------------- */
+/* ----------------------------------------- op_profile_node ----------------------------------------- */
 
-bool op_timing2_node::has_child(std::string_view name, std::optional<op_timing2_node_type> child_type) {
+bool op_profile_node::has_child(std::string_view name, std::optional<op_profile_node_type> child_type) {
   for (auto& child: children) {
     if (child.name == name) {
       if (child_type.has_value()) assert(child.type == *child_type);
@@ -149,7 +149,7 @@ bool op_timing2_node::has_child(std::string_view name, std::optional<op_timing2_
   return false;
 }
 
-op_timing2_node& op_timing2_node::get_child(std::string_view name, std::optional<op_timing2_node_type> child_type) {
+op_profile_node& op_profile_node::get_child(std::string_view name, std::optional<op_profile_node_type> child_type) {
   for (auto& child: children) {
     if (child.name == name) {
       if (child_type.has_value()) assert(child.type == *child_type);
@@ -158,14 +158,14 @@ op_timing2_node& op_timing2_node::get_child(std::string_view name, std::optional
   }
 
   // Create the child if we don't have one already, setting the type if it was provided
-  children.push_back(op_timing2_node(name));
+  children.push_back(op_profile_node(name));
   if (child_type.has_value()) children.back().type = *child_type;
 
   return children.back();
 }
 
-op_timing2_node& op_timing2_node::get_child(std::vector<std::string> scope,
-    std::optional<op_timing2_node_type> child_type) {
+op_profile_node& op_profile_node::get_child(std::vector<std::string> scope,
+    std::optional<op_profile_node_type> child_type) {
   assert(scope.size() >= 1);
 
   // Recursively call get_child popping the first element off the scope
@@ -173,7 +173,7 @@ op_timing2_node& op_timing2_node::get_child(std::vector<std::string> scope,
   return get_child(scope[0]).get_child(std::vector<std::string>(scope.begin() + 1, scope.end()), child_type);
 }
 
-op_timing2_node& op_timing2_node::operator+=(const op_timing2_node& other) {
+op_profile_node& op_profile_node::operator+=(const op_profile_node& other) {
   clock += other.clock;
   num_ranks += other.num_ranks;
 
@@ -184,8 +184,8 @@ op_timing2_node& op_timing2_node::operator+=(const op_timing2_node& other) {
   return *this;
 }
 
-void op_timing2_node::output(unsigned indent,
-    const std::optional<std::reference_wrapper<const op_timing2_node>> parent) {
+void op_profile_node::output(unsigned indent,
+    const std::optional<std::reference_wrapper<const op_profile_node>> parent) {
   std::printf("%*s%s %s\n", indent, "", name.c_str(),
       to_string(clock, parent.has_value() ? std::optional(parent->get().clock) : std::nullopt).c_str());
 
@@ -193,7 +193,7 @@ void op_timing2_node::output(unsigned indent,
     child.output(indent + 4, parent.has_value() ? parent : *this);
 }
 
-void to_json(json& j, const op_timing2_node& node) {
+void to_json(json& j, const op_profile_node& node) {
   j = json{
     {"name", node.name},
     {"type", node.type},
@@ -203,7 +203,7 @@ void to_json(json& j, const op_timing2_node& node) {
   };
 }
 
-void from_json(const json& j, op_timing2_node& node) {
+void from_json(const json& j, op_profile_node& node) {
   j.at("name").get_to(node.name);
   j.at("type").get_to(node.type);
   j.at("clock").get_to(node.clock);
@@ -211,23 +211,23 @@ void from_json(const json& j, op_timing2_node& node) {
   j.at("children").get_to(node.children);
 }
 
-/* ----------------------------------------- op_timing2 ----------------------------------------- */
+/* ----------------------------------------- op_profile ----------------------------------------- */
 
-op_timing2& op_timing2::instance() {
-  static auto timing = op_timing2{};
+op_profile& op_profile::instance() {
+  static auto timing = op_profile{};
   return timing;
 }
 
-void op_timing2::set_level(op_timing2_level new_level) {
+void op_profile::set_level(op_profile_level new_level) {
   assert(!started);
   level = new_level;
 }
 
-void op_timing2::start(std::string_view name) {
+void op_profile::start(std::string_view name) {
   assert(!started);
   assert(current_scope.size() == 0);
 
-  char *level_str = getenv("OP_TIMING2_LEVEL");
+  char *level_str = getenv("OP_PROFILE_LEVEL");
   if (level_str != nullptr) {
     int level_int = -1;
 
@@ -235,33 +235,33 @@ void op_timing2::start(std::string_view name) {
       level_int = std::stoi(level_str);
     } catch (...) {};
 
-    if (level_int < 0 || level_int > static_cast<int>(op_timing2_level::kernel_detailed))
-      std::printf("warning: OP_TIMING2_LEVEL set to unsupported value: %s\n", level_str);
+    if (level_int < 0 || level_int > static_cast<int>(op_profile_level::kernel_detailed))
+      std::printf("warning: OP_PROFILE_LEVEL set to unsupported value: %s\n", level_str);
     else
-      level = static_cast<op_timing2_level>(level_int);
+      level = static_cast<op_profile_level>(level_int);
   }
 
-  if (level == op_timing2_level::disabled) return;
+  if (level == op_profile_level::disabled) return;
 
   started = true;
   deviceSync();
 
-  root = op_timing2_node(name);
+  root = op_profile_node(name);
 
   current_scope.push_back(root);
-  current_starts.push_back(op_timing2_clock::clock::now());
+  current_starts.push_back(op_profile_clock::clock::now());
 }
 
-void op_timing2::enter(std::string_view name, bool sync) {
-  if (level == op_timing2_level::disabled) return;
+void op_profile::enter(std::string_view name, bool sync) {
+  if (level == op_profile_level::disabled) return;
 
   assert(started && !finished);
   assert(current_scope.size() > 0);
 
   // Check if we should actually start a timer
   if (extra_depth > 0 ||
-      (level < op_timing2_level::kernel_detailed &&
-       current_scope.back().get().type == op_timing2_node_type::kernel)) {
+      (level < op_profile_level::kernel_detailed &&
+       current_scope.back().get().type == op_profile_node_type::kernel)) {
     extra_depth++;
     return;
   }
@@ -272,16 +272,16 @@ void op_timing2::enter(std::string_view name, bool sync) {
   if (sync) deviceSync();
 
   current_scope.push_back(node);
-  current_starts.push_back(op_timing2_clock::clock::now());
+  current_starts.push_back(op_profile_clock::clock::now());
 }
 
-void op_timing2::enter_kernel(std::string_view name, std::string_view target, std::string_view variant) {
-  if (level == op_timing2_level::disabled) return;
+void op_profile::enter_kernel(std::string_view name, std::string_view target, std::string_view variant) {
+  if (level == op_profile_level::disabled) return;
 
   assert(started && !finished);
   assert(current_scope.size() > 0);
 
-  if (level < op_timing2_level::kernel) {
+  if (level < op_profile_level::kernel) {
     extra_depth++;
     return;
   }
@@ -293,16 +293,16 @@ void op_timing2::enter_kernel(std::string_view name, std::string_view target, st
   deviceSync();
 
   enter(full_name);
-  current_scope.back().get().type = op_timing2_node_type::kernel;
+  current_scope.back().get().type = op_profile_node_type::kernel;
 }
 
-void op_timing2::next(std::string_view name) {
+void op_profile::next(std::string_view name) {
   exit();
   enter(name, false);
 }
 
-void op_timing2::exit(bool sync) {
-  if (level == op_timing2_level::disabled) return;
+void op_profile::exit(bool sync) {
+  if (level == op_profile_level::disabled) return;
 
   assert(started && !finished);
   assert(current_scope.size() > 0);
@@ -315,14 +315,14 @@ void op_timing2::exit(bool sync) {
   if (sync) deviceSync();
 
   auto& node = current_scope.back().get();
-  node.clock.submit(op_timing2_clock::clock::now() - current_starts.back());
+  node.clock.submit(op_profile_clock::clock::now() - current_starts.back());
 
   current_scope.pop_back();
   current_starts.pop_back();
 }
 
-void op_timing2::finish() {
-  if (level == op_timing2_level::disabled) return;
+void op_profile::finish() {
+  if (level == op_profile_level::disabled) return;
 
   assert(started && !finished);
   assert(current_scope.size() > 0);
@@ -334,8 +334,8 @@ void op_timing2::finish() {
   finished = true;
 }
 
-void op_timing2::combine() {
-  if (level == op_timing2_level::disabled) return;
+void op_profile::combine() {
+  if (level == op_profile_level::disabled) return;
 
   assert(finished);
   if (combined) return;
@@ -355,7 +355,7 @@ void op_timing2::combine() {
 
       MPI_Recv(msg.data(), size, MPI_BYTE, rank, 0, OP_MPI_WORLD, MPI_STATUS_IGNORE);
       json other_root_json = json::from_msgpack(msg);
-      auto other_root = other_root_json.template get<op_timing2_node>();
+      auto other_root = other_root_json.template get<op_profile_node>();
 
       root += other_root;
     }
@@ -373,8 +373,8 @@ void op_timing2::combine() {
   combined = true;
 }
 
-void op_timing2::output() {
-  if (level == op_timing2_level::disabled) return;
+void op_profile::output() {
+  if (level == op_profile_level::disabled) return;
 
   assert(finished);
   combine();
@@ -385,13 +385,13 @@ void op_timing2::output() {
 
   print_summary();
 
-  char *json_filename = getenv("OP_TIMING2_JSON_OUTPUT");
+  char *json_filename = getenv("OP_PROFILE_JSON_OUTPUT");
   if (json_filename != NULL)
     output_json(json_filename);
 }
 
-void op_timing2::output_json(std::string_view filename) {
-  if (level == op_timing2_level::disabled) return;
+void op_profile::output_json(std::string_view filename) {
+  if (level == op_profile_level::disabled) return;
 
   assert(finished);
   combine();
@@ -414,7 +414,7 @@ void op_timing2::output_json(std::string_view filename) {
   output << root_json;
 }
 
-void op_timing2::print_summary() {
+void op_profile::print_summary() {
   // Output the non-kernel sections, and simultaneously gather a list of nodes which have
   // immediate kernel children
   std::vector<std::vector<std::string>> nodes_with_kernels;
@@ -442,13 +442,13 @@ void op_timing2::print_summary() {
     print_kernel_summary(path, longest_name);
 }
 
-void op_timing2::print_walk_non_kernel(const op_timing2_node& node,
+void op_profile::print_walk_non_kernel(const op_profile_node& node,
     const std::vector<std::string>& parent_path,
     std::vector<std::vector<std::string>>& nodes_with_kernels,
     unsigned indent) {
   bool has_kernel_child = false;
   for (auto& child: node.children)
-    if (child.type == op_timing2_node_type::kernel) has_kernel_child = true;
+    if (child.type == op_profile_node_type::kernel) has_kernel_child = true;
 
   std::vector<std::string> current_path = parent_path;
   current_path.push_back(node.name);
@@ -459,12 +459,12 @@ void op_timing2::print_walk_non_kernel(const op_timing2_node& node,
   std::printf("%*s%s %s\n", indent, "", node.name.c_str(), to_string(node.clock).c_str());
 
   for (auto& child: node.children) {
-    if (child.type == op_timing2_node_type::kernel) continue;
+    if (child.type == op_profile_node_type::kernel) continue;
     print_walk_non_kernel(child, current_path, nodes_with_kernels, indent + 4);
   }
 }
 
-void op_timing2::print_kernel_summary(const std::vector<std::string>& path, unsigned longest_name) {
+void op_profile::print_kernel_summary(const std::vector<std::string>& path, unsigned longest_name) {
   // Print the header, starting with the node path
   int path_len = 0;
   for (size_t i = 0; i < path.size(); ++i) {
@@ -479,7 +479,7 @@ void op_timing2::print_kernel_summary(const std::vector<std::string>& path, unsi
 
   // And then the column headers for the table
   std::printf("%*s     num    total      avg      min      max", longest_name + 4 - path_len, "");
-  if (level >= op_timing2_level::kernel_detailed) std::printf("    %%kern");
+  if (level >= op_profile_level::kernel_detailed) std::printf("    %%kern");
   std::printf("\n");
 
   // Fetch the node so we can print its children
@@ -487,9 +487,9 @@ void op_timing2::print_kernel_summary(const std::vector<std::string>& path, unsi
   auto& node = scope.size() == 0 ? root : root.get_child(scope);
 
   // Gather all kernel children
-  std::vector<std::reference_wrapper<op_timing2_node>> kernel_nodes;
+  std::vector<std::reference_wrapper<op_profile_node>> kernel_nodes;
   for (auto& child: node.children) {
-    if (child.type != op_timing2_node_type::kernel) continue;
+    if (child.type != op_profile_node_type::kernel) continue;
     kernel_nodes.push_back(child);
   }
 
@@ -509,7 +509,7 @@ void op_timing2::print_kernel_summary(const std::vector<std::string>& path, unsi
 
     // Print each kernel child row, with kernel % if level = kernel_detailed
     auto kern_pct = std::string("");
-    if (level >= op_timing2_level::kernel_detailed) {
+    if (level >= op_profile_level::kernel_detailed) {
       auto computation_node = child.get().get_child("Computation");
       auto kernel_node = computation_node.has_child("Kernel") ?
         computation_node.get_child("Kernel") : computation_node;
@@ -536,7 +536,7 @@ void op_timing2::print_kernel_summary(const std::vector<std::string>& path, unsi
 
   std::printf("\n");
 
-  if (level < op_timing2_level::kernel_detailed) return;
+  if (level < op_profile_level::kernel_detailed) return;
 
   // Print the full tree for the top detailed_limit kernels
   const auto detailed_limit = 4;
@@ -554,19 +554,19 @@ void op_timing2::print_kernel_summary(const std::vector<std::string>& path, unsi
 
 extern "C" {
 
-void op_timing2_start(const char* name) { op_timing2::instance().start(name); }
+void op_profile_start(const char* name) { op_profile::instance().start(name); }
 
-void op_timing2_enter(const char* name) { op_timing2::instance().enter(name); }
-void op_timing2_enter_kernel(const char* name, const char* target, const char* variant) {
-  op_timing2::instance().enter_kernel(name, target, variant);
+void op_profile_enter(const char* name) { op_profile::instance().enter(name); }
+void op_profile_enter_kernel(const char* name, const char* target, const char* variant) {
+  op_profile::instance().enter_kernel(name, target, variant);
 }
 
-void op_timing2_next(const char* name) { op_timing2::instance().next(name); }
+void op_profile_next(const char* name) { op_profile::instance().next(name); }
 
-void op_timing2_exit() { op_timing2::instance().exit(); }
-void op_timing2_finish() { op_timing2::instance().finish(); }
+void op_profile_exit() { op_profile::instance().exit(); }
+void op_profile_end() { op_profile::instance().finish(); }
 
-void op_timing2_output() { op_timing2::instance().output(); }
-void op_timing2_output_json(const char* filename) { op_timing2::instance().output_json(filename); }
+void op_profile_output() { op_profile::instance().output(); }
+void op_profile_output_json(const char* filename) { op_profile::instance().output_json(filename); }
 
 }
diff --git a/op2/src/fortran/op2_for_declarations.F90 b/op2/src/fortran/op2_for_declarations.F90
index fa72fecf3..42fe17741 100644
--- a/op2/src/fortran/op2_for_declarations.F90
+++ b/op2/src/fortran/op2_for_declarations.F90
@@ -747,39 +747,39 @@ end function isCNullPointer_c
     subroutine op_timing_output () BIND(C,name='op_timing_output')
     end subroutine op_timing_output
 
-    subroutine op_timing2_start_c(name) BIND(C,name='op_timing2_start')
+    subroutine op_profile_start_c(name) BIND(C,name='op_profile_start')
       use ISO_C_BINDING
       character(kind=c_char) :: name(*)
-    end subroutine op_timing2_start_c
+    end subroutine op_profile_start_c
 
-    subroutine op_timing2_enter_c(name) BIND(C,name='op_timing2_enter')
+    subroutine op_profile_enter_c(name) BIND(C,name='op_profile_enter')
       use ISO_C_BINDING
       character(kind=c_char) :: name(*)
-    end subroutine op_timing2_enter_c
+    end subroutine op_profile_enter_c
 
-    subroutine op_timing2_enter_kernel_c(name, target, variant) BIND(C,name='op_timing2_enter_kernel')
+    subroutine op_profile_enter_kernel_c(name, target, variant) BIND(C,name='op_profile_enter_kernel')
       use ISO_C_BINDING
       character(kind=c_char) :: name(*), target(*), variant(*)
-    end subroutine op_timing2_enter_kernel_c
+    end subroutine op_profile_enter_kernel_c
 
-    subroutine op_timing2_next_c(name) BIND(C,name='op_timing2_next')
+    subroutine op_profile_next_c(name) BIND(C,name='op_profile_next')
       use ISO_C_BINDING
       character(kind=c_char) :: name(*)
-    end subroutine op_timing2_next_c
+    end subroutine op_profile_next_c
 
-    subroutine op_timing2_exit() BIND(C,name='op_timing2_exit')
-    end subroutine op_timing2_exit
+    subroutine op_profile_exit() BIND(C,name='op_profile_exit')
+    end subroutine op_profile_exit
 
-    subroutine op_timing2_finish() BIND(C,name='op_timing2_finish')
-    end subroutine op_timing2_finish
+    subroutine op_profile_end() BIND(C,name='op_profile_end')
+    end subroutine op_profile_end
 
-    subroutine op_timing2_output() BIND(C,name='op_timing2_output')
-    end subroutine op_timing2_output
+    subroutine op_profile_output() BIND(C,name='op_profile_output')
+    end subroutine op_profile_output
 
-    subroutine op_timing2_output_json_c(filename) BIND(C,name='op_timing2_output_json')
+    subroutine op_profile_output_json_c(filename) BIND(C,name='op_profile_output_json')
       use ISO_C_BINDING
       character(kind=c_char) :: filename(*)
-    end subroutine op_timing2_output_json_c
+    end subroutine op_profile_output_json_c
 
     subroutine op_print_c (line) BIND(C,name='op_print')
       use ISO_C_BINDING
@@ -2152,53 +2152,53 @@ function get_associated_set_size ( dat )
 
   end function
 
-  subroutine op_timing2_start(name)
+  subroutine op_profile_start(name)
 
     use, intrinsic :: ISO_C_BINDING
     implicit none
 
     character(kind=c_char, len=*) :: name
-    call op_timing2_start_c(name /@/ C_NULL_CHAR)
+    call op_profile_start_c(name /@/ C_NULL_CHAR)
 
   end subroutine
 
-  subroutine op_timing2_enter(name)
+  subroutine op_profile_enter(name)
 
     use, intrinsic :: ISO_C_BINDING
     implicit none
 
     character(kind=c_char, len=*) :: name
-    call op_timing2_enter_c(name /@/ C_NULL_CHAR)
+    call op_profile_enter_c(name /@/ C_NULL_CHAR)
 
   end subroutine
 
-  subroutine op_timing2_enter_kernel(name, target, variant)
+  subroutine op_profile_enter_kernel(name, target, variant)
 
     use, intrinsic :: ISO_C_BINDING
     implicit none
 
     character(kind=c_char, len=*) :: name, target, variant
-    call op_timing2_enter_kernel_c(name /@/ C_NULL_CHAR, target /@/ C_NULL_CHAR, variant /@/ C_NULL_CHAR)
+    call op_profile_enter_kernel_c(name /@/ C_NULL_CHAR, target /@/ C_NULL_CHAR, variant /@/ C_NULL_CHAR)
 
   end subroutine
 
-  subroutine op_timing2_next(name)
+  subroutine op_profile_next(name)
 
     use, intrinsic :: ISO_C_BINDING
     implicit none
 
     character(kind=c_char, len=*) :: name
-    call op_timing2_next_c(name /@/ C_NULL_CHAR)
+    call op_profile_next_c(name /@/ C_NULL_CHAR)
 
   end subroutine
 
-  subroutine op_timing2_output_json(filename)
+  subroutine op_profile_output_json(filename)
 
     use, intrinsic :: ISO_C_BINDING
     implicit none
 
     character(kind=c_char, len=*) :: filename
-    call op_timing2_output_json_c(filename /@/ C_NULL_CHAR)
+    call op_profile_output_json_c(filename /@/ C_NULL_CHAR)
 
   end subroutine
 
diff --git a/op2/src/mpi/op_mpi_part_core.cpp b/op2/src/mpi/op_mpi_part_core.cpp
index b2d06ed35..22d9defb6 100644
--- a/op2/src/mpi/op_mpi_part_core.cpp
+++ b/op2/src/mpi/op_mpi_part_core.cpp
@@ -3515,6 +3515,18 @@ construct_adj_list(op_map primary_map, halo_list exp_list, halo_list imp_list,
   return std::make_tuple(adj, adj_i, adj_cap);
 }
 
+#ifdef DEBUG // debug-only guard: abort the job when a global index does not fit in a 32-bit int
+static inline void check_global_index_int32_range(idx_g_t g_index,
+                                                  const op_map primary_map,
+                                                  int my_rank) {
+  if (g_index < (idx_g_t)INT32_MIN || g_index > (idx_g_t)INT32_MAX) {
+    fprintf(stderr, "Error: global index out of 32-bit integer range for map %s on rank %d (index=%lld)\n",
+            primary_map->name, my_rank, (long long)g_index); // reported by the failing rank itself, before aborting
+    MPI_Abort(OP_PART_WORLD, 2);
+  }
+}
+#endif
+
 /*******************************************************************************
  * Setup variables for k-way partitioning
  *******************************************************************************/
@@ -3540,6 +3552,11 @@ setup_part_data(op_map primary_map, int my_rank, int comm_size, idx_g_t **adj,
   for (int i = 0; i < primary_map->to->size; i++) {
     idx_g_t g_index = get_global_index(
         i, my_rank, part_range[primary_map->to->index], comm_size);
+#ifdef DEBUG
+    if constexpr (sizeof(T) == sizeof(int)) {
+      check_global_index_int32_range(g_index, primary_map, my_rank);
+    }
+#endif
     op_sort(adj[i], adj_i[i]);
     adj_i[i] = removeDups(adj[i], adj_i[i]);
 
diff --git a/tests/functional/const/const_tests.cpp b/tests/functional/const/const_tests.cpp
index 0f783e908..a9d1889db 100644
--- a/tests/functional/const/const_tests.cpp
+++ b/tests/functional/const/const_tests.cpp
@@ -1,7 +1,7 @@
 // Not intended to be used with OP_NO_REALLOC flag
 
 #include "op_seq.h"
-#include "op_timing2.h"
+#include "op_profile.h"
 #include <vector>
 
 #define TOL 1e-9
@@ -32,7 +32,7 @@ void consts4(double *dat) {
 int main(int argc, char **argv) {
 
   op_init(argc, argv, 2);
-  op_timing2_start("CppConstTests");
+  op_profile_start("CppConstTests");
 
   constexpr int size = 32;
   op_set set = op_decl_set(size, "my_set");;
@@ -71,8 +71,8 @@ int main(int argc, char **argv) {
     printf("consts4 passed\n");
   }
 
-  op_timing2_finish();
-  op_timing2_output();
+  op_profile_end();
+  op_profile_output();
 
   op_exit();
 
diff --git a/tests/functional/const_fortran/const_tests.F90 b/tests/functional/const_fortran/const_tests.F90
index 74a1bdac2..63d0ecf27 100644
--- a/tests/functional/const_fortran/const_tests.F90
+++ b/tests/functional/const_fortran/const_tests.F90
@@ -29,7 +29,7 @@ program const_tests_fortran
   integer :: i, d
 
   call op_init_base(0, 0)
-  call op_timing2_start("FortranConstTests")
+  call op_profile_start("FortranConstTests")
 
   call op_decl_set(size, set, "my_set")
   write(*,*) "set size =", set%setPtr%size
@@ -73,10 +73,10 @@ program const_tests_fortran
   end do
   write(*,*) "consts4 passed"
 
-  call op_timing2_finish()
+  call op_profile_end()
   
   if (op_is_root() == 1) print *
-    call op_timing2_output()
+  call op_profile_output()  ! not part of the IF above: runs on every rank
 
   call op_exit()
 
diff --git a/tests/functional/dat_reductions/reduc_tests.cpp b/tests/functional/dat_reductions/reduc_tests.cpp
index 5a1b1c03e..6621c1f69 100644
--- a/tests/functional/dat_reductions/reduc_tests.cpp
+++ b/tests/functional/dat_reductions/reduc_tests.cpp
@@ -5,7 +5,7 @@
 #endif
 
 #include "op_seq.h"
-#include "op_timing2.h"
+#include "op_profile.h"
 
 #include "../utility.h"
 
@@ -45,7 +45,7 @@ void indirect_dat3_inc(float *n0i, float *n1i, const float *er) {
 int main(int argc, char **argv) {
 
   op_init(argc, argv, 2);
-  op_timing2_start("CppReductionTests");
+  op_profile_start("CppReductionTests");
 
   int my_rank = 0;
   int comm_size = 1;
@@ -192,8 +192,8 @@ int main(int argc, char **argv) {
     printf("direct_dat4_inc passed [rank %d]\n", my_rank);
   }
 
-  op_timing2_finish();
-  op_timing2_output();
+  op_profile_end();
+  op_profile_output();
 
   op_exit();
 
diff --git a/tests/functional/dat_reductions_fortran/reduc_tests.F90 b/tests/functional/dat_reductions_fortran/reduc_tests.F90
index 7b1a83172..28f6d5f41 100644
--- a/tests/functional/dat_reductions_fortran/reduc_tests.F90
+++ b/tests/functional/dat_reductions_fortran/reduc_tests.F90
@@ -45,7 +45,7 @@ program reduc_tests_fortran
   real(4), dimension(:), allocatable :: expected
 
   call op_init_base(0, 0)
-  call op_timing2_start("FortranReductionTests")
+  call op_profile_start("FortranReductionTests")
 
   call get_rank_and_size(my_rank, comm_size)
 
@@ -187,10 +187,10 @@ program reduc_tests_fortran
 
  deallocate(fetched)
 
-  call op_timing2_finish()
+  call op_profile_end()
   
   if (op_is_root() == 1) print *
-    call op_timing2_output()
+  call op_profile_output()  ! not part of the IF above: runs on every rank
 
   call op_exit()
 
diff --git a/tests/functional/gbl/gbl_tests.cpp b/tests/functional/gbl/gbl_tests.cpp
index 3054683ba..a43573246 100644
--- a/tests/functional/gbl/gbl_tests.cpp
+++ b/tests/functional/gbl/gbl_tests.cpp
@@ -5,7 +5,7 @@
 #endif
 
 #include "op_seq.h"
-#include "op_timing2.h"
+#include "op_profile.h"
 
 #include "../utility.h"
 
@@ -57,7 +57,7 @@ void max5(const double *dat, double *g) {
 int main(int argc, char **argv) {
 
   op_init(argc, argv, 2);
-  op_timing2_start("CppGblArgTests");
+  op_profile_start("CppGblArgTests");
 
   int my_rank = 0;
   int comm_size = 1;
@@ -219,8 +219,8 @@ int main(int argc, char **argv) {
     printf("max5 passed\n");
   }
 
-  op_timing2_finish();
-  op_timing2_output();
+  op_profile_end();
+  op_profile_output();
 
   op_exit();
 
diff --git a/tests/functional/gbl_fortran/gbl_tests.F90 b/tests/functional/gbl_fortran/gbl_tests.F90
index b1bd23fa1..88f2eb177 100644
--- a/tests/functional/gbl_fortran/gbl_tests.F90
+++ b/tests/functional/gbl_fortran/gbl_tests.F90
@@ -49,7 +49,7 @@ program gbl_tests_fortran
 #endif
 
   call op_init_base(0, 0)
-  call op_timing2_start("FortranGblArgTests")
+  call op_profile_start("FortranGblArgTests")
 
   call get_rank_and_size(my_rank, comm_size)
 
@@ -182,10 +182,10 @@ program gbl_tests_fortran
   end do
   write(*,*) "max5 passed"
 
-  call op_timing2_finish()
+  call op_profile_end()
   
   if (op_is_root() == 1) print *
-    call op_timing2_output()
+  call op_profile_output()  ! not part of the IF above: runs on every rank
 
   call op_exit()
 
diff --git a/tests/functional/idx/idx_tests.cpp b/tests/functional/idx/idx_tests.cpp
index 7ef12a24b..7caedb069 100644
--- a/tests/functional/idx/idx_tests.cpp
+++ b/tests/functional/idx/idx_tests.cpp
@@ -5,6 +5,7 @@
 #endif
 
 #include "op_seq.h"
+#include "op_profile.h"
 
 #include "../utility.h"
 
@@ -43,6 +44,7 @@ void write_mixed_idx(double *dat, const int *direct_idx, const int *idx0,
 int main(int argc, char **argv) {
 
   op_init(argc, argv, 2);
+  op_profile_start("CppIdxTests");
 
   int my_rank = 0;
   int comm_size = 1;
@@ -157,6 +159,9 @@ int main(int argc, char **argv) {
     printf("mixed direct and indirect idx passed [rank %d]\n", my_rank);
   }
 
+  op_profile_end();
+  op_profile_output();
+
   op_exit();
 
   return 0;
diff --git a/tests/functional/idx_fortran/idx_tests.F90 b/tests/functional/idx_fortran/idx_tests.F90
index d1e1bf557..34cd023a8 100644
--- a/tests/functional/idx_fortran/idx_tests.F90
+++ b/tests/functional/idx_fortran/idx_tests.F90
@@ -42,7 +42,7 @@ program idx_tests_fortran
   real(8) :: expected
 
   call op_init_base(0, 0)
-  call op_timing2_start("FortranIdxTests")
+  call op_profile_start("FortranIdxTests")
 
   call get_rank_and_size(my_rank, comm_size)
 
@@ -157,10 +157,10 @@ program idx_tests_fortran
 
   deallocate(fetched)
 
-  call op_timing2_finish()
+  call op_profile_end()
   
   if (op_is_root() == 1) print *
-    call op_timing2_output()
+  call op_profile_output()  ! not part of the IF above: runs on every rank
 
   call op_exit()
 
diff --git a/tests/functional/strides/stride_tests.cpp b/tests/functional/strides/stride_tests.cpp
index 28ede30f6..ab1e92f4c 100644
--- a/tests/functional/strides/stride_tests.cpp
+++ b/tests/functional/strides/stride_tests.cpp
@@ -5,7 +5,7 @@
 #endif
 
 #include "op_seq.h"
-#include "op_timing2.h"
+#include "op_profile.h"
 
 #include "../utility.h"
 
@@ -41,7 +41,7 @@ void write5_within_kernel(double *dat0, double *dat1, const double *read) {
 int main(int argc, char **argv) {
 
   op_init(argc, argv, 2);
-  op_timing2_start("CppStrideTests");
+  op_profile_start("CppStrideTests");
 
   int my_rank = 0;
   int comm_size = 1;
@@ -112,8 +112,8 @@ int main(int argc, char **argv) {
     printf("write5_within_kernel passed\n");
   }
 
-  op_timing2_finish();
-  op_timing2_output();
+  op_profile_end();
+  op_profile_output();
 
   op_exit();
 
diff --git a/tests/functional/strides_fortran/stride_tests.F90 b/tests/functional/strides_fortran/stride_tests.F90
index 4e191bfb1..d63cbfd33 100644
--- a/tests/functional/strides_fortran/stride_tests.F90
+++ b/tests/functional/strides_fortran/stride_tests.F90
@@ -40,7 +40,7 @@ program stride_tests_fortran
   dim2 = 5
 
   call op_init_base(0, 0)
-  call op_timing2_start("FortranStrideTests")
+  call op_profile_start("FortranStrideTests")
 
   call get_rank_and_size(my_rank, comm_size)
 
@@ -106,10 +106,10 @@ program stride_tests_fortran
   end do
   write(*,*) "write5_within_kernel passed"
 
-  call op_timing2_finish()
+  call op_profile_end()
   
   if (op_is_root() == 1) print *
-    call op_timing2_output()
+  call op_profile_output()  ! not part of the IF above: runs on every rank
   
   call op_exit()
 
diff --git a/translator-v2/resources/templates/cpp/cuda/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/cuda/loop_host.hpp.jinja
index e68105789..fe77087b5 100644
--- a/translator-v2/resources/templates/cpp/cuda/loop_host.hpp.jinja
+++ b/translator-v2/resources/templates/cpp/cuda/loop_host.hpp.jinja
@@ -340,8 +340,11 @@ op_cuda_{{lh.name}}<<<num_blocks, block_size
 
     {% elif config.atomics %}
     for (int round = 0; round < {{"3" if lh.args|gbl|reduction|length > 0 else "2"}}; ++round ) {
-        if (round == 1)
+        if (round == 1) {
+            op_profile_next("MPI Wait");
             op_mpi_wait_all_grouped(num_args_expanded, args_expanded, 2);
+            op_profile_next("Computation");
+        }
 
         {% if lh.args|gbl|reduction|length > 0 %}
         int start = round == 0 ? 0 : (round == 1 ? set->core_size : set->size);
@@ -366,8 +369,11 @@ op_cuda_{{lh.name}}<<<num_blocks, block_size
     }
     {% else %}
     for (int col = 0; col < plan->ncolors; ++col) {
-        if (col == plan->ncolors_core)
+        if (col == plan->ncolors_core) {
+            op_profile_next("MPI Wait");
             op_mpi_wait_all_grouped(num_args_expanded, args_expanded, 2);
+            op_profile_next("Computation");
+        }
 
         int start = plan->col_offsets[0][col];
         int end = plan->col_offsets[0][col + 1];
@@ -408,6 +414,9 @@ op_cuda_{{lh.name}}<<<num_blocks, block_size
     }
 
     {% endfor %}
+    {% if lh.args|gbl|reduction|length > 0 %}
+    op_profile_next("MPI Reduce");
+
     {% for arg in lh.args|gbl|reduction %}
         {% call opt_if(arg) %}
     arg{{arg.id}}.data = (char *)arg{{arg.id}}_host_data;
@@ -415,8 +424,12 @@ op_cuda_{{lh.name}}<<<num_blocks, block_size
         {% endcall %}
 
     {% endfor %}
+    {% endif %}
+    op_profile_exit();
+
     op_mpi_set_dirtybit_cuda(num_args_expanded, args_expanded);
     cutilSafeCall(cudaDeviceSynchronize());
+    op_profile_exit();
 
 {{super()}}
 
diff --git a/translator-v2/resources/templates/cpp/cuda/master_kernel.cu.jinja b/translator-v2/resources/templates/cpp/cuda/master_kernel.cu.jinja
index 41de734d8..d4077fbcb 100644
--- a/translator-v2/resources/templates/cpp/cuda/master_kernel.cu.jinja
+++ b/translator-v2/resources/templates/cpp/cuda/master_kernel.cu.jinja
@@ -16,6 +16,7 @@ __constant__ {{const.typ}} {{const.ptr}}_d{% if const.dim > 1 %}[{{const.dim}}]{
 {{super()}}
 #include "op_cuda_rt_support.h"
 #include "op_cuda_reduction.h"
+#include <op_profile.h>
 {% endblock %}
 
 {% block const_decl_func %}
diff --git a/translator-v2/resources/templates/cpp/hip/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/hip/loop_host.hpp.jinja
index b633a452c..c7aee237f 100644
--- a/translator-v2/resources/templates/cpp/hip/loop_host.hpp.jinja
+++ b/translator-v2/resources/templates/cpp/hip/loop_host.hpp.jinja
@@ -338,8 +338,11 @@ op_hip_{{lh.name}}<<<num_blocks, block_size
 
     {% elif config.atomics %}
     for (int round = 0; round < {{"3" if lh.args|gbl|reduction|length > 0 else "2"}}; ++round ) {
-        if (round == 1)
+        if (round == 1) {
+            op_profile_next("MPI Wait");
             op_mpi_wait_all_grouped(num_args_expanded, args_expanded, 2);
+            op_profile_next("Computation");
+        }
 
         {% if lh.args|gbl|reduction|length > 0 %}
         int start = round == 0 ? 0 : (round == 1 ? set->core_size : set->size);
@@ -364,8 +367,11 @@ op_hip_{{lh.name}}<<<num_blocks, block_size
     }
     {% else %}
     for (int col = 0; col < plan->ncolors; ++col) {
-        if (col == plan->ncolors_core)
+        if (col == plan->ncolors_core) {
+            op_profile_next("MPI Wait");
             op_mpi_wait_all_grouped(num_args_expanded, args_expanded, 2);
+            op_profile_next("Computation");
+        }
 
         int start = plan->col_offsets[0][col];
         int end = plan->col_offsets[0][col + 1];
@@ -406,6 +412,9 @@ op_hip_{{lh.name}}<<<num_blocks, block_size
     }
 
     {% endfor %}
+    {% if lh.args|gbl|reduction|length > 0 %}
+    op_profile_next("MPI Reduce");
+
     {% for arg in lh.args|gbl|reduction %}
         {% call opt_if(arg) %}
     arg{{arg.id}}.data = (char *)arg{{arg.id}}_host_data;
@@ -413,8 +422,12 @@ op_hip_{{lh.name}}<<<num_blocks, block_size
         {% endcall %}
 
     {% endfor %}
+    {% endif %}
+    op_profile_exit();
+
     op_mpi_set_dirtybit_cuda(num_args_expanded, args_expanded);
     cutilSafeCall(hipDeviceSynchronize());
+    op_profile_exit();
 
 {{super()}}
 
diff --git a/translator-v2/resources/templates/cpp/jit_cuda/loop_host.h.jinja b/translator-v2/resources/templates/cpp/jit_cuda/loop_host.h.jinja
index 6ea9bd37e..f8b2e8234 100644
--- a/translator-v2/resources/templates/cpp/jit_cuda/loop_host.h.jinja
+++ b/translator-v2/resources/templates/cpp/jit_cuda/loop_host.h.jinja
@@ -392,10 +392,10 @@ void op_par_loop_{{lh.name}}(
     args[{{loop.index0}}] = arg{{arg.id}};
 {% endfor %}
 
-    // op_timing2_enter_kernel("{{lh.name}}", "c_CUDA", "{{variant_str()}}");
-    // op_timing2_enter("Init");
+    op_profile_enter_kernel("{{lh.name}}", "c_CUDA", "{{variant_str()}}");
+    op_profile_enter("Init");
 
-    // op_timing2_enter("Kernel Info Setup");
+    op_profile_enter("Kernel Info Setup");
 
     static bool first_invocation = true;
     static op::f2c::KernelInfo info("op2_k_{{lh.name}}{{variant}}_wrapper",
@@ -433,7 +433,7 @@ void op_par_loop_{{lh.name}}(
     {%- endfor -%}
     };
 
-    // op_timing2_enter("Plan");
+    op_profile_enter("Plan");
 
 #ifdef OP_PART_SIZE_{{kernel_idx}}
     int part_size = OP_PART_SIZE_{{kernel_idx}};
@@ -465,7 +465,7 @@ void op_par_loop_{{lh.name}}(
     }
 
     max_blocks = std::min(max_blocks, block_limit);
-    // op_timing2_exit();
+    op_profile_exit();
 {% endif %}
 
     if (first_invocation) {
@@ -491,12 +491,12 @@ void op_par_loop_{{lh.name}}(
         first_invocation = false;
     }
 
-    // op_timing2_next("MPI Exchanges");
+    op_profile_next("MPI Exchanges");
     int n_exec = op_mpi_halo_exchanges_grouped(set, n_args, args, 2);
 
     if (n_exec == 0) {
-        // op_timing2_exit();
-        // op_timing2_exit();
+        op_profile_exit();
+        op_profile_exit();
 
         op_mpi_wait_all_grouped(n_args, args, 2);
 
@@ -505,7 +505,7 @@ void op_par_loop_{{lh.name}}(
 {% endfor %}
 
         op_mpi_set_dirtybit_cuda(n_args, args);
-        // op_timing2_exit();
+        op_profile_exit();
         return;
     }
 
@@ -523,9 +523,9 @@ void op_par_loop_{{lh.name}}(
     static {{arg.typ.c()}}* gbl{{arg.id}}_ref_d = nullptr;
 {% endfor %}
 
-    // op_timing2_next("Get Kernel");
+    op_profile_next("Get Kernel");
     auto *kernel_inst = info.get_kernel();
-    // op_timing2_exit();
+    op_profile_exit();
 
 {% if lh.args|opt|length > 0 %}
     unsigned opt_flags = 0;
@@ -534,7 +534,7 @@ void op_par_loop_{{lh.name}}(
     {% endfor %}
 {% endif %}
 
-    // op_timing2_enter("Prepare GBLs");
+    op_profile_enter("Prepare GBLs");
     prepareDeviceGbls(args, n_args, block_size * max_blocks);
     bool exit_sync = false;
 
@@ -542,7 +542,7 @@ void op_par_loop_{{lh.name}}(
     arg{{arg.id}} = args[{{loop.index0}}];
 {% endfor %}
 
-    // op_timing2_next("Update GBL Refs");
+    op_profile_next("Update GBL Refs");
 {% for arg in args_gbl_per_thread|select2("min", "max", "work") %}
     if (gbl{{arg.id}}_ref_d == nullptr{{" && arg%s.opt == 1" % arg.id if arg is opt}}) {
         CUDA_SAFE_CALL({{api_prefix}}Malloc(&gbl{{arg.id}}_ref_d, {{gbl_dim(arg)}} * sizeof({{arg.typ.c()}})));
@@ -553,36 +553,36 @@ void op_par_loop_{{lh.name}}(
 {% endfor %}
 
 {% if args_gbl_per_thread|length > 0 %}
-    // op_timing2_next("Init GBLs");
+    op_profile_next("Init GBLs");
 
     int stride_gbl = block_size * max_blocks;
     {{init_gbls()|indent}}
 {% endif %}
 
-    // op_timing2_exit();
-    // op_timing2_next("Computation");
+    op_profile_exit();
+    op_profile_next("Computation");
 
 {% if lh is direct %}
     int start = 0;
     int end = set->size;
 
-    // op_timing2_enter("Kernel");
+    op_profile_enter("Kernel");
 
     int size = f2c::round32(set->size);
     {{kernel_call()|indent}}
 
-    // op_timing2_next("Process GBLs");
+    op_profile_next("Process GBLs");
     {{process_gbls()|indent}}
 
-    // op_timing2_exit();
+    op_profile_exit();
 {% elif config.atomics %}
-    // op_timing2_enter("Kernel");
+    op_profile_enter("Kernel");
 
     for (int round = 1; round < sections.size(); ++round) {
         if (round == 2) {
-            // op_timing2_next("MPI Wait");
+            op_profile_next("MPI Wait");
             op_mpi_wait_all_grouped(n_args, args, 2);
-            // op_timing2_next("Kernel");
+            op_profile_next("Kernel");
         }
 
         int start = sections[round - 1];
@@ -598,22 +598,22 @@ void op_par_loop_{{lh.name}}(
 
     {% if lh.args|gbl|reject("read")|list|length > 0 %}
         if (round == 2) {
-            // op_timing2_next("Process GBLs");
+            op_profile_next("Process GBLs");
             {{process_gbls()|indent(12)}}
-            // op_timing2_next("Kernel");
+            op_profile_next("Kernel");
         }
     {% endif %}
     }
 
-    // op_timing2_exit();
+    op_profile_exit();
 {% else %}
-    // op_timing2_enter("Kernel");
+    op_profile_enter("Kernel");
 
     for (int col = 0; col < plan->ncolors; ++col) {
         if (col == plan->ncolors_core) {
-            // op_timing2_next("MPI Wait");
+            op_profile_next("MPI Wait");
             op_mpi_wait_all_grouped(n_args, args, 2);
-            // op_timing2_next("Kernel");
+            op_profile_next("Kernel");
         }
 
         int start = plan->col_offsets[0][col];
@@ -627,19 +627,19 @@ void op_par_loop_{{lh.name}}(
 
     {% if lh.args|gbl|reject("read")|list|length > 0 %}
         if (col == plan->ncolors_owned - 1) {
-            // op_timing2_next("Process GBLs");
+            op_profile_next("Process GBLs");
             {{process_gbls()|indent(12)}}
-            // op_timing2_next("Kernel");
+            op_profile_next("Kernel");
         }
     {% endif %}
     }
 
-    // op_timing2_exit();
+    op_profile_exit();
 {% endif %}
 
-    // op_timing2_exit();
+    op_profile_exit();
 
-    // op_timing2_enter("Finalise");
+    op_profile_enter("Finalise");
 {% for arg in lh.args|gbl|reduction %}
     op_mpi_reduce(&arg{{arg.id}}, ({{arg.typ.c()}} *)arg{{arg.id}}.data);
 {% endfor %}
@@ -647,6 +647,6 @@ void op_par_loop_{{lh.name}}(
     op_mpi_set_dirtybit_cuda(n_args, args);
     if (exit_sync) CUDA_SAFE_CALL({{api_prefix}}StreamSynchronize(0));
 
-    // op_timing2_exit();
-    // op_timing2_exit();
+    op_profile_exit();
+    op_profile_exit();
 }
diff --git a/translator-v2/resources/templates/cpp/jit_cuda/master_kernel.cu.jinja b/translator-v2/resources/templates/cpp/jit_cuda/master_kernel.cu.jinja
index 174e7a470..a962fe886 100644
--- a/translator-v2/resources/templates/cpp/jit_cuda/master_kernel.cu.jinja
+++ b/translator-v2/resources/templates/cpp/jit_cuda/master_kernel.cu.jinja
@@ -31,7 +31,7 @@ INCTXT(OP_F2C_PRELUDE, "op_f2c_prelude.h");
 #include <op_f2c_helpers.h>
 
 #include <op_lib_cpp.h>
-#include <op_timing2.h>
+#include <op_profile.h>
 
 #include <cstdint>
 #include <cmath>
diff --git a/translator-v2/resources/templates/cpp/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/loop_host.hpp.jinja
index 94c64aaed..f4aa14906 100644
--- a/translator-v2/resources/templates/cpp/loop_host.hpp.jinja
+++ b/translator-v2/resources/templates/cpp/loop_host.hpp.jinja
@@ -21,6 +21,7 @@ int dats_indirect[{{lh.args_expanded|length}}] = {
     {% endif %}
 {% endmacro %}
 {% block prologue %}
+#include <op_profile.h>
 {% endblock %}
 {% block kernel %}
 namespace op2_k{{kernel_idx}} {
@@ -53,11 +54,16 @@ void op_par_loop_{{lh.name}}(
 
     op_timers_core(&cpu_start, &wall_start);
 
+    op_profile_enter_kernel(name, "{{config.target}}", "{{"Direct" if lh is direct else "Indirect"}}");
+    op_profile_enter("MPI Exchanges");
+
     if (OP_diags > 2)
         printf(" kernel routine ({{"direct" if lh is direct else "indirect"}}): {{lh.name}}\n");
 
     int set_size = op_mpi_halo_exchanges{{"_grouped" if config.grouped-}}
         (set, num_args_expanded, args_expanded{{(", %d" % config.device) if config.grouped}});
+
+    op_profile_next("Computation");
 {% endblock %}
 
 {% block host_loop required %}
diff --git a/translator-v2/resources/templates/cpp/openmp/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/openmp/loop_host.hpp.jinja
index 68438082f..1f2b86e9c 100644
--- a/translator-v2/resources/templates/cpp/openmp/loop_host.hpp.jinja
+++ b/translator-v2/resources/templates/cpp/openmp/loop_host.hpp.jinja
@@ -246,8 +246,11 @@ void {{lh.name}}_wrapper(
     {% else %}
     int block_offset = 0;
     for (int col = 0; col < plan->ncolors; ++col) {
-        if (col == plan->ncolors_core)
+        if (col == plan->ncolors_core) {
+            op_profile_next("MPI Wait");
             op_mpi_wait_all(num_args_expanded, args_expanded);
+            op_profile_next("Computation");
+        }
 
         int num_blocks = plan->ncolblk[col];
 
@@ -312,6 +315,7 @@ void {{lh.name}}_wrapper(
 
 {% block host_epilogue %}
     {% if lh is indirect -%} {# TODO: is this indirect check necessary? #}
+    op_profile_next("MPI Wait");
     if (set_size == set->core_size)
         op_mpi_wait_all(num_args_expanded, args_expanded);
 
@@ -338,10 +342,17 @@ void {{lh.name}}_wrapper(
             {% endfor %}
 
     {% endif %}
+    {% if lh.args|gbl|reduction|length > 0 %}
+    op_profile_next("MPI Reduce");
+
     {% for arg in lh.args|gbl|reduction %}
     op_mpi_reduce(&arg{{arg.id}}, gbl{{arg.id}});
     {% endfor %}
+    {% endif %}
+    op_profile_exit();
+
     op_mpi_set_dirtybit(num_args_expanded, args_expanded);
+    op_profile_exit();
 
 {{super()}}
 {% endblock %}
diff --git a/translator-v2/resources/templates/cpp/seq/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/seq/loop_host.hpp.jinja
index a61fc2f2b..6ce103a0e 100644
--- a/translator-v2/resources/templates/cpp/seq/loop_host.hpp.jinja
+++ b/translator-v2/resources/templates/cpp/seq/loop_host.hpp.jinja
@@ -65,6 +65,7 @@ info{{arg.id}}_temp
 {%- endmacro -%}
 
 #include <op_lib_cpp.h>
+#include <op_profile.h>
 
 #include <cstdint>
 #include <cmath>
@@ -89,8 +90,13 @@ void op_par_loop_{{lh.name}}(
     args[{{loop.index0}}] = arg{{arg.id}};
 {% endfor %}
 
+    op_profile_enter_kernel("{{lh.name}}", "seq", "{{variant_str()}}");
+
+    op_profile_enter("MPI Exchanges");
     int n_exec = op_mpi_halo_exchanges(set, n_args, args);
 
+    op_profile_next("Computation");
+
 {% for arg in lh.args|gbl|reject("read") if lh is indirect %}
     {{arg.typ.c()}} gbl{{arg.id}}_temp[{{arg_dim(arg)}}];
 {% endfor %}
@@ -108,7 +114,9 @@ void op_par_loop_{{lh.name}}(
     for (int n = 0; n < n_exec; ++n) {
 {% if lh is indirect %}
         if (n == set->core_size) {
+            op_profile_next("MPI Wait");
             op_mpi_wait_all(n_args, args);
+            op_profile_next("Computation");
         }
 
     {% for map in lh.maps %}
@@ -152,14 +160,20 @@ void op_par_loop_{{lh.name}}(
     }
 {% endif %}
 
+    op_profile_next("MPI Wait");
     if (n_exec == 0 || n_exec == set->core_size)
         op_mpi_wait_all(n_args, args);
 
 {% if lh.args|gbl|reduction|length > 0 %}
+    op_profile_next("MPI Reduce");
+
     {% for arg in lh.args|gbl|reduction %}
     op_mpi_reduce(&arg{{arg.id}}, ({{arg.typ.c()}} *)arg{{arg.id}}.data);
     {% endfor %}
 {% endif %}
 
+    op_profile_exit();
+
     op_mpi_set_dirtybit(n_args, args);
+    op_profile_exit();
 }
diff --git a/translator-v2/resources/templates/fortran/c_cuda/loop_host.cuh.jinja b/translator-v2/resources/templates/fortran/c_cuda/loop_host.cuh.jinja
index 6925b4ae6..0b34a3674 100644
--- a/translator-v2/resources/templates/fortran/c_cuda/loop_host.cuh.jinja
+++ b/translator-v2/resources/templates/fortran/c_cuda/loop_host.cuh.jinja
@@ -319,6 +319,8 @@ void op2_k_{{lh.name}}{{variant}}_wrapper(
 const char op2_k_{{lh.name}}{{variant}}_src[] = R"_op2_k(
 namespace op2_m_{{lh.name}}{{variant}} {
 
+using int64_t = long long int;
+
 {{kernel_func}}}
 
 {{kernel_wrapper(jit=true)}}
@@ -374,10 +376,10 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     int n_args = {{lh.args|length}};
     op_arg args[{{lh.args|length}}];
 
-    op_timing2_enter_kernel("{{lh.name}}", "c_CUDA", "{{variant_str()}}");
-    op_timing2_enter("Init");
+    op_profile_enter_kernel("{{lh.name}}", "c_CUDA", "{{variant_str()}}");
+    op_profile_enter("Init");
 
-    op_timing2_enter("Kernel Info Setup");
+    op_profile_enter("Kernel Info Setup");
 
     static bool first_invocation = true;
     static op::f2c::KernelInfo info("op2_k_{{lh.name}}{{variant}}_wrapper",
@@ -409,12 +411,12 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     args[{{loop.index0}}] = arg{{arg.id}};
 {% endfor %}
 
-    op_timing2_next("MPI Exchanges");
+    op_profile_next("MPI Exchanges");
     int n_exec = op_mpi_halo_exchanges_grouped(set, n_args, args, 2);
 
     if (n_exec == 0) {
-        op_timing2_exit();
-        op_timing2_exit();
+        op_profile_exit();
+        op_profile_exit();
 
         op_mpi_wait_all_grouped(n_args, args, 2);
 
@@ -423,7 +425,7 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
 {% endfor %}
 
         op_mpi_set_dirtybit_cuda(n_args, args);
-        op_timing2_exit();
+        op_profile_exit();
         return;
     }
 
@@ -444,9 +446,9 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     static {{arg.typ.c()}}* gbl{{arg.id}}_ref_d = nullptr;
 {% endfor %}
 
-    op_timing2_next("Get Kernel");
+    op_profile_next("Get Kernel");
     auto *kernel_inst = info.get_kernel();
-    op_timing2_exit();
+    op_profile_exit();
 
 {% if lh is direct %}
     auto [block_limit, block_size] = info.get_launch_config(kernel_inst, set->size);
@@ -479,7 +481,7 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     {%- endfor -%}
     };
 
-    op_timing2_enter("Plan");
+    op_profile_enter("Plan");
 
 #ifdef OP_PART_SIZE_{{kernel_idx}}
     int part_size = OP_PART_SIZE_{{kernel_idx}};
@@ -511,7 +513,7 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     }
 
     max_blocks = std::min(max_blocks, block_limit);
-    op_timing2_exit();
+    op_profile_exit();
 {% endif %}
 
 {% if lh.args|opt|length > 0 %}
@@ -521,7 +523,7 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     {% endfor %}
 {% endif %}
 
-    op_timing2_enter("Prepare GBLs");
+    op_profile_enter("Prepare GBLs");
     prepareDeviceGbls(args, n_args, block_size * max_blocks);
     bool exit_sync = false;
 
@@ -529,7 +531,7 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     arg{{arg.id}} = args[{{loop.index0}}];
 {% endfor %}
 
-    op_timing2_next("Update GBL Refs");
+    op_profile_next("Update GBL Refs");
 {% for arg in args_gbl_per_thread|select2("min", "max", "work") %}
     if (gbl{{arg.id}}_ref_d == nullptr{{" && arg%s.opt == 1" % arg.id if arg is opt}}) {
         CUDA_SAFE_CALL({{api_prefix}}Malloc(&gbl{{arg.id}}_ref_d, {{gbl_dim(arg)}} * sizeof({{arg.typ.c()}})));
@@ -540,36 +542,36 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
 {% endfor %}
 
 {% if args_gbl_per_thread|length > 0 %}
-    op_timing2_next("Init GBLs");
+    op_profile_next("Init GBLs");
 
     int stride_gbl = block_size * max_blocks;
     {{init_gbls()|indent}}
 {% endif %}
 
-    op_timing2_exit();
-    op_timing2_next("Computation");
+    op_profile_exit();
+    op_profile_next("Computation");
 
 {% if lh is direct %}
     int start = 0;
     int end = set->size;
 
-    op_timing2_enter("Kernel");
+    op_profile_enter("Kernel");
 
     int size = f2c::round32(set->size);
     {{kernel_call()|indent}}
 
-    op_timing2_next("Process GBLs");
+    op_profile_next("Process GBLs");
     {{process_gbls()|indent}}
 
-    op_timing2_exit();
+    op_profile_exit();
 {% elif config.atomics %}
-    op_timing2_enter("Kernel");
+    op_profile_enter("Kernel");
 
     for (int round = 1; round < sections.size(); ++round) {
         if (round == 2) {
-            op_timing2_next("MPI Wait");
+            op_profile_next("MPI Wait");
             op_mpi_wait_all_grouped(n_args, args, 2);
-            op_timing2_next("Kernel");
+            op_profile_next("Kernel");
         }
 
         int start = sections[round - 1];
@@ -585,22 +587,22 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
 
     {% if lh.args|gbl|reject("read")|list|length > 0 %}
         if (round == 2) {
-            op_timing2_next("Process GBLs");
+            op_profile_next("Process GBLs");
             {{process_gbls()|indent(12)}}
-            op_timing2_next("Kernel");
+            op_profile_next("Kernel");
         }
     {% endif %}
     }
 
-    op_timing2_exit();
+    op_profile_exit();
 {% else %}
-    op_timing2_enter("Kernel");
+    op_profile_enter("Kernel");
 
     for (int col = 0; col < plan->ncolors; ++col) {
         if (col == plan->ncolors_core) {
-            op_timing2_next("MPI Wait");
+            op_profile_next("MPI Wait");
             op_mpi_wait_all_grouped(n_args, args, 2);
-            op_timing2_next("Kernel");
+            op_profile_next("Kernel");
         }
 
         int start = plan->col_offsets[0][col];
@@ -614,19 +616,19 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
 
     {% if lh.args|gbl|reject("read")|list|length > 0 %}
         if (col == plan->ncolors_owned - 1) {
-            op_timing2_next("Process GBLs");
+            op_profile_next("Process GBLs");
             {{process_gbls()|indent(12)}}
-            op_timing2_next("Kernel");
+            op_profile_next("Kernel");
         }
     {% endif %}
     }
 
-    op_timing2_exit();
+    op_profile_exit();
 {% endif %}
 
-    op_timing2_exit();
+    op_profile_exit();
 
-    op_timing2_enter("Finalise");
+    op_profile_enter("Finalise");
 {% for arg in lh.args|gbl|reduction %}
     op_mpi_reduce(&arg{{arg.id}}, ({{arg.typ.c()}} *)arg{{arg.id}}.data);
 {% endfor %}
@@ -634,6 +636,6 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     op_mpi_set_dirtybit_cuda(n_args, args);
     if (exit_sync) CUDA_SAFE_CALL({{api_prefix}}StreamSynchronize(0));
 
-    op_timing2_exit();
-    op_timing2_exit();
+    op_profile_exit();
+    op_profile_exit();
 }
diff --git a/translator-v2/resources/templates/fortran/c_cuda/master_kernel.cu.jinja b/translator-v2/resources/templates/fortran/c_cuda/master_kernel.cu.jinja
index e71dbcdcc..9b2cbdbb4 100644
--- a/translator-v2/resources/templates/fortran/c_cuda/master_kernel.cu.jinja
+++ b/translator-v2/resources/templates/fortran/c_cuda/master_kernel.cu.jinja
@@ -52,7 +52,7 @@ INCTXT(OP_F2C_PRELUDE, "op_f2c_prelude.h");
 #include <op_f2c_helpers.h>
 
 #include <op_lib_cpp.h>
-#include <op_timing2.h>
+#include <op_profile.h>
 
 #include <cstdint>
 #include <cmath>
diff --git a/translator-v2/resources/templates/fortran/c_seq/loop_host.cpp.jinja b/translator-v2/resources/templates/fortran/c_seq/loop_host.cpp.jinja
index 62bb3ce7f..9dc01581d 100644
--- a/translator-v2/resources/templates/fortran/c_seq/loop_host.cpp.jinja
+++ b/translator-v2/resources/templates/fortran/c_seq/loop_host.cpp.jinja
@@ -83,7 +83,7 @@ info{{arg.id}}_temp
 
 #include <op_f2c_prelude.h>
 #include <op_lib_cpp.h>
-#include <op_timing2.h>
+#include <op_profile.h>
 
 #include <cstdint>
 #include <cmath>
@@ -109,12 +109,12 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     args[{{loop.index0}}] = arg{{arg.id}};
 {% endfor %}
 
-    op_timing2_enter_kernel("{{lh.name}}", "c_seq", "{{variant_str()}}");
+    op_profile_enter_kernel("{{lh.name}}", "c_seq", "{{variant_str()}}");
 
-    op_timing2_enter("MPI Exchanges");
+    op_profile_enter("MPI Exchanges");
     int n_exec = op_mpi_halo_exchanges(set, n_args, args);
 
-    op_timing2_next("Computation");
+    op_profile_next("Computation");
 
 {% for arg in lh.args|gbl|reject("read") if lh is indirect %}
     {{arg.typ.c()}} gbl{{arg.id}}_temp[{{arg_dim(arg)}}];
@@ -138,9 +138,9 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     for (int n = 0; n < n_exec; ++n) {
 {% if lh is indirect %}
         if (n == set->core_size) {
-            op_timing2_next("MPI Wait");
+            op_profile_next("MPI Wait");
             op_mpi_wait_all(n_args, args);
-            op_timing2_next("Computation");
+            op_profile_next("Computation");
         }
 
     {% for map in lh.maps %}
@@ -184,19 +184,19 @@ extern "C" void op2_k_{{lh.name}}{{variant}}_c(
     }
 {% endif %}
 
-    op_timing2_next("MPI Wait");
+    op_profile_next("MPI Wait");
     if (n_exec == 0 || n_exec == set->core_size)
         op_mpi_wait_all(n_args, args);
 
 {% if lh.args|gbl|reduction|length > 0 %}
-    op_timing2_next("MPI Reduce");
+    op_profile_next("MPI Reduce");
 
     {% for arg in lh.args|gbl|reduction %}
     op_mpi_reduce(&arg{{arg.id}}, ({{arg.typ.c()}} *)arg{{arg.id}}.data);
     {% endfor %}
 {% endif %}
-    op_timing2_exit();
+    op_profile_exit();
 
     op_mpi_set_dirtybit(n_args, args);
-    op_timing2_exit();
+    op_profile_exit();
 }
diff --git a/translator-v2/resources/templates/fortran/cuda/loop_host.CUF.jinja b/translator-v2/resources/templates/fortran/cuda/loop_host.CUF.jinja
index a212a027d..0165efde1 100644
--- a/translator-v2/resources/templates/fortran/cuda/loop_host.CUF.jinja
+++ b/translator-v2/resources/templates/fortran/cuda/loop_host.CUF.jinja
@@ -386,15 +386,15 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
     args({{loop.index}}) = arg{{arg.id}}
 {% endfor %}
 
-    call op_timing2_enter_kernel("{{lh.name}}", "CUDA", "{{variant_str()}}")
-    call op_timing2_enter("Init")
+    call op_profile_enter_kernel("{{lh.name}}", "CUDA", "{{variant_str()}}")
+    call op_profile_enter("Init")
 
-    call op_timing2_enter("MPI Exchanges")
+    call op_profile_enter("MPI Exchanges")
     n_exec = op_mpi_halo_exchanges_grouped(set%setcptr, size(args), args, 2)
 
     if (n_exec == 0) then
-        call op_timing2_exit()
-        call op_timing2_exit()
+        call op_profile_exit()
+        call op_profile_exit()
 
         call op_mpi_wait_all_grouped(size(args), args, 2)
 {% for arg in lh.args|gbl|reduction %}
@@ -407,15 +407,15 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
             print *, cudaGetErrorString(err)
         end if
 
-        call op_timing2_exit()
+        call op_profile_exit()
         return
     end if
 
-    call op_timing2_next("Update consts")
+    call op_profile_next("Update consts")
 {% for const in lh.consts %}
     call op_update_const_cuda_{{const}}(){{"\n" if loop.last}}
 {% endfor %}
-    call op_timing2_exit()
+    call op_profile_exit()
 
     call setGblIncAtomic(logical({{".true." if config.gbl_inc_atomic else ".false."}}, c_bool))
     block_size = getBlockSize(name // c_null_char, set%setptr%size)
@@ -438,7 +438,7 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
     {%- endfor -%}
     /)
 
-    call op_timing2_enter("Plan")
+    call op_profile_enter("Plan")
 
     part_size = getpartitionsize(name // c_null_char, set%setptr%size)
     plan => fortranplancaller( &
@@ -466,12 +466,12 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
         max_blocks = max(max_blocks, num_blocks)
     end do
 
-    call op_timing2_exit()
+    call op_profile_exit()
 {% endif %}
 
-    call op_timing2_enter("Prepare GBLs")
+    call op_profile_enter("Prepare GBLs")
     call prepareDeviceGbls(args, size(args), block_size * max_blocks)
-    call op_timing2_exit()
+    call op_profile_exit()
 
 {% for arg in lh.args %}
     arg{{arg.id}} = args({{loop.index}})
@@ -524,23 +524,23 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
 
 {% endfor %}
 {% if args_gbl_per_thread|length > 0 %}
-    call op_timing2_enter("Init GBLs")
+    call op_profile_enter("Init GBLs")
     {{init_gbls()|indent}}
 
-    call op_timing2_exit()
+    call op_profile_exit()
 {% endif %}
-    call op_timing2_next("Computation")
+    call op_profile_next("Computation")
 {% if lh is direct %}
     start = 0
     end = set%setptr%size
 
-    call op_timing2_enter("Kernel")
+    call op_profile_enter("Kernel")
     {{kernel_call("set%setptr%size")|indent}}{{"\n" if lh.args|gbl|reject("read")|list|length > 0}}
 
-    call op_timing2_next("Process GBLs")
+    call op_profile_next("Process GBLs")
     {{process_gbls()|indent}}
 
-    call op_timing2_exit()
+    call op_profile_exit()
 {% elif config.atomics %}
     {% if lh.args|gbl|reduction|length == 0 %}
     sections = (/0, set%setptr%core_size, set%setptr%size + set%setptr%exec_size, 0/)
@@ -548,12 +548,12 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
     sections = (/0, set%setptr%core_size, set%setptr%size, set%setptr%size + set%setptr%exec_size/)
     {% endif %}
 
-    call op_timing2_enter("Kernel")
+    call op_profile_enter("Kernel")
     do round = 1, {{"3" if lh.args|gbl|reduction|length > 0 else "2"}}
         if (round == 2) then
-            call op_timing2_next("MPI Wait")
+            call op_profile_next("MPI Wait")
             call op_mpi_wait_all_grouped(size(args), args, 2)
-            call op_timing2_next("Kernel")
+            call op_profile_next("Kernel")
         end if
 
         start = sections(round)
@@ -568,21 +568,21 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
     {% if lh.args|gbl|reject("read")|list|length > 0 %}
 
         if (round == 2) then
-            call op_timing2_next("Process GBLs")
+            call op_profile_next("Process GBLs")
             {{process_gbls()|indent(12)}}
-            call op_timing2_next("Kernel")
+            call op_profile_next("Kernel")
         end if
     {% endif %}
     end do
 
-    call op_timing2_exit()
+    call op_profile_exit()
 {% else %}
-    call op_timing2_enter("Kernel")
+    call op_profile_enter("Kernel")
     do col = 1, plan%ncolors
         if (col == plan%ncolors_core + 1) then
-            call op_timing2_next("MPI Wait")
+            call op_profile_next("MPI Wait")
             call op_mpi_wait_all_grouped(size(args), args, 2)
-            call op_timing2_next("Kernel")
+            call op_profile_next("Kernel")
         end if
 
         start = plan_color2_offsets(col)
@@ -595,18 +595,18 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
     {% if lh.args|gbl|reject("read")|list|length > 0 %}
 
         if (col == plan%ncolors_owned) then
-            call op_timing2_next("Process GBLs")
+            call op_profile_next("Process GBLs")
             {{process_gbls()|indent(12)}}
-            call op_timing2_next("Kernel")
+            call op_profile_next("Kernel")
         end if
     {% endif %}
     end do
 
-    call op_timing2_exit()
+    call op_profile_exit()
 {% endif %}
-    call op_timing2_exit()
+    call op_profile_exit()
 
-    call op_timing2_enter("Finalise")
+    call op_profile_enter("Finalise")
 {% for arg in lh.args|gbl|reduction %}
     call op_mpi_reduce_{{type_c(arg)}}(arg{{arg.id}}, arg{{arg.id}}%data)
 {% endfor %}
@@ -618,8 +618,8 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
         print *, cudaGetErrorString(err)
     end if
 
-    call op_timing2_exit()
-    call op_timing2_exit()
+    call op_profile_exit()
+    call op_profile_exit()
 end subroutine
 
 end module
diff --git a/translator-v2/resources/templates/fortran/openmp/loop_host.inc.jinja b/translator-v2/resources/templates/fortran/openmp/loop_host.inc.jinja
index 0b9d7d786..2acc85dbc 100644
--- a/translator-v2/resources/templates/fortran/openmp/loop_host.inc.jinja
+++ b/translator-v2/resources/templates/fortran/openmp/loop_host.inc.jinja
@@ -109,9 +109,6 @@ subroutine {{lh.kernel}}_wrapper2( &
     {{arg.typ}}, dimension({{arg.dim}}) :: info{{arg.id}}{{"\n" if loop.last}}
 {% endfor %}
     integer(4) :: start, end
-{% for dat in lh.dats if dat.dim is none %}
-    integer(4) :: dat{{dat.arg_id}}_dim
-{% endfor %}
 
     ! locals
     integer(4) :: n
@@ -461,8 +458,6 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
 {% for arg in lh.args|info %}
     {{arg.typ}}, pointer, dimension(:) :: info{{arg.id}}{{"\n" if loop.last}}
 {% endfor %}
-    real(8) :: start_time, end_time
-    real(4) :: transfer
 {% for dat in lh.dats if dat.dim is none %}
 
     dat{{dat.arg_id}}_dim = arg{{dat.arg_id}}%dim
@@ -480,9 +475,13 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
     {%- endfor -%}
     /)
 
-    call op_timers_core(start_time)
+    call op_profile_enter_kernel("{{lh.name}}", "openmp", "{{"Direct" if lh is direct else "Indirect"}}")
+
+    call op_profile_enter("MPI Exchanges")
     set_size = op_mpi_halo_exchanges(set%setcptr, size(args), args)
 
+    call op_profile_next("Computation")
+
 {% for dat in lh.dats %}
     call c_f_pointer(arg{{dat.arg_id}}%data, dat{{dat.id}}, (/{{dat_dim_w(dat)}}, getsetsizefromoparg(arg{{dat.arg_id}})/))
         {{-"\n" if loop.last}}
@@ -520,6 +519,7 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
         dats_indirect &
     )
 
+    call op_profile_next("MPI Wait")
     if ((set_size .eq. 0) .or. (set_size .eq. set%setptr%core_size)) then
         call op_mpi_wait_all(size(args), args)
     end if
@@ -527,13 +527,10 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
 {% for arg in lh.args|gbl|reduction %}
     call op_mpi_reduce_{{arg.typ.__repr__()}}(arg{{arg.id}}, arg{{arg.id}}%data){{"\n" if loop.last}}
 {% endfor %}
-    call op_mpi_set_dirtybit(size(args), args)
-    call op_timers_core(end_time)
+    call op_profile_exit()
 
-    ! todo: review kernel transfer calculation
-    transfer = 0.0
-
-    call setkerneltime({{kernel_idx}}, name // c_null_char, end_time - start_time, transfer, 0.0, 1)
+    call op_mpi_set_dirtybit(size(args), args)
+    call op_profile_exit()
 end subroutine
 
 end module
diff --git a/translator-v2/resources/templates/fortran/seq/loop_host.F90.jinja b/translator-v2/resources/templates/fortran/seq/loop_host.F90.jinja
index 7e4aadfca..c7fbea805 100644
--- a/translator-v2/resources/templates/fortran/seq/loop_host.F90.jinja
+++ b/translator-v2/resources/templates/fortran/seq/loop_host.F90.jinja
@@ -128,9 +128,9 @@ subroutine op2_k_{{lh.name}}_wr( &
     do n = 1, n_exec
     {% if lh is indirect %}
         if (n == set%setptr%core_size + 1) then
-            call op_timing2_next("MPI Wait")
+            call op_profile_next("MPI Wait")
             call op_mpi_wait_all(size(args), args)
-            call op_timing2_next("Computation")
+            call op_profile_next("Computation")
         end if
 
     {% endif %}
@@ -204,12 +204,12 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
     args({{arg.id + 1}}) = arg{{arg.id}}
 {% endfor %}
 
-    call op_timing2_enter_kernel("{{lh.name}}", "seq", "{{variant_str()}}")
+    call op_profile_enter_kernel("{{lh.name}}", "seq", "{{variant_str()}}")
 
-    call op_timing2_enter("MPI Exchanges")
+    call op_profile_enter("MPI Exchanges")
     n_exec = op_mpi_halo_exchanges(set%setcptr, size(args), args)
 
-    call op_timing2_next("Computation")
+    call op_profile_next("Computation")
 
 {% for dat in lh.dats %}
     call c_f_pointer(arg{{dat.arg_id}}%data, dat{{dat.id}}, (/{{dat_dim(dat)}}, getsetsizefromoparg(arg{{dat.arg_id}})/))
@@ -246,22 +246,22 @@ subroutine op2_k_{{lh.name}}{{variant}}( &
         args &
     )
 
-    call op_timing2_next("MPI Wait")
+    call op_profile_next("MPI Wait")
     if ((n_exec == 0) .or. (n_exec == set%setptr%core_size)) then
         call op_mpi_wait_all(size(args), args)
     end if
 
 {% if lh.args|gbl|reduction|length > 0 %}
-    call op_timing2_next("MPI Reduce")
+    call op_profile_next("MPI Reduce")
 
     {% for arg in lh.args|gbl|reduction %}
     call op_mpi_reduce_{{arg.typ.__repr__()}}(arg{{arg.id}}, arg{{arg.id}}%data){{"\n" if loop.last}}
     {% endfor %}
 {% endif %}
-    call op_timing2_exit()
+    call op_profile_exit()
 
     call op_mpi_set_dirtybit(size(args), args)
-    call op_timing2_exit()
+    call op_profile_exit()
 end subroutine
 
 end module