diff --git a/.gitignore b/.gitignore
index 16389f34e..215b56e9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,7 @@ CMakeCache.txt
 doc
 
 apps/tensor_times_vector/tensor_times_vector
+
+.cache
+.vscode
+compile_commands.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6a80d9d1..c9012ca2d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,11 +10,13 @@ project(taco
   LANGUAGES C CXX
 )
 option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF)
+option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF)
 option(PYTHON "Build TACO for python environment" OFF)
-option(OPENMP "Build with OpenMP execution support" OFF)
+option(OPENMP "Build with OpenMP execution support" ON)
 option(COVERAGE "Build with code coverage analysis" OFF)
 set(TACO_FEATURE_CUDA 0)
+set(TACO_FEATURE_ISPC 0)
 set(TACO_FEATURE_OPENMP 0)
 set(TACO_FEATURE_PYTHON 0)
 if(CUDA)
   message("-- Searching for CUDA Installation")
@@ -22,6 +24,11 @@ if(CUDA)
   add_definitions(-DCUDA_BUILT)
   set(TACO_FEATURE_CUDA 1)
 endif(CUDA)
+if(ISPC)
+  message("-- Will build with ISPC code generation support")
+  add_definitions(-DISPC_BUILT)
+  set(TACO_FEATURE_ISPC 1)
+endif(ISPC)
 if(OPENMP)
   message("-- Will use OpenMP for parallel execution")
   add_definitions(-DUSE_OPENMP)
@@ -88,6 +95,39 @@ if(OPENMP)
   set(C_CXX_FLAGS "-fopenmp ${C_CXX_FLAGS}")
 endif(OPENMP)
 
+# Locate PAPI. The install prefix can be supplied with -DPAPI_DIR=<prefix> or
+# through the PAPI_DIR environment variable instead of a machine-specific
+# absolute path baked into the build script.
+find_path(PAPI_DIR
+    NAMES include/papi.h
+    HINTS ENV PAPI_DIR
+)
+
+find_library(PAPI_LIBRARIES
+    # Pick the static library first for easier run-time linking.
+    NAMES libpapi.a papi
+    HINTS ${PAPI_DIR}/lib ${HILTIDEPS}/lib
+)
+
+find_path(PAPI_INCLUDE_DIRS
+    NAMES papi.h
+    HINTS ${PAPI_DIR}/include ${HILTIDEPS}/include
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(PAPI DEFAULT_MSG
+    PAPI_LIBRARIES
+    PAPI_INCLUDE_DIRS
+)
+
+mark_as_advanced(
+    PAPI_DIR
+    PAPI_LIBRARIES
+    PAPI_INCLUDE_DIRS
+)
+
+include_directories(${PAPI_INCLUDE_DIRS})
+
 if(COVERAGE)
   find_program(PATH_TO_GCOVR gcovr REQUIRED)
   # add coverage tooling to build flags
@@ -97,7 +137,8 @@ if(COVERAGE)
   message("-- Code coverage analysis (gcovr) enabled")
 endif(COVERAGE)
 
-set(C_CXX_FLAGS "${C_CXX_FLAGS}")
+# PAPI_DIR is an absolute install prefix, so it must not be prefixed with another '/'.
+set(C_CXX_FLAGS "${C_CXX_FLAGS} -I${PAPI_DIR}/include -L${PAPI_DIR}/lib")
 set(CMAKE_C_FLAGS "${C_CXX_FLAGS}")
 set(CMAKE_CXX_FLAGS "${C_CXX_FLAGS} -std=c++14")
 
@@ -110,6 +151,9 @@ set(TACO_INCLUDE_DIR ${TACO_PROJECT_DIR}/include)
 
 enable_testing()
 include_directories(${TACO_INCLUDE_DIR})
+# include_directories("/home/min/a/kadhitha/workspace/my_taco/valgrind")
+# project (ValgrindExample)
+# include (CTest)
 
 set(TACO_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 
diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h
index 36eb34f1a..4db5fcdaf 100644
--- a/include/taco/codegen/module.h
+++ b/include/taco/codegen/module.h
@@ -17,7 +17,7 @@ class Module {
 public:
   /// Create a module for some target
   Module(Target target=getTargetFromEnvironment())
-    : lib_handle(nullptr), moduleFromUserSource(false), target(target) {
+    : lib_handle(nullptr), so_lib_handle(nullptr), moduleFromUserSource(false), target(target) {
     setJITLibname();
     setJITTmpdir();
   }
@@ -44,11 +44,16 @@ class Module {
   /// before calling. If there's no function of this name then a nullptr is
   /// returned.
   void* getFuncPtr(std::string name);
+  void* getFuncPtr(std::string& sofile, std::string name);
 
   /// Call a raw function in this module and return the result
+  int callFuncPackedRaw(std::string name, std::string& sofile, void** args);
   int callFuncPackedRaw(std::string name, void** args);
   
   /// Call a raw function in this module and return the result
+  int callFuncPackedRaw(std::string name, std::string& sofile, std::vector<void*> args) {
+    return callFuncPackedRaw(name, sofile, args.data());
+  }
   int callFuncPackedRaw(std::string name, std::vector<void*> args) {
     return callFuncPackedRaw(name, args.data());
   }
@@ -57,6 +62,10 @@ class Module {
   int callFuncPacked(std::string name, void** args) {
     return callFuncPackedRaw("_shim_"+name, args);
   }
+
+  int callFuncPacked(std::string name, std::string& sofile, void** args) {
+    return callFuncPackedRaw("_shim_"+name, sofile,args);
+  }
   
   /// Call a function using the taco_tensor_t interface and return the result
   int callFuncPacked(std::string name, std::vector<void*> args) {
@@ -68,10 +77,12 @@ class Module {
   
 private:
   std::stringstream source;
+  std::stringstream additional_source;
   std::stringstream header;
   std::string libname;
   std::string tmpdir;
   void* lib_handle;
+  void* so_lib_handle;
   std::vector<Stmt> funcs;
   
   // true iff the module was created from user-provided source
diff --git a/include/taco/cuda.h b/include/taco/cuda.h
index aad6b5229..9c4a7aae9 100644
--- a/include/taco/cuda.h
+++ b/include/taco/cuda.h
@@ -9,7 +9,19 @@
   #define CUDA_BUILT false
 #endif
 
+#ifndef ISPC_BUILT
+  #define ISPC_BUILT false
+#endif
+
 namespace taco {
+
+/// Functions used by taco to interface with ISPC
+bool should_use_ISPC_codegen();
+void set_ISPC_codegen_enabled(bool enabled);
+bool is_ISPC_code_stream_enabled();
+void set_ISPC_code_stream_enabled(bool enabled);
+
+
 /// Functions used by taco to interface with CUDA (especially unified memory)
 /// Check if should use CUDA codegen
 bool should_use_CUDA_codegen();
diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h
index 7aa2579ad..4d6ec6830 100644
--- a/include/taco/index_notation/transformations.h
+++ b/include/taco/index_notation/transformations.h
@@ -223,6 +223,9 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt);
  */
 IndexStmt reorderLoopsTopologically(IndexStmt stmt);
 
+IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment,
+  std::string side, int iters);
+
 /**
  * Performs scalar promotion so that reductions are done by accumulating into 
  * scalar temporaries whenever possible.
diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h
index f852f26b1..96dc7d034 100644
--- a/include/taco/ir/ir.h
+++ b/include/taco/ir/ir.h
@@ -591,7 +591,7 @@ struct Switch : public StmtNode<Switch> {
   static const IRNodeType _type_info = IRNodeType::Switch;
 };
 
-enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked};
+enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread, Init};
 
 /** A for loop from start to end by increment.
  * A vectorized loop will require the increment to be 1 and the
diff --git a/include/taco/ir/ir_printer.h b/include/taco/ir/ir_printer.h
index 4e50764e9..c2c505bf5 100644
--- a/include/taco/ir/ir_printer.h
+++ b/include/taco/ir/ir_printer.h
@@ -16,6 +16,7 @@ class IRPrinter : public IRVisitorStrict {
 public:
   IRPrinter(std::ostream& stream);
   IRPrinter(std::ostream& stream, bool color, bool simplify);
+  IRPrinter(std::ostream& stream, std::ostream& stream2, bool color, bool simplify);
   virtual ~IRPrinter();
 
   void setColor(bool color);
@@ -72,6 +73,7 @@ class IRPrinter : public IRVisitorStrict {
   virtual void visit(const Break*);
 
   std::ostream &stream;
+  std::ostream &stream2;
   int indent;
   bool color;
   bool simplify;
@@ -109,6 +111,7 @@ class IRPrinter : public IRVisitorStrict {
   void doIndent();
   void printBinOp(Expr a, Expr b, std::string op, Precedence precedence);
   bool needsParentheses(Precedence precedence);
+  void sendToStream(std::stringstream &stream);
 
   std::string keywordString(std::string);
   std::string commentString(std::string);
diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h
index 5858a13e3..6a74be173 100644
--- a/include/taco/ir_tags.h
+++ b/include/taco/ir_tags.h
@@ -9,7 +9,7 @@ namespace taco {
 /// ParallelUnit::GPUWarp can be optionally used to allow for GPU warp-level primitives
 /// ParallelUnit::GPUThread causes for every iteration to be executed on a separate GPU thread
 enum class ParallelUnit {
-  NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction
+  NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction, CPUSimd, CPUSpmd
 };
 extern const char *ParallelUnit_NAMES[];
 
diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h
index 65f069fda..d743f5875 100644
--- a/include/taco/lower/lowerer_impl_imperative.h
+++ b/include/taco/lower/lowerer_impl_imperative.h
@@ -499,10 +499,13 @@ class LowererImplImperative : public LowererImpl {
 
   bool emitUnderivedGuards = true;
 
+  int loopDepth = 0;
   int inParallelLoopDepth = 0;
 
   std::map<ParallelUnit, ir::Expr> parallelUnitSizes;
   std::map<ParallelUnit, IndexVar> parallelUnitIndexVars;
+  std::map<int, ParallelUnit> forUnits; // <loopdepth, ParallelUnit>
+  std::map<TensorVar,int> whereTempsWithLoopDepth;
 
   /// Keep track of what IndexVars have already been defined
   std::set<IndexVar> definedIndexVars;
diff --git a/include/taco/taco_tensor_t.h b/include/taco/taco_tensor_t.h
index 20d78bb51..f27acd9c7 100644
--- a/include/taco/taco_tensor_t.h
+++ b/include/taco/taco_tensor_t.h
@@ -6,6 +6,9 @@
 #ifndef TACO_TENSOR_T_DEFINED
 #define TACO_TENSOR_T_DEFINED
 
+#ifdef __cplusplus
+#include <cstdint>  // C++-only header; guarded so C translation units can still include this file
+#endif
 #include <stdint.h>
 
 typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
diff --git a/include/taco/tensor.h b/include/taco/tensor.h
index b91782256..883718fb6 100644
--- a/include/taco/tensor.h
+++ b/include/taco/tensor.h
@@ -413,6 +413,8 @@ class TensorBase {
 
   /// Compile the tensor expression.
   void compile();
+  void compute(std::ofstream& statfile);
+  void compute(std::ofstream& statfile, std::string& sofile);
 
   void compile(IndexStmt stmt, bool assembleWhileCompute=false);
 
diff --git a/include/taco/util/strings.h b/include/taco/util/strings.h
index 5dfb2f174..a3c3d863f 100644
--- a/include/taco/util/strings.h
+++ b/include/taco/util/strings.h
@@ -1,6 +1,7 @@
 #ifndef TACO_UTIL_STRINGS_H
 #define TACO_UTIL_STRINGS_H
 
+#include "taco/cuda.h"
 #include <string>
 #include <sstream>
 #include <vector>
@@ -8,6 +9,8 @@
 #include <iomanip>
 #include <limits>
 
+#include "taco/type.h"
+
 // To get the value of a compiler macro variable
 #define STRINGIFY(x) #x
 #define TO_STRING(x) STRINGIFY(x)
@@ -15,6 +18,25 @@
 namespace taco {
 namespace util {
 
+// /// Turn anything except floating points that can be written to a stream
+// /// into a string.
+// template <class T>
+// typename std::enable_if<!std::is_floating_point<T>::value, std::string>::type
+// toStringISPC(const T &val) {
+
+//   std::stringstream sstream;
+//   if (val == Int32) {
+//     sstream << "int32";
+//   }
+//   else if (val == Int64) {
+//     sstream << "int64";
+//   }
+//   else {
+//     sstream << val;
+//   }
+//   return sstream.str();
+// }
+
 /// Turn anything except floating points that can be written to a stream
 /// into a string.
 template <class T>
diff --git a/include/taco/version.h.in b/include/taco/version.h.in
index bc5559d7d..8ef507598 100644
--- a/include/taco/version.h.in
+++ b/include/taco/version.h.in
@@ -20,5 +20,6 @@
 #define TACO_FEATURE_OPENMP @TACO_FEATURE_OPENMP@
 #define TACO_FEATURE_PYTHON @TACO_FEATURE_PYTHON@
 #define TACO_FEATURE_CUDA   @TACO_FEATURE_CUDA@
+#define TACO_FEATURE_ISPC   @TACO_FEATURE_ISPC@
 
 #endif /* TACO_VERSION_H */
diff --git a/out/taco-uml/._taco.svg b/out/taco-uml/._taco.svg
new file mode 100755
index 000000000..e88dbd51b
Binary files /dev/null and b/out/taco-uml/._taco.svg differ
diff --git a/out/taco-uml/taco.svg b/out/taco-uml/taco.svg
new file mode 100644
index 000000000..57f7a18d1
--- /dev/null
+++ b/out/taco-uml/taco.svg
@@ -0,0 +1,878 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" contentScriptType="application/ecmascript" contentStyleType="text/css" height="1823px" preserveAspectRatio="none" style="width:3568px;height:1823px;background:#FFFFFF;" version="1.1" viewBox="0 0 3568 1823" width="3568px" zoomAndPan="magnify"><defs><filter height="300%" id="fujoep6dbpit" width="300%" x="-1" y="-1"><feGaussianBlur result="blurOut" stdDeviation="2.0"/><feColorMatrix in="blurOut" result="blurOut2" type="matrix" values="0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 .4 0"/><feOffset dx="4.0" dy="4.0" in="blurOut2" result="blurOut3"/><feBlend in="SourceGraphic" in2="blurOut3" mode="normal"/></filter></defs><g><!--MD5=[d414847e5e8717ca0c3531bdd138c8ba]
+class IntrusivePtr--><rect codeLine="4" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IntrusivePtr" style="stroke:#A80036;stroke-width:1.5;" width="103" x="632" y="7"/><ellipse cx="647" cy="23" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M649.9688,28.6406 Q649.3906,28.9375 648.75,29.0781 Q648.1094,29.2344 647.4063,29.2344 Q644.9063,29.2344 643.5781,27.5938 Q642.2656,25.9375 642.2656,22.8125 Q642.2656,19.6875 643.5781,18.0313 Q644.9063,16.375 647.4063,16.375 Q648.1094,16.375 648.75,16.5313 Q649.4063,16.6875 649.9688,16.9844 L649.9688,19.7031 Q649.3438,19.125 648.75,18.8594 Q648.1563,18.5781 647.5313,18.5781 Q646.1875,18.5781 645.5,19.6563 Q644.8125,20.7188 644.8125,22.8125 Q644.8125,24.9063 645.5,25.9844 Q646.1875,27.0469 647.5313,27.0469 Q648.1563,27.0469 648.75,26.7813 Q649.3438,26.5 649.9688,25.9219 L649.9688,28.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="71" x="661" y="27.1543">IntrusivePtr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="633" x2="734" y1="39" y2="39"/><ellipse cx="643" cy="50" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;fill:none;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="33" x="652" y="53.2104">T *ptr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="633" x2="734" y1="59.8047" y2="59.8047"/><!--MD5=[9fb058d7a838b7ba6ed26398a5e03f68]
+class Uncopyable--><rect codeLine="7" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Uncopyable" style="stroke:#A80036;stroke-width:1.5;" width="105" x="786" y="244"/><ellipse cx="801" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M803.9688,265.6406 Q803.3906,265.9375 802.75,266.0781 Q802.1094,266.2344 801.4063,266.2344 Q798.9063,266.2344 797.5781,264.5938 Q796.2656,262.9375 796.2656,259.8125 Q796.2656,256.6875 797.5781,255.0313 Q798.9063,253.375 801.4063,253.375 Q802.1094,253.375 802.75,253.5313 Q803.4063,253.6875 803.9688,253.9844 L803.9688,256.7031 Q803.3438,256.125 802.75,255.8594 Q802.1563,255.5781 801.5313,255.5781 Q800.1875,255.5781 799.5,256.6563 Q798.8125,257.7188 798.8125,259.8125 Q798.8125,261.9063 799.5,262.9844 Q800.1875,264.0469 801.5313,264.0469 Q802.1563,264.0469 802.75,263.7813 Q803.3438,263.5 803.9688,262.9219 L803.9688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="73" x="815" y="264.1543">Uncopyable</text><line style="stroke:#A80036;stroke-width:1.5;" x1="787" x2="890" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="787" x2="890" y1="284" y2="284"/><!--MD5=[f38687c19e1720eba4a1ab1343a37015]
+class IRNode--><rect codeLine="9" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="73.6094" id="IRNode" style="stroke:#A80036;stroke-width:1.5;" width="288" x="197.5" y="548"/><ellipse cx="315.25" cy="564" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M318.2188,569.6406 Q317.6406,569.9375 317,570.0781 Q316.3594,570.2344 315.6563,570.2344 Q313.1563,570.2344 311.8281,568.5938 Q310.5156,566.9375 310.5156,563.8125 Q310.5156,560.6875 311.8281,559.0313 Q313.1563,557.375 315.6563,557.375 Q316.3594,557.375 317,557.5313 Q317.6563,557.6875 318.2188,557.9844 L318.2188,560.7031 Q317.5938,560.125 317,559.8594 Q316.4063,559.5781 315.7813,559.5781 Q314.4375,559.5781 313.75,560.6563 Q313.0625,561.7188 313.0625,563.8125 Q313.0625,565.9063 313.75,566.9844 Q314.4375,568.0469 315.7813,568.0469 Q316.4063,568.0469 317,567.7813 Q317.5938,567.5 318.2188,566.9219 L318.2188,569.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="44" x="335.75" y="568.1543">IRNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="198.5" x2="484.5" y1="580" y2="580"/><line style="stroke:#A80036;stroke-width:1.5;" x1="198.5" x2="484.5" y1="588" y2="588"/><ellipse cx="208.5" cy="599" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="262" x="217.5" y="602.2104">virtual void accept(IRVisitorStrict *v) const = 0</text><ellipse cx="208.5" cy="611.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="217.5" y="615.0151">virtual IRNodeType type_info() const = 0;</text><!--MD5=[bc9d8c255d7fbd519a9f6a6cf76a7a1b]
+class BaseStmtNode--><rect codeLine="14" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="BaseStmtNode" style="stroke:#A80036;stroke-width:1.5;" width="125" x="110" y="830"/><ellipse cx="125" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M127.9688,851.6406 Q127.3906,851.9375 126.75,852.0781 Q126.1094,852.2344 125.4063,852.2344 Q122.9063,852.2344 121.5781,850.5938 Q120.2656,848.9375 120.2656,845.8125 Q120.2656,842.6875 121.5781,841.0313 Q122.9063,839.375 125.4063,839.375 Q126.1094,839.375 126.75,839.5313 Q127.4063,839.6875 127.9688,839.9844 L127.9688,842.7031 Q127.3438,842.125 126.75,841.8594 Q126.1563,841.5781 125.5313,841.5781 Q124.1875,841.5781 123.5,842.6563 Q122.8125,843.7188 122.8125,845.8125 Q122.8125,847.9063 123.5,848.9844 Q124.1875,850.0469 125.5313,850.0469 Q126.1563,850.0469 126.75,849.7813 Q127.3438,849.5 127.9688,848.9219 L127.9688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="93" x="139" y="850.1543">BaseStmtNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="111" x2="234" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="111" x2="234" y1="870" y2="870"/><!--MD5=[27b83928eb4ae87e2fc2e82e735e02cd]
+class BaseExprNode--><rect codeLine="15" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="BaseExprNode" style="stroke:#A80036;stroke-width:1.5;" width="123" x="315" y="823.5"/><ellipse cx="330" cy="839.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M332.9688,845.1406 Q332.3906,845.4375 331.75,845.5781 Q331.1094,845.7344 330.4063,845.7344 Q327.9063,845.7344 326.5781,844.0938 Q325.2656,842.4375 325.2656,839.3125 Q325.2656,836.1875 326.5781,834.5313 Q327.9063,832.875 330.4063,832.875 Q331.1094,832.875 331.75,833.0313 Q332.4063,833.1875 332.9688,833.4844 L332.9688,836.2031 Q332.3438,835.625 331.75,835.3594 Q331.1563,835.0781 330.5313,835.0781 Q329.1875,835.0781 328.5,836.1563 Q327.8125,837.2188 327.8125,839.3125 Q327.8125,841.4063 328.5,842.4844 Q329.1875,843.5469 330.5313,843.5469 Q331.1563,843.5469 331.75,843.2813 Q332.3438,843 332.9688,842.4219 L332.9688,845.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="91" x="344" y="843.6543">BaseExprNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="316" x2="437" y1="855.5" y2="855.5"/><ellipse cx="326" cy="866.5" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;fill:none;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="78" x="335" y="869.7104">Datatype type</text><line style="stroke:#A80036;stroke-width:1.5;" x1="316" x2="437" y1="876.3047" y2="876.3047"/><!--MD5=[d94a4fdce57fa90edc62507e0f6859c0]
+class StmtNode--><rect codeLine="19" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="StmtNode" style="stroke:#A80036;stroke-width:1.5;" width="225" x="15" y="1198"/><ellipse cx="92.25" cy="1214" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M95.2188,1219.6406 Q94.6406,1219.9375 94,1220.0781 Q93.3594,1220.2344 92.6563,1220.2344 Q90.1563,1220.2344 88.8281,1218.5938 Q87.5156,1216.9375 87.5156,1213.8125 Q87.5156,1210.6875 88.8281,1209.0313 Q90.1563,1207.375 92.6563,1207.375 Q93.3594,1207.375 94,1207.5313 Q94.6563,1207.6875 95.2188,1207.9844 L95.2188,1210.7031 Q94.5938,1210.125 94,1209.8594 Q93.4063,1209.5781 92.7813,1209.5781 Q91.4375,1209.5781 90.75,1210.6563 Q90.0625,1211.7188 90.0625,1213.8125 Q90.0625,1215.9063 90.75,1216.9844 Q91.4375,1218.0469 92.7813,1218.0469 Q93.4063,1218.0469 94,1217.7813 Q94.5938,1217.5 95.2188,1216.9219 L95.2188,1219.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="62" x="112.75" y="1218.1543">StmtNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="16" x2="239" y1="1230" y2="1230"/><line style="stroke:#A80036;stroke-width:1.5;" x1="16" x2="239" y1="1238" y2="1238"/><ellipse cx="26" cy="1249" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="199" x="35" y="1252.2104">void accept(IRVisitorStrict *v) const</text><!--MD5=[475d6310b0690b98eac8d3436b0f8c3b]
+class ExprNode--><rect codeLine="22" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="ExprNode" style="stroke:#A80036;stroke-width:1.5;" width="225" x="275" y="1198"/><ellipse cx="353.25" cy="1214" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M356.2188,1219.6406 Q355.6406,1219.9375 355,1220.0781 Q354.3594,1220.2344 353.6563,1220.2344 Q351.1563,1220.2344 349.8281,1218.5938 Q348.5156,1216.9375 348.5156,1213.8125 Q348.5156,1210.6875 349.8281,1209.0313 Q351.1563,1207.375 353.6563,1207.375 Q354.3594,1207.375 355,1207.5313 Q355.6563,1207.6875 356.2188,1207.9844 L356.2188,1210.7031 Q355.5938,1210.125 355,1209.8594 Q354.4063,1209.5781 353.7813,1209.5781 Q352.4375,1209.5781 351.75,1210.6563 Q351.0625,1211.7188 351.0625,1213.8125 Q351.0625,1215.9063 351.75,1216.9844 Q352.4375,1218.0469 353.7813,1218.0469 Q354.4063,1218.0469 355,1217.7813 Q355.5938,1217.5 356.2188,1216.9219 L356.2188,1219.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="60" x="373.75" y="1218.1543">ExprNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="276" x2="499" y1="1230" y2="1230"/><line style="stroke:#A80036;stroke-width:1.5;" x1="276" x2="499" y1="1238" y2="1238"/><ellipse cx="286" cy="1249" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="199" x="295" y="1252.2104">void accept(IRVisitorStrict *v) const</text><!--MD5=[a8ff5e7d622655153c4b3f7a4e4aeffe]
+class IRHandle--><rect codeLine="32" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IRHandle" style="stroke:#A80036;stroke-width:1.5;" width="225" x="72" y="237.5"/><ellipse cx="152.75" cy="253.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M155.7188,259.1406 Q155.1406,259.4375 154.5,259.5781 Q153.8594,259.7344 153.1563,259.7344 Q150.6563,259.7344 149.3281,258.0938 Q148.0156,256.4375 148.0156,253.3125 Q148.0156,250.1875 149.3281,248.5313 Q150.6563,246.875 153.1563,246.875 Q153.8594,246.875 154.5,247.0313 Q155.1563,247.1875 155.7188,247.4844 L155.7188,250.2031 Q155.0938,249.625 154.5,249.3594 Q153.9063,249.0781 153.2813,249.0781 Q151.9375,249.0781 151.25,250.1563 Q150.5625,251.2188 150.5625,253.3125 Q150.5625,255.4063 151.25,256.4844 Q151.9375,257.5469 153.2813,257.5469 Q153.9063,257.5469 154.5,257.2813 Q155.0938,257 155.7188,256.4219 L155.7188,259.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="55" x="173.25" y="257.6543">IRHandle</text><line style="stroke:#A80036;stroke-width:1.5;" x1="73" x2="296" y1="269.5" y2="269.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="73" x2="296" y1="277.5" y2="277.5"/><ellipse cx="83" cy="288.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="199" x="92" y="291.7104">void accept(IRVisitorStrict *v) const</text><!--MD5=[45d7a04dc863bc0ed8f0c57430a02d4a]
+class Expr--><rect codeLine="35" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Expr" style="stroke:#A80036;stroke-width:1.5;" width="59" x="7" y="561"/><ellipse cx="22" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M24.9688,582.6406 Q24.3906,582.9375 23.75,583.0781 Q23.1094,583.2344 22.4063,583.2344 Q19.9063,583.2344 18.5781,581.5938 Q17.2656,579.9375 17.2656,576.8125 Q17.2656,573.6875 18.5781,572.0313 Q19.9063,570.375 22.4063,570.375 Q23.1094,570.375 23.75,570.5313 Q24.4063,570.6875 24.9688,570.9844 L24.9688,573.7031 Q24.3438,573.125 23.75,572.8594 Q23.1563,572.5781 22.5313,572.5781 Q21.1875,572.5781 20.5,573.6563 Q19.8125,574.7188 19.8125,576.8125 Q19.8125,578.9063 20.5,579.9844 Q21.1875,581.0469 22.5313,581.0469 Q23.1563,581.0469 23.75,580.7813 Q24.3438,580.5 24.9688,579.9219 L24.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="27" x="36" y="581.1543">Expr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="8" x2="65" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="8" x2="65" y1="601" y2="601"/><!--MD5=[add513dd89cf3f02144ebc6704fab9f7]
+class Stmt--><rect codeLine="36" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Stmt" style="stroke:#A80036;stroke-width:1.5;" width="61" x="101" y="561"/><ellipse cx="116" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M118.9688,582.6406 Q118.3906,582.9375 117.75,583.0781 Q117.1094,583.2344 116.4063,583.2344 Q113.9063,583.2344 112.5781,581.5938 Q111.2656,579.9375 111.2656,576.8125 Q111.2656,573.6875 112.5781,572.0313 Q113.9063,570.375 116.4063,570.375 Q117.1094,570.375 117.75,570.5313 Q118.4063,570.6875 118.9688,570.9844 L118.9688,573.7031 Q118.3438,573.125 117.75,572.8594 Q117.1563,572.5781 116.5313,572.5781 Q115.1875,572.5781 114.5,573.6563 Q113.8125,574.7188 113.8125,576.8125 Q113.8125,578.9063 114.5,579.9844 Q115.1875,581.0469 116.5313,581.0469 Q117.1563,581.0469 117.75,580.7813 Q118.3438,580.5 118.9688,579.9219 L118.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="29" x="130" y="581.1543">Stmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="102" x2="161" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="102" x2="161" y1="601" y2="601"/><!--MD5=[927685d34b77cdaffb6bcd7c2ecdcc1a]
+class IRVisitorStrict--><rect codeLine="47" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IRVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="262" x="2676.5" y="7"/><ellipse cx="2761.75" cy="23" fill="#B4A7E5" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2757.6719,18.7656 L2757.6719,16.6094 L2765.0625,16.6094 L2765.0625,18.7656 L2762.5938,18.7656 L2762.5938,26.8438 L2765.0625,26.8438 L2765.0625,29 L2757.6719,29 L2757.6719,26.8438 L2760.1406,26.8438 L2760.1406,18.7656 L2757.6719,18.7656 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="83" x="2782.25" y="27.1543">IRVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2677.5" x2="2937.5" y1="39" y2="39"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2677.5" x2="2937.5" y1="47" y2="47"/><ellipse cx="2687.5" cy="58" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="236" x="2696.5" y="61.2104">virtual void visit(const IRNode*) const = 0</text><!--MD5=[b78282c203133343885c01c420157c8a]
+class IRVisitor--><rect codeLine="55" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IRVisitor" style="stroke:#A80036;stroke-width:1.5;" width="203" x="2387" y="237.5"/><ellipse cx="2459.25" cy="253.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2462.2188,259.1406 Q2461.6406,259.4375 2461,259.5781 Q2460.3594,259.7344 2459.6563,259.7344 Q2457.1563,259.7344 2455.8281,258.0938 Q2454.5156,256.4375 2454.5156,253.3125 Q2454.5156,250.1875 2455.8281,248.5313 Q2457.1563,246.875 2459.6563,246.875 Q2460.3594,246.875 2461,247.0313 Q2461.6563,247.1875 2462.2188,247.4844 L2462.2188,250.2031 Q2461.5938,249.625 2461,249.3594 Q2460.4063,249.0781 2459.7813,249.0781 Q2458.4375,249.0781 2457.75,250.1563 Q2457.0625,251.2188 2457.0625,253.3125 Q2457.0625,255.4063 2457.75,256.4844 Q2458.4375,257.5469 2459.7813,257.5469 Q2460.4063,257.5469 2461,257.2813 Q2461.5938,257 2462.2188,256.4219 L2462.2188,259.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="50" x="2479.75" y="257.6543">IRVisitor</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2388" x2="2589" y1="269.5" y2="269.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2388" x2="2589" y1="277.5" y2="277.5"/><ellipse cx="2398" cy="288.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="177" x="2407" y="291.7104">virtual void visit(const IRNode*)</text><!--MD5=[e7ea7c5d2ec9672a3f65e9628a854185]
+class IRRewriter--><rect codeLine="59" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="137.6328" id="IRRewriter" style="stroke:#A80036;stroke-width:1.5;" width="238" x="2688.5" y="199"/><ellipse cx="2772.25" cy="215" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2775.2188,220.6406 Q2774.6406,220.9375 2774,221.0781 Q2773.3594,221.2344 2772.6563,221.2344 Q2770.1563,221.2344 2768.8281,219.5938 Q2767.5156,217.9375 2767.5156,214.8125 Q2767.5156,211.6875 2768.8281,210.0313 Q2770.1563,208.375 2772.6563,208.375 Q2773.3594,208.375 2774,208.5313 Q2774.6563,208.6875 2775.2188,208.9844 L2775.2188,211.7031 Q2774.5938,211.125 2774,210.8594 Q2773.4063,210.5781 2772.7813,210.5781 Q2771.4375,210.5781 2770.75,211.6563 Q2770.0625,212.7188 2770.0625,214.8125 Q2770.0625,216.9063 2770.75,217.9844 Q2771.4375,219.0469 2772.7813,219.0469 Q2773.4063,219.0469 2774,218.7813 Q2774.5938,218.5 2775.2188,217.9219 L2775.2188,220.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="62" x="2792.75" y="219.1543">IRRewriter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2689.5" x2="2925.5" y1="231" y2="231"/><polygon fill="none" points="2699.5,237,2703.5,241,2699.5,245,2695.5,241" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="54" x="2708.5" y="245.2104">Expr expr</text><polygon fill="none" points="2699.5,249.8047,2703.5,253.8047,2699.5,257.8047,2695.5,253.8047" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="57" x="2708.5" y="258.0151">Stmt stmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2689.5" x2="2925.5" y1="264.6094" y2="264.6094"/><polygon fill="#FFFF44" points="2699.5,270.6094,2703.5,274.6094,2699.5,278.6094,2695.5,274.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" 
font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="210" x="2708.5" y="278.8198">virtual void visit(const ExprNode* op)</text><polygon fill="#FFFF44" points="2699.5,283.4141,2703.5,287.4141,2699.5,291.4141,2695.5,287.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="212" x="2708.5" y="291.6245">virtual void visit(const StmtNode* op)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="2712.5" y="304.4292"/><ellipse cx="2699.5" cy="314.0234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="102" x="2708.5" y="317.2339">Expr rewrite(Expr)</text><ellipse cx="2699.5" cy="326.8281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="106" x="2708.5" y="330.0386">Stmt rewrite(Stmt)</text><!--MD5=[fc5b2d51c8ad612433d8a39d4bdd37c4]
+class IRPrinter--><rect codeLine="71" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="278.4844" id="IRPrinter" style="stroke:#A80036;stroke-width:1.5;" width="430" x="3008.5" y="129"/><ellipse cx="3192.75" cy="145" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3195.7188,150.6406 Q3195.1406,150.9375 3194.5,151.0781 Q3193.8594,151.2344 3193.1563,151.2344 Q3190.6563,151.2344 3189.3281,149.5938 Q3188.0156,147.9375 3188.0156,144.8125 Q3188.0156,141.6875 3189.3281,140.0313 Q3190.6563,138.375 3193.1563,138.375 Q3193.8594,138.375 3194.5,138.5313 Q3195.1563,138.6875 3195.7188,138.9844 L3195.7188,141.7031 Q3195.0938,141.125 3194.5,140.8594 Q3193.9063,140.5781 3193.2813,140.5781 Q3191.9375,140.5781 3191.25,141.6563 Q3190.5625,142.7188 3190.5625,144.8125 Q3190.5625,146.9063 3191.25,147.9844 Q3191.9375,149.0469 3193.2813,149.0469 Q3193.9063,149.0469 3194.5,148.7813 Q3195.0938,148.5 3195.7188,147.9219 L3195.7188,150.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="53" x="3213.25" y="149.1543">IRPrinter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3009.5" x2="3437.5" y1="161" y2="161"/><polygon fill="none" points="3019.5,167,3023.5,171,3019.5,175,3015.5,171" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="129" x="3028.5" y="175.2104">std::ostream &amp;stream</text><polygon fill="none" points="3019.5,179.8047,3023.5,183.8047,3019.5,187.8047,3015.5,183.8047" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="136" x="3028.5" y="188.0151">std::ostream &amp;stream2</text><polygon fill="none" points="3019.5,192.6094,3023.5,196.6094,3019.5,200.6094,3015.5,196.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" 
textLength="53" x="3028.5" y="200.8198">int indent</text><polygon fill="none" points="3019.5,205.4141,3023.5,209.4141,3019.5,213.4141,3015.5,209.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="56" x="3028.5" y="213.6245">bool color</text><polygon fill="none" points="3019.5,218.2188,3023.5,222.2188,3019.5,226.2188,3015.5,222.2188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="72" x="3028.5" y="226.4292">bool simplify</text><polygon fill="none" points="3019.5,231.0234,3023.5,235.0234,3019.5,239.0234,3015.5,235.0234" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="102" x="3028.5" y="239.2339">enum Precedence</text><polygon fill="none" points="3019.5,243.8281,3023.5,247.8281,3019.5,251.8281,3015.5,247.8281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="239" x="3028.5" y="252.0386">Precedence parentPrecedence = BOTTOM</text><polygon fill="none" points="3019.5,256.6328,3023.5,260.6328,3019.5,264.6328,3015.5,260.6328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="204" x="3028.5" y="264.8433">NameGenerator varNameGenerator</text><polygon fill="none" points="3019.5,269.4375,3023.5,273.4375,3019.5,277.4375,3015.5,273.4375" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="237" x="3028.5" y="277.6479">scopedMap&lt;Expr, std::String&gt; varNames</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3009.5" x2="3437.5" y1="284.2422" y2="284.2422"/><polygon fill="#FFFF44" 
points="3019.5,290.2422,3023.5,294.2422,3019.5,298.2422,3015.5,294.2422" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="84" x="3028.5" y="298.4526">void doIndent()</text><polygon fill="#FFFF44" points="3019.5,303.0469,3023.5,307.0469,3019.5,311.0469,3015.5,307.0469" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="404" x="3028.5" y="311.2573">void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)</text><polygon fill="#FFFF44" points="3019.5,315.8516,3023.5,319.8516,3019.5,323.8516,3015.5,319.8516" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="131" x="3028.5" y="324.062">void fewMoreMethods()</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="3032.5" y="336.8667"/><polygon fill="#FFFF44" points="3019.5,341.4609,3023.5,345.4609,3019.5,349.4609,3015.5,345.4609" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="192" x="3028.5" y="349.6714">virtual void visit(const ExprNode*)</text><polygon fill="#FFFF44" points="3019.5,354.2656,3023.5,358.2656,3019.5,362.2656,3015.5,358.2656" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="194" x="3028.5" y="362.4761">virtual void visit(const StmtNode*)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="3032.5" y="375.2808"/><ellipse cx="3019.5" cy="384.875" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="112" x="3028.5" y="388.0854">setColor(bool 
color)</text><ellipse cx="3019.5" cy="397.6797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="61" x="3028.5" y="400.8901">print(Stmt)</text><!--MD5=[5f8d54360f7c21960948de60fa30257d]
+class IRVerifier--><rect codeLine="92" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IRVerifier" style="stroke:#A80036;stroke-width:1.5;" width="87" x="2288" y="561"/><ellipse cx="2303" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2305.9688,582.6406 Q2305.3906,582.9375 2304.75,583.0781 Q2304.1094,583.2344 2303.4063,583.2344 Q2300.9063,583.2344 2299.5781,581.5938 Q2298.2656,579.9375 2298.2656,576.8125 Q2298.2656,573.6875 2299.5781,572.0313 Q2300.9063,570.375 2303.4063,570.375 Q2304.1094,570.375 2304.75,570.5313 Q2305.4063,570.6875 2305.9688,570.9844 L2305.9688,573.7031 Q2305.3438,573.125 2304.75,572.8594 Q2304.1563,572.5781 2303.5313,572.5781 Q2302.1875,572.5781 2301.5,573.6563 Q2300.8125,574.7188 2300.8125,576.8125 Q2300.8125,578.9063 2301.5,579.9844 Q2302.1875,581.0469 2303.5313,581.0469 Q2304.1563,581.0469 2304.75,580.7813 Q2305.3438,580.5 2305.9688,579.9219 L2305.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="55" x="2317" y="581.1543">IRVerifier</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2289" x2="2374" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2289" x2="2374" y1="601" y2="601"/><!--MD5=[1e59d9c8d5cb32d21caddc96a281f60c]
+class ExpressionSimplifier--><rect codeLine="101" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="ExpressionSimplifier" style="stroke:#A80036;stroke-width:1.5;" width="156" x="2410.5" y="561"/><ellipse cx="2425.5" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2428.4688,582.6406 Q2427.8906,582.9375 2427.25,583.0781 Q2426.6094,583.2344 2425.9063,583.2344 Q2423.4063,583.2344 2422.0781,581.5938 Q2420.7656,579.9375 2420.7656,576.8125 Q2420.7656,573.6875 2422.0781,572.0313 Q2423.4063,570.375 2425.9063,570.375 Q2426.6094,570.375 2427.25,570.5313 Q2427.9063,570.6875 2428.4688,570.9844 L2428.4688,573.7031 Q2427.8438,573.125 2427.25,572.8594 Q2426.6563,572.5781 2426.0313,572.5781 Q2424.6875,572.5781 2424,573.6563 Q2423.3125,574.7188 2423.3125,576.8125 Q2423.3125,578.9063 2424,579.9844 Q2424.6875,581.0469 2426.0313,581.0469 Q2426.6563,581.0469 2427.25,580.7813 Q2427.8438,580.5 2428.4688,579.9219 L2428.4688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="124" x="2439.5" y="581.1543">ExpressionSimplifier</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2411.5" x2="2565.5" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2411.5" x2="2565.5" y1="601" y2="601"/><!--MD5=[09d0ace23740abc72ce7e8b4f8ae65c7]
+class RemoveRedundantStatements--><rect codeLine="105" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="RemoveRedundantStatements" style="stroke:#A80036;stroke-width:1.5;" width="223" x="2602" y="561"/><ellipse cx="2617" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2619.9688,582.6406 Q2619.3906,582.9375 2618.75,583.0781 Q2618.1094,583.2344 2617.4063,583.2344 Q2614.9063,583.2344 2613.5781,581.5938 Q2612.2656,579.9375 2612.2656,576.8125 Q2612.2656,573.6875 2613.5781,572.0313 Q2614.9063,570.375 2617.4063,570.375 Q2618.1094,570.375 2618.75,570.5313 Q2619.4063,570.6875 2619.9688,570.9844 L2619.9688,573.7031 Q2619.3438,573.125 2618.75,572.8594 Q2618.1563,572.5781 2617.5313,572.5781 Q2616.1875,572.5781 2615.5,573.6563 Q2614.8125,574.7188 2614.8125,576.8125 Q2614.8125,578.9063 2615.5,579.9844 Q2616.1875,581.0469 2617.5313,581.0469 Q2618.1563,581.0469 2618.75,580.7813 Q2619.3438,580.5 2619.9688,579.9219 L2619.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="191" x="2631" y="581.1543">RemoveRedundantStatements</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2603" x2="2824" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2603" x2="2824" y1="601" y2="601"/><!--MD5=[8dd11208bc782b9bc4fe9a727775ac71]
+class RemoveRedundantLoops--><rect codeLine="106" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="RemoveRedundantLoops" style="stroke:#A80036;stroke-width:1.5;" width="187" x="2860" y="561"/><ellipse cx="2875" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2877.9688,582.6406 Q2877.3906,582.9375 2876.75,583.0781 Q2876.1094,583.2344 2875.4063,583.2344 Q2872.9063,583.2344 2871.5781,581.5938 Q2870.2656,579.9375 2870.2656,576.8125 Q2870.2656,573.6875 2871.5781,572.0313 Q2872.9063,570.375 2875.4063,570.375 Q2876.1094,570.375 2876.75,570.5313 Q2877.4063,570.6875 2877.9688,570.9844 L2877.9688,573.7031 Q2877.3438,573.125 2876.75,572.8594 Q2876.1563,572.5781 2875.5313,572.5781 Q2874.1875,572.5781 2873.5,573.6563 Q2872.8125,574.7188 2872.8125,576.8125 Q2872.8125,578.9063 2873.5,579.9844 Q2874.1875,581.0469 2875.5313,581.0469 Q2876.1563,581.0469 2876.75,580.7813 Q2877.3438,580.5 2877.9688,579.9219 L2877.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="155" x="2889" y="581.1543">RemoveRedundantLoops</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2861" x2="3046" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2861" x2="3046" y1="601" y2="601"/><!--MD5=[85eaa2c6ee966b219cfed7e8ed27a206]
+class RemoveDuplicateBody--><rect codeLine="107" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="RemoveDuplicateBody" style="stroke:#A80036;stroke-width:1.5;" width="170" x="3082.5" y="561"/><ellipse cx="3097.5" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3100.4688,582.6406 Q3099.8906,582.9375 3099.25,583.0781 Q3098.6094,583.2344 3097.9063,583.2344 Q3095.4063,583.2344 3094.0781,581.5938 Q3092.7656,579.9375 3092.7656,576.8125 Q3092.7656,573.6875 3094.0781,572.0313 Q3095.4063,570.375 3097.9063,570.375 Q3098.6094,570.375 3099.25,570.5313 Q3099.9063,570.6875 3100.4688,570.9844 L3100.4688,573.7031 Q3099.8438,573.125 3099.25,572.8594 Q3098.6563,572.5781 3098.0313,572.5781 Q3096.6875,572.5781 3096,573.6563 Q3095.3125,574.7188 3095.3125,576.8125 Q3095.3125,578.9063 3096,579.9844 Q3096.6875,581.0469 3098.0313,581.0469 Q3098.6563,581.0469 3099.25,580.7813 Q3099.8438,580.5 3100.4688,579.9219 L3100.4688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="138" x="3111.5" y="581.1543">RemoveDuplicateBody</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3083.5" x2="3251.5" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3083.5" x2="3251.5" y1="601" y2="601"/><!--MD5=[781eb37a56bb69dce1ac0e85789010ac]
+class CodeGen--><rect codeLine="115" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen" style="stroke:#A80036;stroke-width:1.5;" width="89" x="3288" y="561"/><ellipse cx="3303" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3305.9688,582.6406 Q3305.3906,582.9375 3304.75,583.0781 Q3304.1094,583.2344 3303.4063,583.2344 Q3300.9063,583.2344 3299.5781,581.5938 Q3298.2656,579.9375 3298.2656,576.8125 Q3298.2656,573.6875 3299.5781,572.0313 Q3300.9063,570.375 3303.4063,570.375 Q3304.1094,570.375 3304.75,570.5313 Q3305.4063,570.6875 3305.9688,570.9844 L3305.9688,573.7031 Q3305.3438,573.125 3304.75,572.8594 Q3304.1563,572.5781 3303.5313,572.5781 Q3302.1875,572.5781 3301.5,573.6563 Q3300.8125,574.7188 3300.8125,576.8125 Q3300.8125,578.9063 3301.5,579.9844 Q3302.1875,581.0469 3303.5313,581.0469 Q3304.1563,581.0469 3304.75,580.7813 Q3305.3438,580.5 3305.9688,579.9219 L3305.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="57" x="3317" y="581.1543">CodeGen</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3289" x2="3376" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3289" x2="3376" y1="601" y2="601"/><!--MD5=[1c66665a05557eaba0ef54dbe8329f75]
+class CodeGen_C--><rect codeLine="116" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen_C" style="stroke:#A80036;stroke-width:1.5;" width="103" x="3130" y="830"/><ellipse cx="3145" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3147.9688,851.6406 Q3147.3906,851.9375 3146.75,852.0781 Q3146.1094,852.2344 3145.4063,852.2344 Q3142.9063,852.2344 3141.5781,850.5938 Q3140.2656,848.9375 3140.2656,845.8125 Q3140.2656,842.6875 3141.5781,841.0313 Q3142.9063,839.375 3145.4063,839.375 Q3146.1094,839.375 3146.75,839.5313 Q3147.4063,839.6875 3147.9688,839.9844 L3147.9688,842.7031 Q3147.3438,842.125 3146.75,841.8594 Q3146.1563,841.5781 3145.5313,841.5781 Q3144.1875,841.5781 3143.5,842.6563 Q3142.8125,843.7188 3142.8125,845.8125 Q3142.8125,847.9063 3143.5,848.9844 Q3144.1875,850.0469 3145.5313,850.0469 Q3146.1563,850.0469 3146.75,849.7813 Q3147.3438,849.5 3147.9688,848.9219 L3147.9688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="71" x="3159" y="850.1543">CodeGen_C</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3131" x2="3232" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3131" x2="3232" y1="870" y2="870"/><!--MD5=[b05ffbf1810bcc29bd244a8644dcab5e]
+class CodeGen_CUDA--><rect codeLine="117" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen_CUDA" style="stroke:#A80036;stroke-width:1.5;" width="129" x="3268" y="830"/><ellipse cx="3283" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3285.9688,851.6406 Q3285.3906,851.9375 3284.75,852.0781 Q3284.1094,852.2344 3283.4063,852.2344 Q3280.9063,852.2344 3279.5781,850.5938 Q3278.2656,848.9375 3278.2656,845.8125 Q3278.2656,842.6875 3279.5781,841.0313 Q3280.9063,839.375 3283.4063,839.375 Q3284.1094,839.375 3284.75,839.5313 Q3285.4063,839.6875 3285.9688,839.9844 L3285.9688,842.7031 Q3285.3438,842.125 3284.75,841.8594 Q3284.1563,841.5781 3283.5313,841.5781 Q3282.1875,841.5781 3281.5,842.6563 Q3280.8125,843.7188 3280.8125,845.8125 Q3280.8125,847.9063 3281.5,848.9844 Q3282.1875,850.0469 3283.5313,850.0469 Q3284.1563,850.0469 3284.75,849.7813 Q3285.3438,849.5 3285.9688,848.9219 L3285.9688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="97" x="3297" y="850.1543">CodeGen_CUDA</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3269" x2="3396" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3269" x2="3396" y1="870" y2="870"/><!--MD5=[e6fabe1c34e0f779d9281ebc64edf122]
+class CodeGen_ISPC--><rect codeLine="118" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen_ISPC" style="stroke:#A80036;stroke-width:1.5;" width="122" x="3432.5" y="830"/><ellipse cx="3447.5" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3450.4688,851.6406 Q3449.8906,851.9375 3449.25,852.0781 Q3448.6094,852.2344 3447.9063,852.2344 Q3445.4063,852.2344 3444.0781,850.5938 Q3442.7656,848.9375 3442.7656,845.8125 Q3442.7656,842.6875 3444.0781,841.0313 Q3445.4063,839.375 3447.9063,839.375 Q3448.6094,839.375 3449.25,839.5313 Q3449.9063,839.6875 3450.4688,839.9844 L3450.4688,842.7031 Q3449.8438,842.125 3449.25,841.8594 Q3448.6563,841.5781 3448.0313,841.5781 Q3446.6875,841.5781 3446,842.6563 Q3445.3125,843.7188 3445.3125,845.8125 Q3445.3125,847.9063 3446,848.9844 Q3446.6875,850.0469 3448.0313,850.0469 Q3448.6563,850.0469 3449.25,849.7813 Q3449.8438,849.5 3450.4688,848.9219 L3450.4688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="90" x="3461.5" y="850.1543">CodeGen_ISPC</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3433.5" x2="3553.5" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3433.5" x2="3553.5" y1="870" y2="870"/><!--MD5=[a8e9f8a103380e23aa8687dbc5a94fb7]
+class Manageable--><rect codeLine="126" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Manageable" style="stroke:#A80036;stroke-width:1.5;" width="109" x="1221" y="244"/><ellipse cx="1236" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1238.9688,265.6406 Q1238.3906,265.9375 1237.75,266.0781 Q1237.1094,266.2344 1236.4063,266.2344 Q1233.9063,266.2344 1232.5781,264.5938 Q1231.2656,262.9375 1231.2656,259.8125 Q1231.2656,256.6875 1232.5781,255.0313 Q1233.9063,253.375 1236.4063,253.375 Q1237.1094,253.375 1237.75,253.5313 Q1238.4063,253.6875 1238.9688,253.9844 L1238.9688,256.7031 Q1238.3438,256.125 1237.75,255.8594 Q1237.1563,255.5781 1236.5313,255.5781 Q1235.1875,255.5781 1234.5,256.6563 Q1233.8125,257.7188 1233.8125,259.8125 Q1233.8125,261.9063 1234.5,262.9844 Q1235.1875,264.0469 1236.5313,264.0469 Q1237.1563,264.0469 1237.75,263.7813 Q1238.3438,263.5 1238.9688,262.9219 L1238.9688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="77" x="1250" y="264.1543">Manageable</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1222" x2="1329" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1222" x2="1329" y1="284" y2="284"/><!--MD5=[b230114a6dc80ef25a3e5e6e95ae886a]
+class IndexStmtNode--><rect codeLine="127" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IndexStmtNode" style="stroke:#A80036;stroke-width:1.5;" width="325" x="521" y="554.5"/><ellipse cx="631.75" cy="570.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M634.7188,576.1406 Q634.1406,576.4375 633.5,576.5781 Q632.8594,576.7344 632.1563,576.7344 Q629.6563,576.7344 628.3281,575.0938 Q627.0156,573.4375 627.0156,570.3125 Q627.0156,567.1875 628.3281,565.5313 Q629.6563,563.875 632.1563,563.875 Q632.8594,563.875 633.5,564.0313 Q634.1563,564.1875 634.7188,564.4844 L634.7188,567.2031 Q634.0938,566.625 633.5,566.3594 Q632.9063,566.0781 632.2813,566.0781 Q630.9375,566.0781 630.25,567.1563 Q629.5625,568.2188 629.5625,570.3125 Q629.5625,572.4063 630.25,573.4844 Q630.9375,574.5469 632.2813,574.5469 Q632.9063,574.5469 633.5,574.2813 Q634.0938,574 634.7188,573.4219 L634.7188,576.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="95" x="652.25" y="574.6543">IndexStmtNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="522" x2="845" y1="586.5" y2="586.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="522" x2="845" y1="594.5" y2="594.5"/><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="529" y="602.5"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="299" x="541" y="608.7104">virtual void accept(IndexStmtVisitorStrict*) const = 0</text><!--MD5=[d94a097bbd14b86b446d6c306c6327b3]
+class IndexExprNode--><rect codeLine="130" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IndexExprNode" style="stroke:#A80036;stroke-width:1.5;" width="325" x="1281" y="554.5"/><ellipse cx="1392.75" cy="570.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1395.7188,576.1406 Q1395.1406,576.4375 1394.5,576.5781 Q1393.8594,576.7344 1393.1563,576.7344 Q1390.6563,576.7344 1389.3281,575.0938 Q1388.0156,573.4375 1388.0156,570.3125 Q1388.0156,567.1875 1389.3281,565.5313 Q1390.6563,563.875 1393.1563,563.875 Q1393.8594,563.875 1394.5,564.0313 Q1395.1563,564.1875 1395.7188,564.4844 L1395.7188,567.2031 Q1395.0938,566.625 1394.5,566.3594 Q1393.9063,566.0781 1393.2813,566.0781 Q1391.9375,566.0781 1391.25,567.1563 Q1390.5625,568.2188 1390.5625,570.3125 Q1390.5625,572.4063 1391.25,573.4844 Q1391.9375,574.5469 1393.2813,574.5469 Q1393.9063,574.5469 1394.5,574.2813 Q1395.0938,574 1395.7188,573.4219 L1395.7188,576.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="93" x="1413.25" y="574.6543">IndexExprNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1282" x2="1605" y1="586.5" y2="586.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1282" x2="1605" y1="594.5" y2="594.5"/><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="1289" y="602.5"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="299" x="1301" y="608.7104">virtual void accept(IndexStmtVisitorStrict*) const = 0</text><!--MD5=[2ae3d0d839308205eb4a3976239628b6]
+class IndexStmt--><rect codeLine="140" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexStmt" style="stroke:#A80036;stroke-width:1.5;" width="94" x="636.5" y="244"/><ellipse cx="651.5" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M654.4688,265.6406 Q653.8906,265.9375 653.25,266.0781 Q652.6094,266.2344 651.9063,266.2344 Q649.4063,266.2344 648.0781,264.5938 Q646.7656,262.9375 646.7656,259.8125 Q646.7656,256.6875 648.0781,255.0313 Q649.4063,253.375 651.9063,253.375 Q652.6094,253.375 653.25,253.5313 Q653.9063,253.6875 654.4688,253.9844 L654.4688,256.7031 Q653.8438,256.125 653.25,255.8594 Q652.6563,255.5781 652.0313,255.5781 Q650.6875,255.5781 650,256.6563 Q649.3125,257.7188 649.3125,259.8125 Q649.3125,261.9063 650,262.9844 Q650.6875,264.0469 652.0313,264.0469 Q652.6563,264.0469 653.25,263.7813 Q653.8438,263.5 654.4688,262.9219 L654.4688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="62" x="665.5" y="264.1543">IndexStmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="637.5" x2="729.5" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="637.5" x2="729.5" y1="284" y2="284"/><!--MD5=[97c64a8910e96953a95fad8b92c83bb0]
+class IndexExpr--><rect codeLine="141" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexExpr" style="stroke:#A80036;stroke-width:1.5;" width="92" x="1374.5" y="244"/><ellipse cx="1389.5" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1392.4688,265.6406 Q1391.8906,265.9375 1391.25,266.0781 Q1390.6094,266.2344 1389.9063,266.2344 Q1387.4063,266.2344 1386.0781,264.5938 Q1384.7656,262.9375 1384.7656,259.8125 Q1384.7656,256.6875 1386.0781,255.0313 Q1387.4063,253.375 1389.9063,253.375 Q1390.6094,253.375 1391.25,253.5313 Q1391.9063,253.6875 1392.4688,253.9844 L1392.4688,256.7031 Q1391.8438,256.125 1391.25,255.8594 Q1390.6563,255.5781 1390.0313,255.5781 Q1388.6875,255.5781 1388,256.6563 Q1387.3125,257.7188 1387.3125,259.8125 Q1387.3125,261.9063 1388,262.9844 Q1388.6875,264.0469 1390.0313,264.0469 Q1390.6563,264.0469 1391.25,263.7813 Q1391.8438,263.5 1392.4688,262.9219 L1392.4688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="60" x="1403.5" y="264.1543">IndexExpr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1375.5" x2="1465.5" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1375.5" x2="1465.5" y1="284" y2="284"/><!--MD5=[28b0f4e593c8487512a9debc1bac1917]
+class IndexExprVisitorStrict--><rect codeLine="149" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="201.6563" id="IndexExprVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="283" x="1641" y="484"/><ellipse cx="1711.25" cy="500" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1711.3594,495.3438 L1710.2031,500.4219 L1712.5313,500.4219 L1711.3594,495.3438 Z M1709.875,493.1094 L1712.8594,493.1094 L1716.2188,505.5 L1713.7656,505.5 L1713,502.4375 L1709.7188,502.4375 L1708.9688,505.5 L1706.5313,505.5 L1709.875,493.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="134" x="1731.75" y="504.1543">IndexExprVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1642" x2="1923" y1="516" y2="516"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1642" x2="1923" y1="524" y2="524"/><ellipse cx="1652" cy="535" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="160" x="1661" y="538.2104">void visit(const IndexStmt&amp;)</text><ellipse cx="1652" cy="547.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="1661" y="551.0151">virtual void visit(const AccessNode*) = 0</text><ellipse cx="1652" cy="560.6094" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1661" y="563.8198">virtual void visit(const LiteralNode*) = 0</text><ellipse cx="1652" cy="573.4141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1661" y="576.6245">virtual void visit(const 
NegNode*) = 0</text><ellipse cx="1652" cy="586.2188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="212" x="1661" y="589.4292">virtual void visit(const AddNode*) = 0</text><ellipse cx="1652" cy="599.0234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1661" y="602.2339">virtual void visit(const SubNode*) = 0</text><ellipse cx="1652" cy="611.8281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="210" x="1661" y="615.0386">virtual void visit(const MulNode*) = 0</text><ellipse cx="1652" cy="624.6328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="208" x="1661" y="627.8433">virtual void visit(const DivNode*) = 0</text><ellipse cx="1652" cy="637.4375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="215" x="1661" y="640.6479">virtual void visit(const SqrtNode*) = 0</text><ellipse cx="1652" cy="650.2422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="217" x="1661" y="653.4526">virtual void visit(const CastNode*) = 0</text><ellipse cx="1652" cy="663.0469" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="257" x="1661" y="666.2573">virtual void visit(const CallIntrinsicNode*) = 0</text><ellipse cx="1652" cy="675.8516" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="246" x="1661" y="679.062">virtual void visit(const ReductionNode*) = 0</text><!--MD5=[a89aadb6ea0d27c41410991969988628]
+class IndexStmtVisitorStrict--><rect codeLine="163" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="163.2422" id="IndexStmtVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="284" x="1968.5" y="503.5"/><ellipse cx="2038.75" cy="519.5" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2038.8594,514.8438 L2037.7031,519.9219 L2040.0313,519.9219 L2038.8594,514.8438 Z M2037.375,512.6094 L2040.3594,512.6094 L2043.7188,525 L2041.2656,525 L2040.5,521.9375 L2037.2188,521.9375 L2036.4688,525 L2034.0313,525 L2037.375,512.6094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="135" x="2059.25" y="523.6543">IndexStmtVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2251.5" y1="535.5" y2="535.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2251.5" y1="543.5" y2="543.5"/><ellipse cx="1979.5" cy="554.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="160" x="1988.5" y="557.7104">void visit(const IndexStmt&amp;)</text><ellipse cx="1979.5" cy="567.3047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="258" x="1988.5" y="570.5151">virtual void visit(const AssignmentNode*) = 0</text><ellipse cx="1979.5" cy="580.1094" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1988.5" y="583.3198">virtual void visit(const YieldNode*) = 0</text><ellipse cx="1979.5" cy="592.9141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="222" x="1988.5" 
y="596.1245">virtual void visit(const ForallNode*) = 0</text><ellipse cx="1979.5" cy="605.7188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1988.5" y="608.9292">virtual void visit(const WhereNode*) = 0</text><ellipse cx="1979.5" cy="618.5234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="247" x="1988.5" y="621.7339">virtual void visit(const SequenceNode*) = 0</text><ellipse cx="1979.5" cy="631.3281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="247" x="1988.5" y="634.5386">virtual void visit(const AssembleNode*) = 0</text><ellipse cx="1979.5" cy="644.1328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="217" x="1988.5" y="647.3433">virtual void visit(const MultiNode*) = 0</text><ellipse cx="1979.5" cy="656.9375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="244" x="1988.5" y="660.1479">virtual void visit(const SuchThatNode*) = 0</text><!--MD5=[b74718248e125c8ad329889fd2a32c16]
+class IndexNotationVisitorStrict--><rect codeLine="175" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexNotationVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="192" x="1404.5" y="830"/><ellipse cx="1419.5" cy="846" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1419.6094,841.3438 L1418.4531,846.4219 L1420.7813,846.4219 L1419.6094,841.3438 Z M1418.125,839.1094 L1421.1094,839.1094 L1424.4688,851.5 L1422.0156,851.5 L1421.25,848.4375 L1417.9688,848.4375 L1417.2188,851.5 L1414.7813,851.5 L1418.125,839.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="160" x="1433.5" y="850.1543">IndexNotationVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1405.5" x2="1595.5" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1405.5" x2="1595.5" y1="870" y2="870"/><!--MD5=[cb464207dbcea0ece296242645495747]
+class IndexNotationPrinter--><rect codeLine="176" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="368.1172" id="IndexNotationPrinter" style="stroke:#A80036;stroke-width:1.5;" width="253" x="1301" y="1044.5"/><ellipse cx="1358.75" cy="1060.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1361.7188,1066.1406 Q1361.1406,1066.4375 1360.5,1066.5781 Q1359.8594,1066.7344 1359.1563,1066.7344 Q1356.6563,1066.7344 1355.3281,1065.0938 Q1354.0156,1063.4375 1354.0156,1060.3125 Q1354.0156,1057.1875 1355.3281,1055.5313 Q1356.6563,1053.875 1359.1563,1053.875 Q1359.8594,1053.875 1360.5,1054.0313 Q1361.1563,1054.1875 1361.7188,1054.4844 L1361.7188,1057.2031 Q1361.0938,1056.625 1360.5,1056.3594 Q1359.9063,1056.0781 1359.2813,1056.0781 Q1357.9375,1056.0781 1357.25,1057.1563 Q1356.5625,1058.2188 1356.5625,1060.3125 Q1356.5625,1062.4063 1357.25,1063.4844 Q1357.9375,1064.5469 1359.2813,1064.5469 Q1359.9063,1064.5469 1360.5,1064.2813 Q1361.0938,1064 1361.7188,1063.4219 L1361.7188,1066.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="129" x="1379.25" y="1064.6543">IndexNotationPrinter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1302" x2="1553" y1="1076.5" y2="1076.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1302" x2="1553" y1="1084.5" y2="1084.5"/><ellipse cx="1312" cy="1095.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="190" x="1321" y="1098.7104">void print(const IndexExpr&amp; expr)</text><ellipse cx="1312" cy="1108.3047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="192" x="1321" y="1111.5151">void print(const IndexStmt&amp; expr)</text><text fill="#000000" font-family="sans-serif" font-size="11" 
lengthAdjust="spacing" textLength="0" x="1325" y="1124.3198"/><ellipse cx="1312" cy="1133.9141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="200" x="1321" y="1137.1245">void visit(const AccessNode* node)</text><ellipse cx="1312" cy="1146.7188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1321" y="1149.9292">void visit(const LiteralNode* node)</text><ellipse cx="1312" cy="1159.5234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1321" y="1162.7339">void visit(const NegNode* node)</text><ellipse cx="1312" cy="1172.3281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="181" x="1321" y="1175.5386">void visit(const AddNode* node)</text><ellipse cx="1312" cy="1185.1328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1321" y="1188.3433">void visit(const SubNode* node)</text><ellipse cx="1312" cy="1197.9375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="179" x="1321" y="1201.1479">void visit(const MulNode* node)</text><ellipse cx="1312" cy="1210.7422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="177" x="1321" y="1213.9526">void visit(const DivNode* node)</text><ellipse cx="1312" cy="1223.5469" fill="#84BE84" rx="3" 
ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="184" x="1321" y="1226.7573">void visit(const SqrtNode* node)</text><ellipse cx="1312" cy="1236.3516" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1321" y="1239.562">void visit(const CastNode* node)</text><ellipse cx="1312" cy="1249.1563" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1321" y="1252.3667">void visit(const CallIntrinsicNode* node)</text><ellipse cx="1312" cy="1261.9609" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1321" y="1265.1714">void visit(const UnaryExprNode* node)</text><ellipse cx="1312" cy="1274.7656" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1321" y="1277.9761">void visit(const BinaryExprNode* node)</text><ellipse cx="1312" cy="1287.5703" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="215" x="1321" y="1290.7808">void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1325" y="1303.5854"/><ellipse cx="1312" cy="1313.1797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="227" x="1321" y="1316.3901">void visit(const AssignmentNode* node)</text><ellipse cx="1312" 
cy="1325.9844" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="187" x="1321" y="1329.1948">void visit(const YieldNode* node)</text><ellipse cx="1312" cy="1338.7891" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="191" x="1321" y="1341.9995">void visit(const ForallNode* node)</text><ellipse cx="1312" cy="1351.5938" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1321" y="1354.8042">void visit(const WhereNode* node)</text><ellipse cx="1312" cy="1364.3984" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1321" y="1367.6089">void visit(const SequenceNode* node)</text><ellipse cx="1312" cy="1377.2031" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1321" y="1380.4136">void visit(const AssembleNode* node)</text><ellipse cx="1312" cy="1390.0078" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1321" y="1393.2183">void visit(const MultiNode* node)</text><ellipse cx="1312" cy="1402.8125" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1321" y="1406.0229">void visit(const SuchThatNode* node)</text><!--MD5=[1889949f301ae6d76cb20e56f2d1d951]
+class IndexNotationVisitor--><rect codeLine="205" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="329.7031" id="IndexNotationVisitor" style="stroke:#A80036;stroke-width:1.5;" width="292" x="1589.5" y="1063.5"/><ellipse cx="1668.25" cy="1079.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1671.2188,1085.1406 Q1670.6406,1085.4375 1670,1085.5781 Q1669.3594,1085.7344 1668.6563,1085.7344 Q1666.1563,1085.7344 1664.8281,1084.0938 Q1663.5156,1082.4375 1663.5156,1079.3125 Q1663.5156,1076.1875 1664.8281,1074.5313 Q1666.1563,1072.875 1668.6563,1072.875 Q1669.3594,1072.875 1670,1073.0313 Q1670.6563,1073.1875 1671.2188,1073.4844 L1671.2188,1076.2031 Q1670.5938,1075.625 1670,1075.3594 Q1669.4063,1075.0781 1668.7813,1075.0781 Q1667.4375,1075.0781 1666.75,1076.1563 Q1666.0625,1077.2188 1666.0625,1079.3125 Q1666.0625,1081.4063 1666.75,1082.4844 Q1667.4375,1083.5469 1668.7813,1083.5469 Q1669.4063,1083.5469 1670,1083.2813 Q1670.5938,1083 1671.2188,1082.4219 L1671.2188,1085.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="126" x="1688.75" y="1083.6543">IndexNotationVisitor</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1590.5" x2="1880.5" y1="1095.5" y2="1095.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1590.5" x2="1880.5" y1="1103.5" y2="1103.5"/><ellipse cx="1600.5" cy="1114.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="239" x="1609.5" y="1117.7104">virtual void visit(const AccessNode* node)</text><ellipse cx="1600.5" cy="1127.3047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1609.5" y="1130.5151">virtual void visit(const LiteralNode* node)</text><ellipse cx="1600.5" cy="1140.1094" fill="#84BE84" 
rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1609.5" y="1143.3198">virtual void visit(const NegNode* node)</text><ellipse cx="1600.5" cy="1152.9141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="220" x="1609.5" y="1156.1245">virtual void visit(const AddNode* node)</text><ellipse cx="1600.5" cy="1165.7188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1609.5" y="1168.9292">virtual void visit(const SubNode* node)</text><ellipse cx="1600.5" cy="1178.5234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1609.5" y="1181.7339">virtual void visit(const MulNode* node)</text><ellipse cx="1600.5" cy="1191.3281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1609.5" y="1194.5386">virtual void visit(const DivNode* node)</text><ellipse cx="1600.5" cy="1204.1328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="223" x="1609.5" y="1207.3433">virtual void visit(const SqrtNode* node)</text><ellipse cx="1600.5" cy="1216.9375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1609.5" y="1220.1479">virtual void visit(const CastNode* node)</text><ellipse cx="1600.5" cy="1229.7422" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1609.5" y="1232.9526">virtual void visit(const CallIntrinsicNode* node)</text><ellipse cx="1600.5" cy="1242.5469" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="257" x="1609.5" y="1245.7573">virtual void visit(const UnaryExprNode* node)</text><ellipse cx="1600.5" cy="1255.3516" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="260" x="1609.5" y="1258.562">virtual void visit(const BinaryExprNode* node)</text><ellipse cx="1600.5" cy="1268.1563" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="254" x="1609.5" y="1271.3667">virtual void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1613.5" y="1284.1714"/><ellipse cx="1600.5" cy="1293.7656" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="266" x="1609.5" y="1296.9761">virtual void visit(const AssignmentNode* node)</text><ellipse cx="1600.5" cy="1306.5703" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1609.5" y="1309.7808">virtual void visit(const YieldNode* node)</text><ellipse cx="1600.5" cy="1319.375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="230" x="1609.5" 
y="1322.5854">virtual void visit(const ForallNode* node)</text><ellipse cx="1600.5" cy="1332.1797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1609.5" y="1335.3901">virtual void visit(const WhereNode* node)</text><ellipse cx="1600.5" cy="1344.9844" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1609.5" y="1348.1948">virtual void visit(const SequenceNode* node)</text><ellipse cx="1600.5" cy="1357.7891" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1609.5" y="1360.9995">virtual void visit(const AssembleNode* node)</text><ellipse cx="1600.5" cy="1370.5938" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1609.5" y="1373.8042">virtual void visit(const MultiNode* node)</text><ellipse cx="1600.5" cy="1383.3984" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="252" x="1609.5" y="1386.6089">virtual void visit(const SuchThatNode* node)</text><!--MD5=[c249847c086044a14a4ecd1d09905030]
+class Matcher--><rect codeLine="231" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Matcher" style="stroke:#A80036;stroke-width:1.5;" width="83" x="1694" y="1621"/><ellipse cx="1709" cy="1637" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1711.9688,1642.6406 Q1711.3906,1642.9375 1710.75,1643.0781 Q1710.1094,1643.2344 1709.4063,1643.2344 Q1706.9063,1643.2344 1705.5781,1641.5938 Q1704.2656,1639.9375 1704.2656,1636.8125 Q1704.2656,1633.6875 1705.5781,1632.0313 Q1706.9063,1630.375 1709.4063,1630.375 Q1710.1094,1630.375 1710.75,1630.5313 Q1711.4063,1630.6875 1711.9688,1630.9844 L1711.9688,1633.7031 Q1711.3438,1633.125 1710.75,1632.8594 Q1710.1563,1632.5781 1709.5313,1632.5781 Q1708.1875,1632.5781 1707.5,1633.6563 Q1706.8125,1634.7188 1706.8125,1636.8125 Q1706.8125,1638.9063 1707.5,1639.9844 Q1708.1875,1641.0469 1709.5313,1641.0469 Q1710.1563,1641.0469 1710.75,1640.7813 Q1711.3438,1640.5 1711.9688,1639.9219 L1711.9688,1642.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="51" x="1723" y="1641.1543">Matcher</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1695" x2="1776" y1="1653" y2="1653"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1695" x2="1776" y1="1661" y2="1661"/><!--MD5=[ea8f53988b378f12e96f95ad2b8e8e7e]
+class IndexExprRewriterStrict--><rect codeLine="235" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="214.4609" id="IndexExprRewriterStrict" style="stroke:#A80036;stroke-width:1.5;" width="301" x="1632" y="747"/><ellipse cx="1704.25" cy="763" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1704.3594,758.3438 L1703.2031,763.4219 L1705.5313,763.4219 L1704.3594,758.3438 Z M1702.875,756.1094 L1705.8594,756.1094 L1709.2188,768.5 L1706.7656,768.5 L1706,765.4375 L1702.7188,765.4375 L1701.9688,768.5 L1699.5313,768.5 L1702.875,756.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="148" x="1724.75" y="767.1543">IndexExprRewriterStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1633" x2="1932" y1="779" y2="779"/><polygon fill="none" points="1643,785,1647,789,1643,793,1639,789" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="84" x="1652" y="793.2104">IndexExpr expr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1633" x2="1932" y1="799.8047" y2="799.8047"/><ellipse cx="1643" cy="810.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="162" x="1652" y="814.0151">IndexExpr rewrite(IndexExpr)</text><polygon fill="#FFFF44" points="1643,818.6094,1647,822.6094,1643,826.6094,1639,822.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="249" x="1652" y="826.8198">virtual void visit(const AccessNode* op) = 0</text><polygon fill="#FFFF44" points="1643,831.4141,1647,835.4141,1643,839.4141,1639,835.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="244" 
x="1652" y="839.6245">virtual void visit(const LiteralNode* op) = 0</text><polygon fill="#FFFF44" points="1643,844.2188,1647,848.2188,1643,852.2188,1639,848.2188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="1652" y="852.4292">virtual void visit(const NegNode* op) = 0</text><polygon fill="#FFFF44" points="1643,857.0234,1647,861.0234,1643,865.0234,1639,861.0234" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="233" x="1652" y="865.2339">virtual void visit(const SqrtNode* op) = 0</text><polygon fill="#FFFF44" points="1643,869.8281,1647,873.8281,1643,877.8281,1639,873.8281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="230" x="1652" y="878.0386">virtual void visit(const AddNode* op) = 0</text><polygon fill="#FFFF44" points="1643,882.6328,1647,886.6328,1643,890.6328,1639,886.6328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="1652" y="890.8433">virtual void visit(const SubNode* op) = 0</text><polygon fill="#FFFF44" points="1643,895.4375,1647,899.4375,1643,903.4375,1639,899.4375" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="228" x="1652" y="903.6479">virtual void visit(const MulNode* op) = 0</text><polygon fill="#FFFF44" points="1643,908.2422,1647,912.2422,1643,916.2422,1639,912.2422" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1652" y="916.4526">virtual void visit(const DivNode* op) = 0</text><polygon fill="#FFFF44" points="1643,921.0469,1647,925.0469,1643,929.0469,1639,925.0469" 
style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="235" x="1652" y="929.2573">virtual void visit(const CastNode* op) = 0</text><polygon fill="#FFFF44" points="1643,933.8516,1647,937.8516,1643,941.8516,1639,937.8516" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="275" x="1652" y="942.062">virtual void visit(const CallIntrinsicNode* op) = 0</text><polygon fill="#FFFF44" points="1643,946.6563,1647,950.6563,1643,954.6563,1639,950.6563" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="264" x="1652" y="954.8667">virtual void visit(const ReductionNode* op) = 0</text><!--MD5=[fce5a5c177cad31ce6c931f148bb8f55]
+class IndexStmtRewriterStrict--><rect codeLine="252" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="176.0469" id="IndexStmtRewriterStrict" style="stroke:#A80036;stroke-width:1.5;" width="302" x="1968.5" y="766"/><ellipse cx="2040.75" cy="782" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2040.8594,777.3438 L2039.7031,782.4219 L2042.0313,782.4219 L2040.8594,777.3438 Z M2039.375,775.1094 L2042.3594,775.1094 L2045.7188,787.5 L2043.2656,787.5 L2042.5,784.4375 L2039.2188,784.4375 L2038.4688,787.5 L2036.0313,787.5 L2039.375,775.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="149" x="2061.25" y="786.1543">IndexStmtRewriterStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2269.5" y1="798" y2="798"/><polygon fill="none" points="1979.5,804,1983.5,808,1979.5,812,1975.5,808" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="87" x="1988.5" y="812.2104">IndexStmt stmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2269.5" y1="818.8047" y2="818.8047"/><ellipse cx="1979.5" cy="829.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="166" x="1988.5" y="833.0151">IndexStmt rewrite(IndexStmt)</text><polygon fill="#FFFF44" points="1979.5,837.6094,1983.5,841.6094,1979.5,845.6094,1975.5,841.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="276" x="1988.5" y="845.8198">virtual void visit(const AssignmentNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,850.4141,1983.5,854.4141,1979.5,858.4141,1975.5,854.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" 
font-size="11" lengthAdjust="spacing" textLength="236" x="1988.5" y="858.6245">virtual void visit(const YieldNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,863.2188,1983.5,867.2188,1979.5,871.2188,1975.5,867.2188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="240" x="1988.5" y="871.4292">virtual void visit(const ForallNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,876.0234,1983.5,880.0234,1979.5,884.0234,1975.5,880.0234" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="244" x="1988.5" y="884.2339">virtual void visit(const WhereNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,888.8281,1983.5,892.8281,1979.5,896.8281,1975.5,892.8281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1988.5" y="897.0386">virtual void visit(const SequenceNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,901.6328,1983.5,905.6328,1979.5,909.6328,1975.5,905.6328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1988.5" y="909.8433">virtual void visit(const AssembleNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,914.4375,1983.5,918.4375,1979.5,922.4375,1975.5,918.4375" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="235" x="1988.5" y="922.6479">virtual void visit(const MultiNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,927.2422,1983.5,931.2422,1979.5,935.2422,1975.5,931.2422" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="262" x="1988.5" y="935.4526">virtual void visit(const 
SuchThatNode* op) = 0</text><!--MD5=[c34474f968cd689ed26c36a6e449f9a5]
+class IndexNotationRewriterStrict--><rect codeLine="266" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexNotationRewriterStrict" style="stroke:#A80036;stroke-width:1.5;" width="206" x="1966.5" y="1204.5"/><ellipse cx="1981.5" cy="1220.5" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1981.6094,1215.8438 L1980.4531,1220.9219 L1982.7813,1220.9219 L1981.6094,1215.8438 Z M1980.125,1213.6094 L1983.1094,1213.6094 L1986.4688,1226 L1984.0156,1226 L1983.25,1222.9375 L1979.9688,1222.9375 L1979.2188,1226 L1976.7813,1226 L1980.125,1213.6094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="174" x="1995.5" y="1224.6543">IndexNotationRewriterStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1967.5" x2="2171.5" y1="1236.5" y2="1236.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1967.5" x2="2171.5" y1="1244.5" y2="1244.5"/><!--MD5=[f43b50a501af9b122d481161df5564ac]
+class IndexNotationRewriter--><rect codeLine="267" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="329.7031" id="IndexNotationRewriter" style="stroke:#A80036;stroke-width:1.5;" width="292" x="1923.5" y="1480"/><ellipse cx="1996.25" cy="1496" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1999.2188,1501.6406 Q1998.6406,1501.9375 1998,1502.0781 Q1997.3594,1502.2344 1996.6563,1502.2344 Q1994.1563,1502.2344 1992.8281,1500.5938 Q1991.5156,1498.9375 1991.5156,1495.8125 Q1991.5156,1492.6875 1992.8281,1491.0313 Q1994.1563,1489.375 1996.6563,1489.375 Q1997.3594,1489.375 1998,1489.5313 Q1998.6563,1489.6875 1999.2188,1489.9844 L1999.2188,1492.7031 Q1998.5938,1492.125 1998,1491.8594 Q1997.4063,1491.5781 1996.7813,1491.5781 Q1995.4375,1491.5781 1994.75,1492.6563 Q1994.0625,1493.7188 1994.0625,1495.8125 Q1994.0625,1497.9063 1994.75,1498.9844 Q1995.4375,1500.0469 1996.7813,1500.0469 Q1997.4063,1500.0469 1998,1499.7813 Q1998.5938,1499.5 1999.2188,1498.9219 L1999.2188,1501.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="138" x="2016.75" y="1500.1543">IndexNotationRewriter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1924.5" x2="2214.5" y1="1512" y2="1512"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1924.5" x2="2214.5" y1="1520" y2="1520"/><ellipse cx="1934.5" cy="1531" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="239" x="1943.5" y="1534.2104">virtual void visit(const AccessNode* node)</text><ellipse cx="1934.5" cy="1543.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1943.5" y="1547.0151">virtual void visit(const LiteralNode* node)</text><ellipse cx="1934.5" cy="1556.6094" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1943.5" y="1559.8198">virtual void visit(const NegNode* node)</text><ellipse cx="1934.5" cy="1569.4141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="220" x="1943.5" y="1572.6245">virtual void visit(const AddNode* node)</text><ellipse cx="1934.5" cy="1582.2188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1943.5" y="1585.4292">virtual void visit(const SubNode* node)</text><ellipse cx="1934.5" cy="1595.0234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1943.5" y="1598.2339">virtual void visit(const MulNode* node)</text><ellipse cx="1934.5" cy="1607.8281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1943.5" y="1611.0386">virtual void visit(const DivNode* node)</text><ellipse cx="1934.5" cy="1620.6328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="223" x="1943.5" y="1623.8433">virtual void visit(const SqrtNode* node)</text><ellipse cx="1934.5" cy="1633.4375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1943.5" y="1636.6479">virtual void visit(const CastNode* node)</text><ellipse cx="1934.5" cy="1646.2422" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1943.5" y="1649.4526">virtual void visit(const CallIntrinsicNode* node)</text><ellipse cx="1934.5" cy="1659.0469" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="257" x="1943.5" y="1662.2573">virtual void visit(const UnaryExprNode* node)</text><ellipse cx="1934.5" cy="1671.8516" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="260" x="1943.5" y="1675.062">virtual void visit(const BinaryExprNode* node)</text><ellipse cx="1934.5" cy="1684.6563" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="254" x="1943.5" y="1687.8667">virtual void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1947.5" y="1700.6714"/><ellipse cx="1934.5" cy="1710.2656" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="266" x="1943.5" y="1713.4761">virtual void visit(const AssignmentNode* node)</text><ellipse cx="1934.5" cy="1723.0703" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1943.5" y="1726.2808">virtual void visit(const YieldNode* node)</text><ellipse cx="1934.5" cy="1735.875" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="230" x="1943.5" 
y="1739.0854">virtual void visit(const ForallNode* node)</text><ellipse cx="1934.5" cy="1748.6797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1943.5" y="1751.8901">virtual void visit(const WhereNode* node)</text><ellipse cx="1934.5" cy="1761.4844" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1943.5" y="1764.6948">virtual void visit(const SequenceNode* node)</text><ellipse cx="1934.5" cy="1774.2891" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1943.5" y="1777.4995">virtual void visit(const AssembleNode* node)</text><ellipse cx="1934.5" cy="1787.0938" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1943.5" y="1790.3042">virtual void visit(const MultiNode* node)</text><ellipse cx="1934.5" cy="1799.8984" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="252" x="1943.5" y="1803.1089">virtual void visit(const SuchThatNode* node)</text><!--MD5=[2bd6b9bd378d282739bad95694e0395c]
+class Lowerer--><rect codeLine="317" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="Lowerer" style="stroke:#A80036;stroke-width:1.5;" width="234" x="946.5" y="237.5"/><ellipse cx="1034.75" cy="253.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1037.7188,259.1406 Q1037.1406,259.4375 1036.5,259.5781 Q1035.8594,259.7344 1035.1563,259.7344 Q1032.6563,259.7344 1031.3281,258.0938 Q1030.0156,256.4375 1030.0156,253.3125 Q1030.0156,250.1875 1031.3281,248.5313 Q1032.6563,246.875 1035.1563,246.875 Q1035.8594,246.875 1036.5,247.0313 Q1037.1563,247.1875 1037.7188,247.4844 L1037.7188,250.2031 Q1037.0938,249.625 1036.5,249.3594 Q1035.9063,249.0781 1035.2813,249.0781 Q1033.9375,249.0781 1033.25,250.1563 Q1032.5625,251.2188 1032.5625,253.3125 Q1032.5625,255.4063 1033.25,256.4844 Q1033.9375,257.5469 1035.2813,257.5469 Q1035.9063,257.5469 1036.5,257.2813 Q1037.0938,257 1037.7188,256.4219 L1037.7188,259.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="49" x="1055.25" y="257.6543">Lowerer</text><line style="stroke:#A80036;stroke-width:1.5;" x1="947.5" x2="1179.5" y1="269.5" y2="269.5"/><ellipse cx="957.5" cy="280.5" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;fill:none;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="208" x="966.5" y="283.7104">std::shared_ptr&lt;LowererImpl&gt; impl;</text><line style="stroke:#A80036;stroke-width:1.5;" x1="947.5" x2="1179.5" y1="290.3047" y2="290.3047"/><!--MD5=[b7b8bc7e8eb8ee18eadc3b8fd556bfb2]
+class LowererImpl--><rect codeLine="320" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="188.8516" id="LowererImpl" style="stroke:#A80036;stroke-width:1.5;" width="365" x="881" y="490.5"/><ellipse cx="1020.75" cy="506.5" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1020.8594,501.8438 L1019.7031,506.9219 L1022.0313,506.9219 L1020.8594,501.8438 Z M1019.375,499.6094 L1022.3594,499.6094 L1025.7188,512 L1023.2656,512 L1022.5,508.9375 L1019.2188,508.9375 L1018.4688,512 L1016.0313,512 L1019.375,499.6094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="77" x="1041.25" y="510.6543">LowererImpl</text><line style="stroke:#A80036;stroke-width:1.5;" x1="882" x2="1245" y1="522.5" y2="522.5"/><polygon fill="none" points="892,528.5,896,532.5,892,536.5,888,532.5" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="74" x="901" y="536.7104">class Visitor;</text><polygon fill="none" points="892,541.3047,896,545.3047,892,549.3047,888,545.3047" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="111" x="901" y="549.5151">friend class Visitor;</text><polygon fill="none" points="892,554.1094,896,558.1094,892,562.1094,888,558.1094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="185" x="901" y="562.3198">std::shared_ptr&lt;Visitor&gt; visitor;</text><line style="stroke:#A80036;stroke-width:1.5;" x1="882" x2="1245" y1="568.9141" y2="568.9141"/><polygon fill="#FFFF44" points="892,574.9141,896,578.9141,892,582.9141,888,578.9141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="901" y="583.1245">virtual ir::Stmt 
lower(IndexStmt stmt);</text><polygon fill="#FFFF44" points="892,587.7188,896,591.7188,892,595.7188,888,591.7188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="211" x="901" y="595.9292">virtual ir::Expr lower(IndexExpr expr);</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="905" y="608.7339"/><polygon fill="#FFFF44" points="892,613.3281,896,617.3281,892,621.3281,888,617.3281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="260" x="901" y="621.5386">virtual ir::Expr lowerExpr(IndexExpr expr) = 0;</text><polygon fill="#FFFF44" points="892,626.1328,896,630.1328,892,634.1328,888,630.1328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="267" x="901" y="634.3433">virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="905" y="647.1479"/><ellipse cx="892" cy="656.7422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="315" x="901" y="659.9526">virtual ir::Stmt lower(IndexStmt stmt, std::string name,</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="339" x="901" y="672.7573">bool assemble, bool compute, bool pack, bool unpack) = 0;</text><!--MD5=[cf3b4bcfbe7bc4015089b336f3e5ed76]
+class LowererImplImperative--><rect codeLine="337" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="188.8516" id="LowererImplImperative" style="stroke:#A80036;stroke-width:1.5;" width="337" x="691" y="759.5"/><ellipse cx="785.75" cy="775.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M788.7188,781.1406 Q788.1406,781.4375 787.5,781.5781 Q786.8594,781.7344 786.1563,781.7344 Q783.6563,781.7344 782.3281,780.0938 Q781.0156,778.4375 781.0156,775.3125 Q781.0156,772.1875 782.3281,770.5313 Q783.6563,768.875 786.1563,768.875 Q786.8594,768.875 787.5,769.0313 Q788.1563,769.1875 788.7188,769.4844 L788.7188,772.2031 Q788.0938,771.625 787.5,771.3594 Q786.9063,771.0781 786.2813,771.0781 Q784.9375,771.0781 784.25,772.1563 Q783.5625,773.2188 783.5625,775.3125 Q783.5625,777.4063 784.25,778.4844 Q784.9375,779.5469 786.2813,779.5469 Q786.9063,779.5469 787.5,779.2813 Q788.0938,779 788.7188,778.4219 L788.7188,781.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="139" x="806.25" y="779.6543">LowererImplImperative</text><line style="stroke:#A80036;stroke-width:1.5;" x1="692" x2="1027" y1="791.5" y2="791.5"/><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="799.5"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="70" x="711" y="805.7104">class Visitor</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="812.3047"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="102" x="711" y="818.5151">fiend class Visitor</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="825.1094"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="181" x="711" y="831.3198">std::shared_ptr&lt;Visitor&gt; visitor</text><rect fill="none" height="6" 
style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="837.9141"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="84" x="711" y="844.1245">bool assemble</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="850.7188"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="77" x="711" y="856.9292">bool compute</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="863.5234"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="166" x="711" y="869.7339">vars a_bunch_of_other_fields</text><line style="stroke:#A80036;stroke-width:1.5;" x1="692" x2="1027" y1="876.3281" y2="876.3281"/><polygon fill="#FFFF44" points="702,882.3281,706,886.3281,702,890.3281,698,886.3281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="238" x="711" y="890.5386">virtual ir::Stmt lowerExpr(IndexExpr expr);</text><polygon fill="#FFFF44" points="702,895.1328,706,899.1328,702,903.1328,698,899.1328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="243" x="711" y="903.3433">virtual ir::Stmt lowerStmt(IndexStmt stmt);</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="715" y="916.1479"/><ellipse cx="702" cy="925.7422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="276" x="711" y="928.9526">ir::Stmt lower(IndexStmt stmt, std::string name,</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="311" x="711" y="941.7573">bool assemble, bool compute, bool pack, bool unpack)</text><path 
d="M581,1201 L581,1256.3984 A0,0 0 0 0 581,1256.3984 L946,1256.3984 A0,0 0 0 0 946,1256.3984 L946,1211 L936,1201 L774.6103,1201 L835.2751,948.5022 L766.6103,1201 L581,1201 A0,0 0 0 0 581,1201 " fill="#FBFB77" filter="url(#fujoep6dbpit)" style="stroke:#A80036;stroke-width:1.0;"/><path d="M936,1201 L936,1211 L946,1211 L936,1201 " fill="#FBFB77" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="344" x="587" y="1218.0669">Stmt LowererImplImperative::lower(IndexStmt stmt) {</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="174" x="595" y="1233.1997">return visitor-&gt;lower(stmt);</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="587" y="1248.3325">}</text><!--MD5=[53bf68ed638bcf4718423098b3d480ea]
+class Visitor--><rect codeLine="362" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="380.9219" id="Visitor" style="stroke:#A80036;stroke-width:1.5;" width="253" x="981" y="1038"/><ellipse cx="1083.75" cy="1054" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1086.7188,1059.6406 Q1086.1406,1059.9375 1085.5,1060.0781 Q1084.8594,1060.2344 1084.1563,1060.2344 Q1081.6563,1060.2344 1080.3281,1058.5938 Q1079.0156,1056.9375 1079.0156,1053.8125 Q1079.0156,1050.6875 1080.3281,1049.0313 Q1081.6563,1047.375 1084.1563,1047.375 Q1084.8594,1047.375 1085.5,1047.5313 Q1086.1563,1047.6875 1086.7188,1047.9844 L1086.7188,1050.7031 Q1086.0938,1050.125 1085.5,1049.8594 Q1084.9063,1049.5781 1084.2813,1049.5781 Q1082.9375,1049.5781 1082.25,1050.6563 Q1081.5625,1051.7188 1081.5625,1053.8125 Q1081.5625,1055.9063 1082.25,1056.9844 Q1082.9375,1058.0469 1084.2813,1058.0469 Q1084.9063,1058.0469 1085.5,1057.7813 Q1086.0938,1057.5 1086.7188,1056.9219 L1086.7188,1059.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="39" x="1104.25" y="1058.1543">Visitor</text><line style="stroke:#A80036;stroke-width:1.5;" x1="982" x2="1233" y1="1070" y2="1070"/><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1078"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="104" x="1001" y="1084.2104">LowererImpl* impl</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1090.8047"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="54" x="1001" y="1097.0151">Expr expr</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1103.6094"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="57" x="1001" y="1109.8198">Stmt stmt</text><line 
style="stroke:#A80036;stroke-width:1.5;" x1="982" x2="1233" y1="1116.4141" y2="1116.4141"/><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1124.4141"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="227" x="1001" y="1130.6245">void visit(const AssignmentNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1137.2188"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="187" x="1001" y="1143.4292">void visit(const YieldNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1150.0234"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="191" x="1001" y="1156.2339">void visit(const ForallNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1162.8281"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1001" y="1169.0386">void visit(const WhereNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1175.6328"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1001" y="1181.8433">void visit(const MultiNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1188.4375"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1001" y="1194.6479">void visit(const SuchThatNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1201.2422"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1001" y="1207.4526">void visit(const SequenceNode* node)</text><rect 
fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1214.0469"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1001" y="1220.2573">void visit(const AssembleNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1226.8516"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="200" x="1001" y="1233.062">void visit(const AccessNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1239.6563"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1001" y="1245.8667">void visit(const LiteralNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1252.4609"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1001" y="1258.6714">void visit(const NegNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1265.2656"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="181" x="1001" y="1271.4761">void visit(const AddNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1278.0703"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1001" y="1284.2808">void visit(const SubNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1290.875"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="179" x="1001" y="1297.0854">void visit(const MulNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1303.6797"/><text 
fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="177" x="1001" y="1309.8901">void visit(const DivNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1316.4844"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="184" x="1001" y="1322.6948">void visit(const SqrtNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1329.2891"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1001" y="1335.4995">void visit(const CastNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1342.0938"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1001" y="1348.3042">void visit(const CallIntrinsicNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1354.8984"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="215" x="1001" y="1361.1089">void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1005" y="1373.9136"/><ellipse cx="992" cy="1383.5078" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="208" x="1001" y="1386.7183">Visitor(LowererImplImperative* impl)</text><ellipse cx="992" cy="1396.3125" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="157" x="1001" y="1399.5229">Stmt lower(IndexStmt stmt)</text><ellipse cx="992" cy="1409.1172" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="152" x="1001" y="1412.3276">Expr lower(IndexExpr expr)</text><path d="M975.5,1587 L975.5,1702.9297 A0,0 0 0 0 975.5,1702.9297 L1239.5,1702.9297 A0,0 0 0 0 1239.5,1702.9297 L1239.5,1597 L1229.5,1587 L1111.5,1587 L1107.5,1419.0758 L1103.5,1587 L975.5,1587 A0,0 0 0 0 975.5,1587 " fill="#FBFB77" filter="url(#fujoep6dbpit)" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1229.5,1587 L1229.5,1597 L1239.5,1597 L1229.5,1587 " fill="#FBFB77" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="191" x="981.5" y="1604.0669">Stmt lower(IndexStmt stmt) {</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="134" x="989.5" y="1619.1997">this-&gt;stmt = Stmt();</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="219" x="989.5" y="1634.3325">impl-&gt;accessibleIterators.scope();</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="217" x="989.5" y="1649.4653">IndexStmtVisitorStrict::visit(stmt);</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="235" x="989.5" y="1664.5981">impl-&gt;accessibleIterators.unscope();</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="116" x="989.5" y="1679.731">return this-&gt;stmt;</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="981.5" y="1694.8638">}</text><!--MD5=[ae51cb2269b8d9d23f4eb16ba4c021c2]
+reverse link Uncopyable to IRNode--><path codeLine="26" d="M820.4411,310.2452 C805.2002,341.1699 780.565,381.9466 747.5,407 C656.8632,475.6756 606.7888,436.4312 503.5,484 C464.9666,501.7462 424.4666,526.9962 393.5999,547.8097 " fill="none" id="Uncopyable-backto-IRNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="814.1208,307.2354,828.901,292.0517,826.8155,313.1384,814.1208,307.2354" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[910d35d02fd37b100f27db676215561a]
+reverse link IRNode to BaseStmtNode--><path codeLine="27" d="M307.5211,639.0848 C270.9373,697.3157 214.3898,787.3234 187.5844,829.9899 " fill="none" id="IRNode-backto-BaseStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="301.6118,635.3322,318.1787,622.1209,313.4664,642.7799,301.6118,635.3322" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d15399140a2ba01317e3af6505b2f237]
+reverse link IRNode to BaseExprNode--><path codeLine="28" d="M348.9536,642.2864 C356.1642,697.705 366.8158,779.5704 372.5311,823.496 " fill="none" id="IRNode-backto-BaseExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="341.9689,642.8569,346.3299,622.1209,355.8519,641.0505,341.9689,642.8569" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[6fc58d354eb039aa71a812145e71cc51]
+reverse link BaseStmtNode to StmtNode--><path codeLine="29" d="M167.1367,898.6343 C157.8757,975.7069 139.117,1131.8206 131.1797,1197.8764 " fill="none" id="BaseStmtNode-backto-StmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="160.246,897.3055,169.5821,878.2834,174.146,898.9757,160.246,897.3055" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[b34aab7e28a3dad0196efbcf2402ad4b]
+reverse link BaseExprNode to ExprNode--><path codeLine="30" d="M377.9886,904.6786 C380.3078,983.6383 384.7133,1133.6261 386.6028,1197.9549 " fill="none" id="BaseExprNode-backto-ExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="370.9905,904.8438,377.4002,884.6469,384.9845,904.4327,370.9905,904.8438" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[6982bf9bb66925c6ba3afaae707aa75e]
+reverse link IntrusivePtr to IRHandle--><path codeLine="38" d="M613.7823,69.7043 C517.4991,114.1798 345.0472,193.8394 250.6748,237.4323 " fill="none" id="IntrusivePtr-backto-IRHandle" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="610.8857,63.3316,631.9777,61.2994,616.7566,76.0412,610.8857,63.3316" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[59037bcbfab734c29d5f29f29345990a]
+reverse link IRHandle to Expr--><path codeLine="39" d="M161.5974,316.9128 C141.1096,360.6844 110.2697,426.612 83.5,484 C71.2362,510.2908 57.2078,540.4477 47.6932,560.9138 " fill="none" id="IRHandle-backto-Expr" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="155.2791,313.8992,170.0982,298.7534,167.9586,319.8347,155.2791,313.8992" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[2e3a655b6bb23e034db67107221f412a]
+reverse link IRHandle to Stmt--><path codeLine="40" d="M176.0078,318.793 C164.419,388.1073 144.1107,509.5737 135.5197,560.9576 " fill="none" id="IRHandle-backto-Stmt" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="169.1677,317.2552,179.37,298.6833,182.976,319.5639,169.1677,317.2552" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[2a0bead0c725d8d45864c262e97ce783]
+reverse link IRHandle to IRNode--><path codeLine="42" d="M205.5204,310.4425 C236.4803,372.9538 293.8347,488.7585 323.1741,547.998 " fill="none" id="IRHandle-backto-IRNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="199.6965,298.6833,198.7749,305.8353,205.0223,309.4367,205.9439,302.2848,199.6965,298.6833" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="273.5" y="450.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="193.9008" y="318.2998">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="311.0578" y="536.9081">1</text><!--MD5=[58cf07e15c029e621d8edfba03fa64a2]
+reverse link IRVisitorStrict to IRVisitor--><path codeLine="94" d="M2743.085,79.3089 C2719.9753,94.626 2693.8639,112.3013 2670.5,129 C2620.0401,165.0649 2563.4947,208.7917 2527.0182,237.4501 " fill="none" id="IRVisitorStrict-backto-IRVisitor" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2739.5252,73.2715,2760.0768,68.1107,2747.2291,84.9613,2739.5252,73.2715" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[75bfe22e38cc091da0fbb6f74406d06e]
+reverse link IRVisitorStrict to IRPrinter--><path codeLine="95" d="M2880.6802,78.0481 C2916.73,98.0228 2962.2732,123.2577 3008.2876,148.7537 " fill="none" id="IRVisitorStrict-backto-IRPrinter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2876.843,83.9248,2862.7416,68.1086,2883.6283,71.679,2876.843,83.9248" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[85c48ab67ed60544567d89d58f94870d]
+reverse link IRVisitorStrict to IRRewriter--><path codeLine="96" d="M2807.5,88.3035 C2807.5,120.7899 2807.5,163.3447 2807.5,198.8875 " fill="none" id="IRVisitorStrict-backto-IRRewriter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2800.5001,88.1087,2807.5,68.1086,2814.5001,88.1086,2800.5001,88.1087" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[4028845cddf1255230d8af57eef922a1]
+reverse link IRVisitor to IRVerifier--><path codeLine="97" d="M2464.2278,317.0083 C2430.0567,386.0033 2369.0761,509.1298 2343.4074,560.9576 " fill="none" id="IRVisitor-backto-IRVerifier" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2458.1543,313.4989,2473.3035,298.6833,2470.6999,319.7124,2458.1543,313.4989" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[4a94ddfc410010d5e6723affae8cc10d]
+reverse link IRRewriter to ExpressionSimplifier--><path codeLine="102" d="M2720.1218,351.0244 C2678.6358,390.8516 2628.5995,439.4826 2584.5,484 C2558.962,509.78 2530.3923,540.0247 2511.1115,560.6511 " fill="none" id="IRRewriter-backto-ExpressionSimplifier" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2715.4205,345.8343,2734.7035,337.0499,2725.1073,355.9421,2715.4205,345.8343" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[949c6b7bfd4235ac14da91ff0f1abad4]
+reverse link IRRewriter to RemoveRedundantStatements--><path codeLine="109" d="M2781.238,356.5644 C2760.6434,426.0163 2733.4041,517.8765 2720.6551,560.8705 " fill="none" id="IRRewriter-backto-RemoveRedundantStatements" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2774.5903,354.3599,2786.9875,337.1753,2788.0127,358.3401,2774.5903,354.3599" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[429f4895fdac785b3cbc97ff72ee188d]
+reverse link IRRewriter to RemoveRedundantLoops--><path codeLine="110" d="M2847.8795,355.6733 C2879.9145,425.2286 2922.5003,517.6924 2942.3867,560.8705 " fill="none" id="IRRewriter-backto-RemoveRedundantLoops" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2841.3685,358.2695,2839.3599,337.1753,2854.0847,352.4128,2841.3685,358.2695" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[68110a92c5ea6b91d77066a3850c99de]
+reverse link IRRewriter to RemoveDuplicateBody--><path codeLine="111" d="M2908.6453,349.9792 C2956.6158,389.6232 3014.286,438.4014 3064.5,484 C3092.4965,509.4232 3123.3012,539.9945 3143.8609,560.7956 " fill="none" id="IRRewriter-backto-RemoveDuplicateBody" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2904.088,355.2943,2893.1039,337.1739,2912.9906,344.4895,2904.088,355.2943" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ae50f65b4cbe15dbcd3dbdd752b04bad]
+reverse link IRPrinter to CodeGen--><path codeLine="120" d="M3277.8266,425.9959 C3295.9202,478.6165 3314.0392,531.3112 3324.2066,560.8807 " fill="none" id="IRPrinter-backto-CodeGen" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3271.1927,428.2301,3271.309,407.0408,3284.4319,423.6778,3271.1927,428.2301" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[319cfb8dc735ef16cddbcdc70ff637f3]
+reverse link CodeGen to CodeGen_C--><path codeLine="121" d="M3308.9871,626.8873 C3276.9808,683.9051 3220.7457,784.0854 3195.0269,829.9024 " fill="none" id="CodeGen-backto-CodeGen_C" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3302.9395,623.3601,3318.8334,609.3464,3315.1476,630.213,3302.9395,623.3601" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[aa96aa54faae34d6ca9ace804ab720b5]
+reverse link CodeGen to CodeGen_ISPC--><path codeLine="122" d="M3357.3479,626.516 C3391.4351,683.4693 3451.5956,783.9858 3479.0773,829.9024 " fill="none" id="CodeGen-backto-CodeGen_ISPC" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3351.3364,630.1024,3347.0716,609.3464,3363.3492,622.9126,3351.3364,630.1024" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[442ed43531516b32839cb3faf9b2f28c]
+reverse link CodeGen to CodeGen_CUDA--><path codeLine="123" d="M3332.5,629.5199 C3332.5,686.9415 3332.5,784.7827 3332.5,829.9024 " fill="none" id="CodeGen-backto-CodeGen_CUDA" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3325.5001,629.3464,3332.5,609.3464,3339.5001,629.3464,3325.5001,629.3464" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[e163d187ef5eaf663efea2335f5ab426]
+reverse link Manageable to IndexStmtNode--><path codeLine="135" d="M1257.9638,310.6351 C1244.4457,339.9959 1223.6542,378.6672 1197.5,407 C1172.0822,434.5351 1162.6727,441.0723 1127.5,454 C1016.6611,494.7386 975.8786,447.724 863.5,484 C813.372,500.1814 761.462,531.0819 726.345,554.4621 " fill="none" id="Manageable-backto-IndexStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1251.6817,307.531,1266.1369,292.0374,1264.4987,313.1636,1251.6817,307.531" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d7ac6dc268d898e1c32d87af860f66f6]
+reverse link Uncopyable to IndexStmtNode--><path codeLine="136" d="M817.879,310.1732 C786.2624,374.8344 726.5439,496.9683 698.6147,554.088 " fill="none" id="Uncopyable-backto-IndexStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="811.6488,306.9791,826.7226,292.0867,824.2258,313.1288,811.6488,306.9791" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[648f85c83359671aa27d8bdab5afe684]
+reverse link Manageable to IndexExprNode--><path codeLine="137" d="M1297.8505,310.1732 C1332.1189,374.8344 1396.846,496.9683 1427.1176,554.088 " fill="none" id="Manageable-backto-IndexExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1291.4456,313.0363,1288.2652,292.0867,1303.8158,306.4805,1291.4456,313.0363" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[81b6c5025c45f10b02aba636448b0629]
+reverse link Uncopyable to IndexExprNode--><path codeLine="138" d="M854.8121,310.9496 C869.2284,342.7749 893.4553,384.3933 928.5,407 C1003.6621,455.4857 1040.645,415.6362 1127.5,437 C1189.6011,452.2751 1205.1592,457.804 1263.5,484 C1309.4283,504.6226 1359.1614,533.0311 1394.6887,554.4675 " fill="none" id="Uncopyable-backto-IndexExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="848.2286,313.3522,846.9174,292.2032,861.1311,307.9185,848.2286,313.3522" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[4a1ee9a433488db7e06ed4f91810d452]
+reverse link IntrusivePtr to IndexStmt--><path codeLine="143" d="M683.5,88.2338 C683.5,136.5801 683.5,207.2721 683.5,243.9383 " fill="none" id="IntrusivePtr-backto-IndexStmt" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="676.5001,88.1087,683.5,68.1086,690.5001,88.1086,676.5001,88.1087" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[be16824624a74e7da7bb67b6f377f820]
+reverse link IndexStmt to IndexStmtNode--><path codeLine="144" d="M683.5,305.2739 C683.5,368.4736 683.5,495.4911 683.5,554.088 " fill="none" id="IndexStmt-backto-IndexStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="683.5,292.0867,679.5,298.0867,683.5,304.0867,687.5,298.0867,683.5,292.0867" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="674.475" y="312.2647">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="674.675" y="543.2663">1</text><!--MD5=[8120730d6b32269f0970ddfe15f91d14]
+reverse link IntrusivePtr to IndexExpr--><path codeLine="145" d="M755.3761,39.2555 C906.9425,44.2375 1253.1256,62.7483 1347.5,129 C1386.7641,156.5638 1406.5439,211.9266 1415.072,243.8366 " fill="none" id="IntrusivePtr-backto-IndexExpr" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="755.1028,46.2505,735.3276,38.6387,755.5334,32.2571,755.1028,46.2505" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[1972b08ae664b2d2310d03537cd7a5e1]
+reverse link IndexExpr to IndexExprNode--><path codeLine="146" d="M1423.2044,305.2739 C1427.7899,368.4736 1437.0057,495.4911 1441.2572,554.088 " fill="none" id="IndexExpr-backto-IndexExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="1422.2476,292.0867,1418.6923,298.3605,1423.1161,304.0552,1426.6714,297.7815,1422.2476,292.0867" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1413.6704" y="312.2647">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1431.9696" y="543.2663">1</text><!--MD5=[b26c9c8d3a5b29d8271f45d68507eadd]
+reverse link IndexExprVisitorStrict to IndexNotationVisitorStrict--><path codeLine="295" d="M1632.9995,699.6227 C1626.9972,705.0539 1621.1388,710.5255 1615.5,716 C1577.5728,752.8222 1539.4548,801.3814 1518.0255,829.9928 " fill="none" id="IndexExprVisitorStrict-backto-IndexNotationVisitorStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1628.5467,694.2161,1648.1726,686.2269,1637.8124,704.7112,1628.5467,694.2161" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[7234130c38761532599f2f7b36911e2f]
+reverse link IndexStmtVisitorStrict to IndexNotationVisitorStrict--><path codeLine="296" d="M1966.5197,675.467 C1958.1897,679.3054 1949.8174,682.8546 1941.5,686 C1803.2176,738.2949 1746.3714,680.1665 1614.5,747 C1575.0345,767.0014 1539.609,804.9113 1519.0786,829.8154 " fill="none" id="IndexStmtVisitorStrict-backto-IndexNotationVisitorStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1963.6036,669.1012,1984.6506,666.6464,1969.7282,681.6905,1963.6036,669.1012" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d405f4886b031ffa84d6c62850f61924]
+reverse link IndexNotationVisitorStrict to IndexNotationVisitor--><path codeLine="297" d="M1526.3846,895.2501 C1552.042,936.1383 1593.0382,1001.4705 1631.9613,1063.499 " fill="none" id="IndexNotationVisitorStrict-backto-IndexNotationVisitor" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1520.4391,898.945,1515.7379,878.2834,1532.2978,891.5037,1520.4391,898.945" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[9298265fb9c2b46a51654cec95663d88]
+reverse link IndexNotationVisitorStrict to IndexNotationPrinter--><path codeLine="298" d="M1491.8998,898.1203 C1484.7524,934.7873 1474.0689,989.595 1463.4478,1044.0828 " fill="none" id="IndexNotationVisitorStrict-backto-IndexNotationPrinter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1485.0692,896.5747,1495.7665,878.2834,1498.8106,899.2533,1485.0692,896.5747" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[2164c356666f1a365c7584220eeab5ce]
+reverse link IndexNotationVisitor to Matcher--><path codeLine="299" d="M1735.5,1413.8401 C1735.5,1494.6098 1735.5,1580.0329 1735.5,1620.7139 " fill="none" id="IndexNotationVisitor-backto-Matcher" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1728.5001,1413.7101,1735.5,1393.71,1742.5001,1413.71,1728.5001,1413.7101" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[81658007f5a451634c394e4129ce2328]
+reverse link IndexExprVisitorStrict to IndexExprRewriterStrict--><path codeLine="301" d="M1782.5,706.5527 C1782.5,720.0234 1782.5,733.669 1782.5,746.9421 " fill="none" id="IndexExprVisitorStrict-backto-IndexExprRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1775.5001,706.3141,1782.5,686.3141,1789.5001,706.314,1775.5001,706.3141" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ad82d38a65963623a4dbc072e2395c0a]
+reverse link IndexStmtVisitorStrict to IndexStmtRewriterStrict--><path codeLine="302" d="M2113.9155,687.085 C2114.78,712.9253 2115.7014,740.463 2116.5412,765.5657 " fill="none" id="IndexStmtVisitorStrict-backto-IndexStmtRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2106.9097,687.0274,2113.237,666.8045,2120.9019,686.5592,2106.9097,687.0274" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[17cef803f955afc58233a06ff8ed6ced]
+reverse link IndexExprRewriterStrict to IndexNotationRewriterStrict--><path codeLine="303" d="M1876.8558,977.1228 C1939.4095,1058.7478 2016.1337,1158.8635 2050.856,1204.1719 " fill="none" id="IndexExprRewriterStrict-backto-IndexNotationRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1871.16,981.1985,1864.5506,961.066,1882.2722,972.6826,1871.16,981.1985" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[f1a4c69017cc3acf68d02aa5998e72c7]
+reverse link IndexStmtRewriterStrict to IndexNotationRewriterStrict--><path codeLine="304" d="M2105.0534,962.205 C2093.838,1046.2084 2079.1518,1156.2078 2072.7352,1204.2683 " fill="none" id="IndexStmtRewriterStrict-backto-IndexNotationRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2098.1288,961.1748,2107.714,942.277,2112.0056,963.0275,2098.1288,961.1748" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ea60607216d1741e9a004dc3b2ad9bc4]
+reverse link IndexNotationRewriterStrict to IndexNotationRewriter--><path codeLine="306" d="M2069.5,1272.8869 C2069.5,1322.2639 2069.5,1404.9692 2069.5,1479.8852 " fill="none" id="IndexNotationRewriterStrict-backto-IndexNotationRewriter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2062.5001,1272.6931,2069.5,1252.6931,2076.5001,1272.693,2062.5001,1272.6931" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ce0e28a833df6d388c2232cca949e33a]
+reverse link Uncopyable to LowererImpl--><path codeLine="357" d="M864.1964,309.4349 C881.7851,337.3534 905.9227,374.8161 928.5,407 C947.767,434.4652 969.4267,463.6921 989.6625,490.3547 " fill="none" id="Uncopyable-backto-LowererImpl" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="858.0963,312.8828,853.4161,292.2165,869.9625,305.4535,858.0963,312.8828" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[595f18298affe1361dad6c88d07b3ae8]
+reverse link Lowerer to LowererImpl--><path codeLine="358" d="M1063.5,311.7072 C1063.5,357.6007 1063.5,431.1895 1063.5,490.4492 " fill="none" id="Lowerer-backto-LowererImpl" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="1063.5,298.6833,1059.5,304.6833,1063.5,310.6833,1067.5,304.6833,1063.5,298.6833" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="1064.5" y="450.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1054.7125" y="318.2998">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1055.1813" y="479.3951">1</text><!--MD5=[76c844881f8770258bad5028aba6ca47]
+reverse link IndexNotationVisitorStrict to Visitor--><path codeLine="396" d="M1453.9953,890.688 C1409.0705,926.7585 1339.7531,984.0918 1283.5,1038 C1267.164,1053.655 1250.4599,1070.5041 1234.1618,1087.4753 " fill="none" id="IndexNotationVisitorStrict-backto-Visitor" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1449.6605,885.1915,1469.657,878.1815,1458.3965,896.1315,1449.6605,885.1915" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[f3857c0b64c12f6416059a5dcd8ca3ae]
+reverse link LowererImpl to Visitor--><path codeLine="397" d="M1070.9927,694.5811 C1077.4133,788.4824 1086.8084,925.8861 1094.4598,1037.7864 " fill="none" id="LowererImpl-backto-Visitor" style="stroke:#A80036;stroke-width:1.0;"/><ellipse cx="1070.5145" cy="687.5863" fill="#FFFFFF" rx="8" ry="8" style="stroke:#A80036;stroke-width:1.0;"/><line style="stroke:#A80036;stroke-width:1.0;" x1="1071.0602" x2="1069.9687" y1="695.5676" y2="679.6049"/><line style="stroke:#A80036;stroke-width:1.0;" x1="1062.5331" x2="1078.4958" y1="688.132" y2="687.0405"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="1090.5" y="858.5669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1062.1577" y="699.5">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1085.7942" y="1027.1147">1</text><!--MD5=[44db9126e684c102525c4f7b853b119b]
+reverse link Visitor to LowererImpl--><path codeLine="398" d="M1157.8948,1024.7712 C1171.4272,938.063 1176.6901,836.4934 1153.5,747 C1147.5682,724.1083 1137.6236,701.0075 1126.4596,679.7046 " fill="none" id="Visitor-backto-LowererImpl" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="1155.7945,1037.728,1160.703,1032.4454,1157.7146,1025.8826,1152.8061,1031.1653,1155.7945,1037.728" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="1170.5" y="858.5669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1149.5248" y="1027.054">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1122.7894" y="699.2875">1</text><!--MD5=[7cec337d4232ea69c4a4e115b7f1c391]
+reverse link LowererImpl to LowererImplImperative--><path codeLine="400" d="M979.2866,696.0461 C963.4034,716.9902 946.9263,738.7172 931.4609,759.1104 " fill="none" id="LowererImpl-backto-LowererImplImperative" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="973.9145,691.5453,991.5772,679.8393,985.0696,700.005,973.9145,691.5453" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d416585c3fdacb879af8752baa2327bb]
+reverse link LowererImplImperative to Visitor--><path codeLine="401" d="M879.1422,963.0891 C885.3942,979.2794 893.6358,994.8993 904.5,1008 C923.2784,1030.6441 940.9429,1019.1172 963.5,1038 C969.3812,1042.9232 975.1644,1048.1297 980.833,1053.5529 " fill="none" id="LowererImplImperative-backto-Visitor" style="stroke:#A80036;stroke-width:1.0;"/><ellipse cx="876.7795" cy="956.3169" fill="#FFFFFF" rx="8" ry="8" style="stroke:#A80036;stroke-width:1.0;"/><line style="stroke:#A80036;stroke-width:1.0;" x1="879.4147" x2="874.1442" y1="963.8704" y2="948.7634"/><line style="stroke:#A80036;stroke-width:1.0;" x1="869.226" x2="884.333" y1="958.9522" y2="953.6816"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="905.5" y="1004.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="866.9259" y="968.3506">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="964.8705" y="1039.5084">1</text><!--MD5=[cd8dd7ca9f18b6220f591f64794d3d39]
+reverse link Visitor to LowererImplImperative--><path codeLine="402" d="M988.0443,1026.5409 C980.2164,1014.4253 972.331,1002.497 964.5,991 C955.0529,977.1302 944.7145,962.7728 934.3268,948.8048 " fill="none" id="Visitor-backto-LowererImplImperative" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="995.2137,1037.7132,995.3397,1030.5032,988.7328,1027.6138,988.6068,1034.8238,995.2137,1037.7132" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="974.5" y="1004.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="982.6713" y="1027.0386">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="931.5281" y="968.395">1</text><!--MD5=[59ff6f047f3ce21caa7eb37a22acd23c]
+@startuml taco
+scale 1
+
+
+class IntrusivePtr {
+    +T *ptr
+}
+class Uncopyable {}
+
+class IRNode {
+    +virtual void accept(IRVisitorStrict *v) const = 0
+    +virtual IRNodeType type_info() const = 0;
+}
+
+class BaseStmtNode {}
+class BaseExprNode {
+    +Datatype type
+}
+
+class StmtNode {
+    +void accept(IRVisitorStrict *v) const
+}
+class ExprNode {
+    +void accept(IRVisitorStrict *v) const
+}
+
+Uncopyable <|- - IRNode
+IRNode <|- - BaseStmtNode
+IRNode <|- - BaseExprNode
+BaseStmtNode <|- - StmtNode
+BaseExprNode <|- - ExprNode
+
+class IRHandle {
+    +void accept(IRVisitorStrict *v) const
+}
+class Expr {}
+class Stmt {}
+
+IntrusivePtr <|- - IRHandle
+IRHandle <|- - Expr
+IRHandle <|- - Stmt
+
+IRHandle "1" *- - "1" IRNode : contains
+
+
+
+' this class is abstract but plantuml version does not support interface keyword
+interface IRVisitorStrict {
+    +virtual void visit(const IRNode*) const = 0
+}
+
+/' 
+IRVisitor is not an interface or abstract class because it
+has no pure virtual methods
+'/
+class IRVisitor {
+    +virtual void visit(const IRNode*)
+}
+
+class IRRewriter {
+    ' protected fields and methods
+    #Expr expr 
+    #Stmt stmt
+
+    #virtual void visit(const ExprNode* op)
+    #virtual void visit(const StmtNode* op)
+
+    ' public fields and methods
+    +Expr rewrite(Expr)
+    +Stmt rewrite(Stmt)
+}
+class IRPrinter {
+    #std::ostream &stream
+    #std::ostream &stream2
+    #int indent
+    #bool color
+    #bool simplify
+    #enum Precedence
+    #Precedence parentPrecedence = BOTTOM
+    #NameGenerator varNameGenerator
+    #scopedMap<Expr, std::String> varNames
+
+    #void doIndent()
+    #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)
+    #void fewMoreMethods()
+    
+    #virtual void visit(const ExprNode*)
+    #virtual void visit(const StmtNode*)
+
+    +setColor(bool color)
+    +print(Stmt)
+}
+class IRVerifier {}
+
+IRVisitorStrict <|- - IRVisitor
+IRVisitorStrict <|- - IRPrinter
+IRVisitorStrict <|- - IRRewriter
+IRVisitor <|- - IRVerifier
+
+' Inheritance from IRRewriter
+' simplifier for ir::Expr
+class ExpressionSimplifier {}
+IRRewriter <|- - ExpressionSimplifier
+
+' simplifiers for ir::Stmt
+class RemoveRedundantStatements {}
+class RemoveRedundantLoops {}
+class RemoveDuplicateBody {}
+
+IRRewriter <|- - RemoveRedundantStatements
+IRRewriter <|- - RemoveRedundantLoops
+IRRewriter <|- - RemoveDuplicateBody
+
+
+' Inheritance from IRPrinter
+class CodeGen {}
+class CodeGen_C {}
+class CodeGen_CUDA {}
+class CodeGen_ISPC {}
+
+IRPrinter <|- - CodeGen
+CodeGen <|- - CodeGen_C
+CodeGen <|- - CodeGen_ISPC
+CodeGen <|- - CodeGen_CUDA
+
+
+class Manageable {}
+class IndexStmtNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+class IndexExprNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+
+
+Manageable <|- - IndexStmtNode
+Uncopyable <|- - IndexStmtNode
+Manageable <|- - IndexExprNode
+Uncopyable <|- - IndexExprNode
+
+class IndexStmt {}
+class IndexExpr {}
+
+IntrusivePtr <|- - IndexStmt
+IndexStmt "1" *- - "1" IndexStmtNode
+IntrusivePtr <|- - IndexExpr
+IndexExpr "1" *- - "1" IndexExprNode
+
+
+abstract class IndexExprVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AccessNode*) = 0
+    +virtual void visit(const LiteralNode*) = 0
+    +virtual void visit(const NegNode*) = 0
+    +virtual void visit(const AddNode*) = 0
+    +virtual void visit(const SubNode*) = 0
+    +virtual void visit(const MulNode*) = 0
+    +virtual void visit(const DivNode*) = 0
+    +virtual void visit(const SqrtNode*) = 0
+    +virtual void visit(const CastNode*) = 0
+    +virtual void visit(const CallIntrinsicNode*) = 0
+    +virtual void visit(const ReductionNode*) = 0
+}
+abstract class IndexStmtVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AssignmentNode*) = 0
+    +virtual void visit(const YieldNode*) = 0
+    +virtual void visit(const ForallNode*) = 0
+    +virtual void visit(const WhereNode*) = 0
+    +virtual void visit(const SequenceNode*) = 0
+    +virtual void visit(const AssembleNode*) = 0
+    +virtual void visit(const MultiNode*) = 0
+    +virtual void visit(const SuchThatNode*) = 0
+}
+
+abstract class IndexNotationVisitorStrict {}
+class IndexNotationPrinter {
+    +void print(const IndexExpr& expr)
+    +void print(const IndexStmt& expr)
+
+    ' Index Expressions visit()
+    +void visit(const AccessNode* node)
+    +void visit(const LiteralNode* node)
+    + void visit(const NegNode* node)
+    + void visit(const AddNode* node)
+    + void visit(const SubNode* node)
+    + void visit(const MulNode* node)
+    + void visit(const DivNode* node)
+    + void visit(const SqrtNode* node)
+    + void visit(const CastNode* node)
+    + void visit(const CallIntrinsicNode* node)
+    + void visit(const UnaryExprNode* node)
+    + void visit(const BinaryExprNode* node)
+    + void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    + void visit(const AssignmentNode* node)
+    + void visit(const YieldNode* node)
+    + void visit(const ForallNode* node)
+    + void visit(const WhereNode* node)
+    + void visit(const SequenceNode* node)
+    + void visit(const AssembleNode* node)
+    + void visit(const MultiNode* node)
+    + void visit(const SuchThatNode* node)
+}
+class IndexNotationVisitor {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+class Matcher {
+
+}
+
+abstract class IndexExprRewriterStrict {
+    +IndexExpr rewrite(IndexExpr)
+
+    #IndexExpr expr
+
+    #virtual void visit(const AccessNode* op) = 0
+    #virtual void visit(const LiteralNode* op) = 0
+    #virtual void visit(const NegNode* op) = 0
+    #virtual void visit(const SqrtNode* op) = 0
+    #virtual void visit(const AddNode* op) = 0
+    #virtual void visit(const SubNode* op) = 0
+    #virtual void visit(const MulNode* op) = 0
+    #virtual void visit(const DivNode* op) = 0
+    #virtual void visit(const CastNode* op) = 0
+    #virtual void visit(const CallIntrinsicNode* op) = 0
+    #virtual void visit(const ReductionNode* op) = 0
+}
+abstract class IndexStmtRewriterStrict {
+    +IndexStmt rewrite(IndexStmt)
+
+    #IndexStmt stmt
+
+    #virtual void visit(const AssignmentNode* op) = 0
+    #virtual void visit(const YieldNode* op) = 0
+    #virtual void visit(const ForallNode* op) = 0
+    #virtual void visit(const WhereNode* op) = 0
+    #virtual void visit(const SequenceNode* op) = 0
+    #virtual void visit(const AssembleNode* op) = 0
+    #virtual void visit(const MultiNode* op) = 0
+    #virtual void visit(const SuchThatNode* op) = 0
+}
+abstract class IndexNotationRewriterStrict {}
+class IndexNotationRewriter {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+
+
+IndexExprVisitorStrict <|- - IndexNotationVisitorStrict
+IndexStmtVisitorStrict <|- - IndexNotationVisitorStrict
+IndexNotationVisitorStrict <|- - IndexNotationVisitor
+IndexNotationVisitorStrict <|- - IndexNotationPrinter
+IndexNotationVisitor <|- - Matcher
+
+IndexExprVisitorStrict <|- - IndexExprRewriterStrict
+IndexStmtVisitorStrict <|- - IndexStmtRewriterStrict
+IndexExprRewriterStrict <|- - IndexNotationRewriterStrict
+IndexStmtRewriterStrict <|- - IndexNotationRewriterStrict
+
+IndexNotationRewriterStrict <|- - IndexNotationRewriter
+
+' - private
+' # protected
+' ~ package private
+' + public
+
+' {static}
+' {abstract} virtual methods
+
+' lowering part - - conversion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt
+class Lowerer {
+    +std::shared_ptr<LowererImpl> impl;
+}
+abstract class LowererImpl {
+    ' protected fields and methods
+    #class Visitor;
+    #friend class Visitor;
+    #std::shared_ptr<Visitor> visitor;
+
+    #virtual ir::Stmt lower(IndexStmt stmt);
+    #virtual ir::Expr lower(IndexExpr expr);
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr) = 0;
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;
+
+    ' public fields and methods
+    +virtual ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack) = 0;
+}
+
+class LowererImplImperative {
+    ' private fields and methods
+    -class Visitor
+    -fiend class Visitor
+    -std::shared_ptr<Visitor> visitor
+    -bool assemble
+    -bool compute
+    -vars a_bunch_of_other_fields
+
+    ' protected fields and methods
+    #virtual ir::Stmt lowerExpr(IndexExpr expr);
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt);
+
+    ' public fields and methods
+    +ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack)
+
+}
+note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n  return visitor->lower(stmt);\n}
+
+Uncopyable <|- - LowererImpl
+Lowerer "1" *- - "1" LowererImpl : contains
+
+
+' visitor that does the lowering
+class Visitor {
+    ' private fields and methods
+    -LowererImpl* impl
+    -Expr expr
+    -Stmt stmt
+
+    -void visit(const AssignmentNode* node)
+    -void visit(const YieldNode* node)
+    -void visit(const ForallNode* node) 
+    -void visit(const WhereNode* node) 
+    -void visit(const MultiNode* node) 
+    -void visit(const SuchThatNode* node) 
+    -void visit(const SequenceNode* node) 
+    -void visit(const AssembleNode* node) 
+    -void visit(const AccessNode* node) 
+    -void visit(const LiteralNode* node) 
+    -void visit(const NegNode* node) 
+    -void visit(const AddNode* node) 
+    -void visit(const SubNode* node) 
+    -void visit(const MulNode* node) 
+    -void visit(const DivNode* node) 
+    -void visit(const SqrtNode* node) 
+    -void visit(const CastNode* node) 
+    -void visit(const CallIntrinsicNode* node) 
+    -void visit(const ReductionNode* node) 
+
+    ' public fields and methods
+    +Visitor(LowererImplImperative* impl)
+    +Stmt lower(IndexStmt stmt)
+    +Expr lower(IndexExpr expr)
+}
+
+note bottom of Visitor:   Stmt lower(IndexStmt stmt) {\n  this->stmt = Stmt();\n  impl->accessibleIterators.scope();\n  IndexStmtVisitorStrict::visit(stmt);\n  impl->accessibleIterators.unscope();\n  return this->stmt;\n}
+
+IndexNotationVisitorStrict <|- - Visitor
+LowererImpl "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImpl : contains
+
+LowererImpl <|- - LowererImplImperative
+LowererImplImperative "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImplImperative : contains
+
+@enduml
+
+@startuml taco
+scale 1
+
+
+class IntrusivePtr {
+    +T *ptr
+}
+class Uncopyable {}
+
+class IRNode {
+    +virtual void accept(IRVisitorStrict *v) const = 0
+    +virtual IRNodeType type_info() const = 0;
+}
+
+class BaseStmtNode {}
+class BaseExprNode {
+    +Datatype type
+}
+
+class StmtNode {
+    +void accept(IRVisitorStrict *v) const
+}
+class ExprNode {
+    +void accept(IRVisitorStrict *v) const
+}
+
+Uncopyable <|- - IRNode
+IRNode <|- - BaseStmtNode
+IRNode <|- - BaseExprNode
+BaseStmtNode <|- - StmtNode
+BaseExprNode <|- - ExprNode
+
+class IRHandle {
+    +void accept(IRVisitorStrict *v) const
+}
+class Expr {}
+class Stmt {}
+
+IntrusivePtr <|- - IRHandle
+IRHandle <|- - Expr
+IRHandle <|- - Stmt
+
+IRHandle "1" *- - "1" IRNode : contains
+
+
+
+interface IRVisitorStrict {
+    +virtual void visit(const IRNode*) const = 0
+}
+
+class IRVisitor {
+    +virtual void visit(const IRNode*)
+}
+
+class IRRewriter {
+    #Expr expr 
+    #Stmt stmt
+
+    #virtual void visit(const ExprNode* op)
+    #virtual void visit(const StmtNode* op)
+
+    +Expr rewrite(Expr)
+    +Stmt rewrite(Stmt)
+}
+class IRPrinter {
+    #std::ostream &stream
+    #std::ostream &stream2
+    #int indent
+    #bool color
+    #bool simplify
+    #enum Precedence
+    #Precedence parentPrecedence = BOTTOM
+    #NameGenerator varNameGenerator
+    #scopedMap<Expr, std::String> varNames
+
+    #void doIndent()
+    #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)
+    #void fewMoreMethods()
+    
+    #virtual void visit(const ExprNode*)
+    #virtual void visit(const StmtNode*)
+
+    +setColor(bool color)
+    +print(Stmt)
+}
+class IRVerifier {}
+
+IRVisitorStrict <|- - IRVisitor
+IRVisitorStrict <|- - IRPrinter
+IRVisitorStrict <|- - IRRewriter
+IRVisitor <|- - IRVerifier
+
+class ExpressionSimplifier {}
+IRRewriter <|- - ExpressionSimplifier
+
+class RemoveRedundantStatements {}
+class RemoveRedundantLoops {}
+class RemoveDuplicateBody {}
+
+IRRewriter <|- - RemoveRedundantStatements
+IRRewriter <|- - RemoveRedundantLoops
+IRRewriter <|- - RemoveDuplicateBody
+
+
+class CodeGen {}
+class CodeGen_C {}
+class CodeGen_CUDA {}
+class CodeGen_ISPC {}
+
+IRPrinter <|- - CodeGen
+CodeGen <|- - CodeGen_C
+CodeGen <|- - CodeGen_ISPC
+CodeGen <|- - CodeGen_CUDA
+
+
+class Manageable {}
+class IndexStmtNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+class IndexExprNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+
+
+Manageable <|- - IndexStmtNode
+Uncopyable <|- - IndexStmtNode
+Manageable <|- - IndexExprNode
+Uncopyable <|- - IndexExprNode
+
+class IndexStmt {}
+class IndexExpr {}
+
+IntrusivePtr <|- - IndexStmt
+IndexStmt "1" *- - "1" IndexStmtNode
+IntrusivePtr <|- - IndexExpr
+IndexExpr "1" *- - "1" IndexExprNode
+
+
+abstract class IndexExprVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AccessNode*) = 0
+    +virtual void visit(const LiteralNode*) = 0
+    +virtual void visit(const NegNode*) = 0
+    +virtual void visit(const AddNode*) = 0
+    +virtual void visit(const SubNode*) = 0
+    +virtual void visit(const MulNode*) = 0
+    +virtual void visit(const DivNode*) = 0
+    +virtual void visit(const SqrtNode*) = 0
+    +virtual void visit(const CastNode*) = 0
+    +virtual void visit(const CallIntrinsicNode*) = 0
+    +virtual void visit(const ReductionNode*) = 0
+}
+abstract class IndexStmtVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AssignmentNode*) = 0
+    +virtual void visit(const YieldNode*) = 0
+    +virtual void visit(const ForallNode*) = 0
+    +virtual void visit(const WhereNode*) = 0
+    +virtual void visit(const SequenceNode*) = 0
+    +virtual void visit(const AssembleNode*) = 0
+    +virtual void visit(const MultiNode*) = 0
+    +virtual void visit(const SuchThatNode*) = 0
+}
+
+abstract class IndexNotationVisitorStrict {}
+class IndexNotationPrinter {
+    +void print(const IndexExpr& expr)
+    +void print(const IndexStmt& expr)
+
+    +void visit(const AccessNode* node)
+    +void visit(const LiteralNode* node)
+    + void visit(const NegNode* node)
+    + void visit(const AddNode* node)
+    + void visit(const SubNode* node)
+    + void visit(const MulNode* node)
+    + void visit(const DivNode* node)
+    + void visit(const SqrtNode* node)
+    + void visit(const CastNode* node)
+    + void visit(const CallIntrinsicNode* node)
+    + void visit(const UnaryExprNode* node)
+    + void visit(const BinaryExprNode* node)
+    + void visit(const ReductionNode* node)
+
+    + void visit(const AssignmentNode* node)
+    + void visit(const YieldNode* node)
+    + void visit(const ForallNode* node)
+    + void visit(const WhereNode* node)
+    + void visit(const SequenceNode* node)
+    + void visit(const AssembleNode* node)
+    + void visit(const MultiNode* node)
+    + void visit(const SuchThatNode* node)
+}
+class IndexNotationVisitor {
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+class Matcher {
+
+}
+
+abstract class IndexExprRewriterStrict {
+    +IndexExpr rewrite(IndexExpr)
+
+    #IndexExpr expr
+
+    #virtual void visit(const AccessNode* op) = 0
+    #virtual void visit(const LiteralNode* op) = 0
+    #virtual void visit(const NegNode* op) = 0
+    #virtual void visit(const SqrtNode* op) = 0
+    #virtual void visit(const AddNode* op) = 0
+    #virtual void visit(const SubNode* op) = 0
+    #virtual void visit(const MulNode* op) = 0
+    #virtual void visit(const DivNode* op) = 0
+    #virtual void visit(const CastNode* op) = 0
+    #virtual void visit(const CallIntrinsicNode* op) = 0
+    #virtual void visit(const ReductionNode* op) = 0
+}
+abstract class IndexStmtRewriterStrict {
+    +IndexStmt rewrite(IndexStmt)
+
+    #IndexStmt stmt
+
+    #virtual void visit(const AssignmentNode* op) = 0
+    #virtual void visit(const YieldNode* op) = 0
+    #virtual void visit(const ForallNode* op) = 0
+    #virtual void visit(const WhereNode* op) = 0
+    #virtual void visit(const SequenceNode* op) = 0
+    #virtual void visit(const AssembleNode* op) = 0
+    #virtual void visit(const MultiNode* op) = 0
+    #virtual void visit(const SuchThatNode* op) = 0
+}
+abstract class IndexNotationRewriterStrict {}
+class IndexNotationRewriter {
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+
+
+IndexExprVisitorStrict <|- - IndexNotationVisitorStrict
+IndexStmtVisitorStrict <|- - IndexNotationVisitorStrict
+IndexNotationVisitorStrict <|- - IndexNotationVisitor
+IndexNotationVisitorStrict <|- - IndexNotationPrinter
+IndexNotationVisitor <|- - Matcher
+
+IndexExprVisitorStrict <|- - IndexExprRewriterStrict
+IndexStmtVisitorStrict <|- - IndexStmtRewriterStrict
+IndexExprRewriterStrict <|- - IndexNotationRewriterStrict
+IndexStmtRewriterStrict <|- - IndexNotationRewriterStrict
+
+IndexNotationRewriterStrict <|- - IndexNotationRewriter
+
+
+
+class Lowerer {
+    +std::shared_ptr<LowererImpl> impl;
+}
+abstract class LowererImpl {
+    #class Visitor;
+    #friend class Visitor;
+    #std::shared_ptr<Visitor> visitor;
+
+    #virtual ir::Stmt lower(IndexStmt stmt);
+    #virtual ir::Expr lower(IndexExpr expr);
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr) = 0;
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;
+
+    +virtual ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack) = 0;
+}
+
+class LowererImplImperative {
+    -class Visitor
+    -fiend class Visitor
+    -std::shared_ptr<Visitor> visitor
+    -bool assemble
+    -bool compute
+    -vars a_bunch_of_other_fields
+
+    #virtual ir::Stmt lowerExpr(IndexExpr expr);
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt);
+
+    +ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack)
+
+}
+note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n  return visitor->lower(stmt);\n}
+
+Uncopyable <|- - LowererImpl
+Lowerer "1" *- - "1" LowererImpl : contains
+
+
+class Visitor {
+    -LowererImpl* impl
+    -Expr expr
+    -Stmt stmt
+
+    -void visit(const AssignmentNode* node)
+    -void visit(const YieldNode* node)
+    -void visit(const ForallNode* node) 
+    -void visit(const WhereNode* node) 
+    -void visit(const MultiNode* node) 
+    -void visit(const SuchThatNode* node) 
+    -void visit(const SequenceNode* node) 
+    -void visit(const AssembleNode* node) 
+    -void visit(const AccessNode* node) 
+    -void visit(const LiteralNode* node) 
+    -void visit(const NegNode* node) 
+    -void visit(const AddNode* node) 
+    -void visit(const SubNode* node) 
+    -void visit(const MulNode* node) 
+    -void visit(const DivNode* node) 
+    -void visit(const SqrtNode* node) 
+    -void visit(const CastNode* node) 
+    -void visit(const CallIntrinsicNode* node) 
+    -void visit(const ReductionNode* node) 
+
+    +Visitor(LowererImplImperative* impl)
+    +Stmt lower(IndexStmt stmt)
+    +Expr lower(IndexExpr expr)
+}
+
+note bottom of Visitor:   Stmt lower(IndexStmt stmt) {\n  this->stmt = Stmt();\n  impl->accessibleIterators.scope();\n  IndexStmtVisitorStrict::visit(stmt);\n  impl->accessibleIterators.unscope();\n  return this->stmt;\n}
+
+IndexNotationVisitorStrict <|- - Visitor
+LowererImpl "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImpl : contains
+
+LowererImpl <|- - LowererImplImperative
+LowererImplImperative "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImplImperative : contains
+
+@enduml
+
+PlantUML version 1.2021.7(Sun May 23 08:40:07 EDT 2021)
+(GPL source distribution)
+Java Runtime: OpenJDK Runtime Environment
+JVM: OpenJDK 64-Bit Server VM
+Default Encoding: ANSI_X3.4-1968
+Language: en
+Country: US
+--></g></svg>
\ No newline at end of file
diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp
index f0c09d98a..6ec54a2f8 100644
--- a/src/codegen/codegen.cpp
+++ b/src/codegen/codegen.cpp
@@ -2,6 +2,7 @@
 #include "taco/cuda.h"
 #include "codegen_cuda.h"
 #include "codegen_c.h"
+#include "codegen_ispc.h"
 #include <algorithm>
 #include <unordered_set>
 
@@ -26,6 +27,21 @@ shared_ptr<CodeGen> CodeGen::init_default(std::ostream &dest, OutputKind outputK
   if (should_use_CUDA_codegen()) {
     return make_shared<CodeGen_CUDA>(dest, outputKind);
   }
+  else if (should_use_ISPC_codegen()) {
+    return make_shared<CodeGen_ISPC>(dest, outputKind);
+  }
+  else {
+    return make_shared<CodeGen_C>(dest, outputKind);
+  }
+}
+
+shared_ptr<CodeGen> CodeGen::init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind) {
+  if (should_use_CUDA_codegen()) {
+    return make_shared<CodeGen_CUDA>(dest, outputKind);
+  }
+  else if (should_use_ISPC_codegen()) {
+    return make_shared<CodeGen_ISPC>(dest, dest2, outputKind);
+  }
   else {
     return make_shared<CodeGen_C>(dest, outputKind);
   }
@@ -229,6 +245,49 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool
   return ret.str();
 }
 
+string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op,  // Returns a declaration string for `varname`, selected by op->property.
+                            bool is_output_prop) {  // is_output_prop is not read anywhere in this body
+  stringstream ret;  // accumulates the text that is returned
+  ret << "";  // streams an empty string; adds nothing to the result
+
+  auto tensor = op->tensor.as<Var>();
+  if (op->property == TensorProperty::Values) {
+    // Values: "uniform <printType(tensor->type, false)> <varname>[]" ("uniform" is presumably the ISPC qualifier -- TODO confirm)
+    ret << "uniform " << printType(tensor->type, false) << " " << varname << "[]";
+    return ret.str();
+  } else if (op->property == TensorProperty::ValuesSize) {
+    ret << "int32 " << varname;  // NOTE(review): no "uniform" here, unlike the Values branch -- confirm intended
+    return ret.str();
+  }
+
+  // Remaining properties: Dimension is emitted as a scalar "<type> <varname>",
+  // Indices (asserted in the else branch) as an array "<type> <varname>[]".
+  // The integer type text depends on op->type: Int32, Int64, or a fallback.
+  if (op->property == TensorProperty::Dimension) {
+    if (op->type == Int32) {
+      ret << "uniform int32 ";
+    } else if (op->type == Int64) {
+      ret << "uniform int64 ";
+    } else {
+      ret << "int ";  // NOTE(review): plain "int" without "uniform", unlike the Indices fallback below -- confirm intended
+    }
+    ret << varname;
+    
+  } else {
+    taco_iassert(op->property == TensorProperty::Indices);  // any property other than Indices reaching here trips this assert
+    if (op->type == Int32) {
+      ret << "uniform int32 ";
+    } else if (op->type == Int64) {
+      ret << "uniform int64 ";
+    } else {
+      ret << "uniform int ";
+    }
+    ret << varname << "[]";  // Indices are declared with an unsized array suffix
+  }
+
+  return ret.str();
+}
+
 string CodeGen::unpackTensorProperty(string varname, const GetProperty* op,
                             bool is_output_prop) {
   stringstream ret;
@@ -310,13 +369,9 @@ string CodeGen::pointTensorProperty(std::string varname) {
   return ret.str();
 }
 
-// helper to print declarations
-string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
-                           vector<Expr> inputs, vector<Expr> outputs) {
-  stringstream ret;
-  unordered_set<string> propsAlreadyGenerated;
-
-  vector<const GetProperty*> sortedProps;
+void CodeGen::getSortedProps(map<Expr, string, ExprCompare> &varMap,
+              vector<const GetProperty*> &sortedProps, vector<Expr> &inputs,
+              vector<Expr> &outputs) {
 
   for (auto const& p: varMap) {
     if (p.first.as<GetProperty>())
@@ -355,6 +410,17 @@ string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
          return a->index < b->index;
        });
 
+}
+
+// helper to print declarations
+string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
+                           vector<Expr> inputs, vector<Expr> outputs) {
+  stringstream ret;
+  unordered_set<string> propsAlreadyGenerated;
+
+  vector<const GetProperty*> sortedProps;
+  getSortedProps(varMap, sortedProps, inputs, outputs);
+
   for (auto prop: sortedProps) {
     bool isOutputProp = (find(outputs.begin(), outputs.end(),
                               prop->tensor) != outputs.end());
@@ -375,7 +441,6 @@ string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
   return ret.str();
 }
 
-
 string CodeGen::printPack(map<tuple<Expr, TensorProperty, int, int>,
         string> outputProperties, vector<Expr> outputs) {
   stringstream ret;
diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h
index cc25c80d6..db891f995 100644
--- a/src/codegen/codegen.h
+++ b/src/codegen/codegen.h
@@ -16,9 +16,13 @@ class CodeGen : public IRPrinter {
   enum CodeGenType { C, CUDA };
 
   CodeGen(std::ostream& stream, CodeGenType type) : IRPrinter(stream), codeGenType(type) {};
-  CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) : IRPrinter(stream, color, simplify), codeGenType(type) {};
+  CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) 
+    : IRPrinter(stream, color, simplify), codeGenType(type) {};
+  CodeGen(std::ostream& stream, std::ostream& stream2, bool color, bool simplify, CodeGenType type) 
+    : IRPrinter(stream, stream2, color, simplify), codeGenType(type) {};
   /// Initialize the default code generator
   static std::shared_ptr<CodeGen> init_default(std::ostream &dest, OutputKind outputKind);
+  static std::shared_ptr<CodeGen> init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind);
 
   /// Compile a lowered function
   virtual void compile(Stmt stmt, bool isFirst=false) =0;
@@ -26,6 +30,9 @@ class CodeGen : public IRPrinter {
 protected:
   static bool checkForAlloc(const Function *func);
   static int countYields(const Function *func);
+  void getSortedProps(std::map<Expr, std::string, ExprCompare> &varMap,
+              std::vector<const GetProperty*> &sortedProps, std::vector<Expr> &inputs,
+              std::vector<Expr> &outputs);
 
   static std::string printCType(Datatype type, bool is_ptr);
   static std::string printCUDAType(Datatype type, bool is_ptr);
@@ -52,6 +59,10 @@ class CodeGen : public IRPrinter {
   std::string printFuncName(const Function *func, 
           std::map<Expr, std::string, ExprCompare> inputMap={}, 
           std::map<Expr, std::string, ExprCompare> outputMap={});
+  
+  std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr);
+  std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op,
+                              bool is_output_prop); 
 
   void resetUniqueNameCounters();
   std::string genUniqueName(std::string name);
@@ -61,9 +72,8 @@ class CodeGen : public IRPrinter {
 private:
   virtual std::string restrictKeyword() const { return ""; }
 
-  std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr);
   std::string unpackTensorProperty(std::string varname, const GetProperty* op,
-                              bool is_output_prop);
+                              bool is_output_prop); 
   std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property,
                             int mode, int index);
   std::string pointTensorProperty(std::string varname);
diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp
index 2ade9d7f6..83da7aaab 100644
--- a/src/codegen/codegen_c.cpp
+++ b/src/codegen/codegen_c.cpp
@@ -34,6 +34,7 @@ const string cHeaders =
   "#include <math.h>\n"
   "#include <complex.h>\n"
   "#include <string.h>\n"
+  "#include <omp.h>\n"
   "#if _OPENMP\n"
   "#include <omp.h>\n"
   "#endif\n"
@@ -240,7 +241,10 @@ class CodeGen_C::FindVars : public IRVisitor {
 };
 
 CodeGen_C::CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify)
-    : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {}
+    : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {}
+  
+CodeGen_C::CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify)
+    : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {}
 
 CodeGen_C::~CodeGen_C() {}
 
@@ -299,14 +303,18 @@ void CodeGen_C::visit(const Function* func) {
 
   // Print variable declarations
   out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
+  // out << "printf(\"declarations added\\n\");" << std::endl;
 
   if (emittingCoroutine) {
     out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
         << endl;
   }
+  // out << "printf(\"declarations added2\\n\");" << std::endl;
 
   // output body
   print(func->body);
+  // out << "printf(\"function body added " << count++ << "\\n\"); // " << std::endl;
+
 
   // output repack only if we allocated memory
   if (checkForAlloc(func))
@@ -403,6 +411,9 @@ static string getAtomicPragma() {
 // Docs for vectorization pragmas:
 // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations
 void CodeGen_C::visit(const For* op) {
+
+  // out << "    printf(\"adding for loop " << count++ << "\\n\"); //" << std::endl;
+
   switch (op->kind) {
     case LoopKind::Vectorized:
       doIndent();
@@ -452,6 +463,14 @@ void CodeGen_C::visit(const For* op) {
   }
   stream << ") {\n";
 
+  // out << "  printf(\"loop " << count++ << " : %d  , dim: %d, %d\\n\",";
+  // op->var.accept(this);
+  // out << ", ";
+  // op->start.accept(this);
+  // out << ", ";
+  // op->end.accept(this);
+  // out << "); // " << count++ << std::endl;
+
   op->contents.accept(this);
   doIndent();
   stream << "}";
@@ -472,6 +491,7 @@ void CodeGen_C::visit(const While* op) {
 }
 
 void CodeGen_C::visit(const GetProperty* op) {
+  // std::cout << "GetProperty* " << op << std::endl;
   taco_iassert(varMap.count(op) > 0) <<
       "Property " << Expr(op) << " of " << op->tensor << " not found in varMap";
   out << varMap[op];
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 55c9d01a8..c8505a3bb 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -16,6 +16,7 @@ class CodeGen_C : public CodeGen {
   /// Initialize a code generator that generates code to an
   /// output stream.
   CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify=true);
+  CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true);
   ~CodeGen_C();
 
   /// Compile a lowered function
@@ -28,23 +29,25 @@ class CodeGen_C : public CodeGen {
 protected:
   using IRPrinter::visit;
 
-  void visit(const Function*);
-  void visit(const VarDecl*);
-  void visit(const Yield*);
-  void visit(const Var*);
-  void visit(const For*);
-  void visit(const While*);
-  void visit(const GetProperty*);
-  void visit(const Min*);
-  void visit(const Max*);
-  void visit(const Allocate*);
-  void visit(const Sqrt*);
-  void visit(const Store*);
-  void visit(const Assign*);
+  virtual void visit(const Function*);
+  virtual void visit(const VarDecl*);
+  virtual void visit(const Yield*);
+  virtual void visit(const Var*);
+  virtual void visit(const For*);
+  virtual void visit(const While*);
+  virtual void visit(const GetProperty*);
+  virtual void visit(const Min*);
+  virtual void visit(const Max*);
+  virtual void visit(const Allocate*);
+  virtual void visit(const Sqrt*);
+  virtual void visit(const Store*);
+  virtual void visit(const Assign*);
 
   std::map<Expr, std::string, ExprCompare> varMap;
   std::vector<Expr> localVars;
   std::ostream &out;
+  std::ostream &out2;
+  int count = 0;
   
   OutputKind outputKind;
 
diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
index 77cf0cd88..14505f740 100644
--- a/src/codegen/codegen_cuda.cpp
+++ b/src/codegen/codegen_cuda.cpp
@@ -646,6 +646,7 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) {
   // Collect device functions
   resetUniqueNameCounters();
   deviceFunctionLoopDepth = 0;
+  // The DeviceFunctionCollector gathers the device functions (blockFors) here
   DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this);
   func->body.accept(&deviceFunctionCollector);
   deviceFunctions = deviceFunctionCollector.blockFors;
diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp
new file mode 100644
index 000000000..d4f428ccf
--- /dev/null
+++ b/src/codegen/codegen_ispc.cpp
@@ -0,0 +1,1097 @@
+#include <iostream>
+#include <fstream>
+#include <dlfcn.h>
+#include <algorithm>
+#include <unordered_set>
+#include <taco.h>
+
+#include "taco/cuda.h"
+#include "taco/ir/ir_printer.h"
+#include "taco/ir/ir_visitor.h"
+#include "taco/ir/ir_rewriter.h"
+#include "taco/ir/simplify.h"
+
+#include "codegen_c.h"
+#include "codegen_ispc.h"
+#include "taco/error.h"
+#include "taco/util/strings.h"
+#include "taco/util/collections.h"
+
+using namespace std;
+
+namespace taco {
+namespace ir {
+
+// Some helper functions
+namespace {
+
+// Include stdio.h for printf
+// stdlib.h for malloc/realloc
+// math.h for sqrt
+// MIN preprocessor macro
+// This *must* be kept in sync with taco_tensor_t.h
+const string cHeaders =
+  "#ifndef TACO_C_HEADERS\n"
+  "#define TACO_C_HEADERS\n"
+  "#include <stdio.h>\n"
+  "#include <stdlib.h>\n"
+  "#include <stdint.h>\n"
+  "#include <stdbool.h>\n"
+  "#include <math.h>\n"
+  "#include <complex.h>\n"
+  "#include <string.h>\n"
+  "#if _OPENMP\n"
+  "#include <omp.h>\n"
+  "#endif\n"
+  "#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n"
+  "#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n"
+  "#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n"
+  "#ifndef TACO_TENSOR_T_DEFINED\n"
+  "#define TACO_TENSOR_T_DEFINED\n"
+  "typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;\n"
+  "typedef struct {\n"
+  "  int32_t      order;         // tensor order (number of modes)\n"
+  "  int32_t*     dimensions;    // tensor dimensions\n"
+  "  int32_t      csize;         // component size\n"
+  "  int32_t*     mode_ordering; // mode storage ordering\n"
+  "  taco_mode_t* mode_types;    // mode storage types\n"
+  "  uint8_t***   indices;       // tensor index data (per mode)\n"
+  "  uint8_t*     vals;          // tensor values\n"
+  "  int32_t      vals_size;     // values array size\n"
+  "} taco_tensor_t;\n"
+  "#endif\n"
+  "#if !_OPENMP\n"
+  "int omp_get_thread_num() { return 0; }\n"
+  "int omp_get_max_threads() { return 1; }\n"
+  "#endif\n"
+  "int cmp(const void *a, const void *b) {\n"
+  "  return *((const int*)a) - *((const int*)b);\n"
+  "}\n"
+  "int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n"
+  "  if (array[arrayStart] >= target) {\n"
+  "    return arrayStart;\n"
+  "  }\n"
+  "  int lowerBound = arrayStart; // always < target\n"
+  "  int upperBound = arrayEnd; // always >= target\n"
+  "  while (upperBound - lowerBound > 1) {\n"
+  "    int mid = (upperBound + lowerBound) / 2;\n"
+  "    int midValue = array[mid];\n"
+  "    if (midValue < target) {\n"
+  "      lowerBound = mid;\n"
+  "    }\n"
+  "    else if (midValue > target) {\n"
+  "      upperBound = mid;\n"
+  "    }\n"
+  "    else {\n"
+  "      return mid;\n"
+  "    }\n"
+  "  }\n"
+  "  return upperBound;\n"
+  "}\n"
+  "int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n"
+  "  if (array[arrayEnd] <= target) {\n"
+  "    return arrayEnd;\n"
+  "  }\n"
+  "  int lowerBound = arrayStart; // always <= target\n"
+  "  int upperBound = arrayEnd; // always > target\n"
+  "  while (upperBound - lowerBound > 1) {\n"
+  "    int mid = (upperBound + lowerBound) / 2;\n"
+  "    int midValue = array[mid];\n"
+  "    if (midValue < target) {\n"
+  "      lowerBound = mid;\n"
+  "    }\n"
+  "    else if (midValue > target) {\n"
+  "      upperBound = mid;\n"
+  "    }\n"
+  "    else {\n"
+  "      return mid;\n"
+  "    }\n"
+  "  }\n"
+  "  return lowerBound;\n"
+  "}\n"
+  "taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,\n"
+  "                                  int32_t* dimensions, int32_t* mode_ordering,\n"
+  "                                  taco_mode_t* mode_types) {\n"
+  "  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));\n"
+  "  t->order         = order;\n"
+  "  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));\n"
+  "  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));\n"
+  "  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));\n"
+  "  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));\n"
+  "  t->csize         = csize;\n"
+  "  for (int32_t i = 0; i < order; i++) {\n"
+  "    t->dimensions[i]    = dimensions[i];\n"
+  "    t->mode_ordering[i] = mode_ordering[i];\n"
+  "    t->mode_types[i]    = mode_types[i];\n"
+  "    switch (t->mode_types[i]) {\n"
+  "      case taco_mode_dense:\n"
+  "        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));\n"
+  "        break;\n"
+  "      case taco_mode_sparse:\n"
+  "        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));\n"
+  "        break;\n"
+  "    }\n"
+  "  }\n"
+  "  return t;\n"
+  "}\n"
+  "void deinit_taco_tensor_t(taco_tensor_t* t) {\n"
+  "  for (int i = 0; i < t->order; i++) {\n"
+  "    free(t->indices[i]);\n"
+  "  }\n"
+  "  free(t->indices);\n"
+  "  free(t->dimensions);\n"
+  "  free(t->mode_ordering);\n"
+  "  free(t->mode_types);\n"
+  "  free(t);\n"
+  "}\n"
+  "#endif\n";
+
+const string ispcHeaders = 
+  "#define __TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n"
+  "#define __TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n"
+  "#define __TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n"
+  "int __cmp(const void *a, const void *b) {\n"
+  "  return *((const int*)a) - *((const int*)b);\n"
+  "}\n"
+  "int __taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n"
+  "  if (array[arrayStart] >= target) {\n"
+  "    return arrayStart;\n"
+  "  }\n"
+  "  int lowerBound = arrayStart; // always < target\n"
+  "  int upperBound = arrayEnd; // always >= target\n"
+  "  while (upperBound - lowerBound > 1) {\n"
+  "    int mid = (upperBound + lowerBound) / 2;\n"
+  "    int midValue = array[mid];\n"
+  "    if (midValue < target) {\n"
+  "      lowerBound = mid;\n"
+  "    }\n"
+  "    else if (midValue > target) {\n"
+  "      upperBound = mid;\n"
+  "    }\n"
+  "    else {\n"
+  "      return mid;\n"
+  "    }\n"
+  "  }\n"
+  "  return upperBound;\n"
+  "}\n"
+  "int __taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n"
+  "  if (array[arrayEnd] <= target) {\n"
+  "    return arrayEnd;\n"
+  "  }\n"
+  "  int lowerBound = arrayStart; // always <= target\n"
+  "  int upperBound = arrayEnd; // always > target\n"
+  "  while (upperBound - lowerBound > 1) {\n"
+  "    int mid = (upperBound + lowerBound) / 2;\n"
+  "    int midValue = array[mid];\n"
+  "    if (midValue < target) {\n"
+  "      lowerBound = mid;\n"
+  "    }\n"
+  "    else if (midValue > target) {\n"
+  "      upperBound = mid;\n"
+  "    }\n"
+  "    else {\n"
+  "      return mid;\n"
+  "    }\n"
+  "  }\n"
+  "  return lowerBound;\n"
+  "}\n\n\n";
+
+} // anonymous namespace
+
+
+
+// find variables for generating declarations
+// generates a single var for each GetProperty
+class CodeGen_ISPC::FindVars : public IRVisitor {
+public:
+  map<Expr, string, ExprCompare> varMap;
+
+  // the variables for which we need to add declarations
+  map<Expr, string, ExprCompare> varDecls;
+
+  vector<Expr> localVars;
+
+  // this maps from tensor, property, mode, index to the unique var
+  map<tuple<Expr, TensorProperty, int, int>, string> canonicalPropertyVar;
+
+  // this is for convenience, recording just the properties unpacked
+  // from the output tensor so we can re-save them at the end
+  map<tuple<Expr, TensorProperty, int, int>, string> outputProperties;
+
+  // TODO: should replace this with an unordered set
+  vector<Expr> outputTensors;
+  vector<Expr> inputTensors;
+
+  CodeGen_ISPC *codeGen;
+
+  // copy inputs and outputs into the map
+  FindVars(vector<Expr> inputs, vector<Expr> outputs, CodeGen_ISPC *codeGen)
+  : codeGen(codeGen) {
+    for (auto v: inputs) {
+      auto var = v.as<Var>();
+      taco_iassert(var) << "Inputs must be vars in codegen";
+      taco_iassert(varMap.count(var)==0) << "Duplicate input found in codegen";
+      inputTensors.push_back(v);
+      varMap[var] = var->name;
+    }
+    for (auto v: outputs) {
+      auto var = v.as<Var>();
+      taco_iassert(var) << "Outputs must be vars in codegen";
+      taco_iassert(varMap.count(var)==0) << "Duplicate output found in codegen";
+      outputTensors.push_back(v);
+      varMap[var] = var->name;
+    }
+  }
+
+protected:
+  using IRVisitor::visit;
+
+  virtual void visit(const Var *op) {
+    if (varMap.count(op) == 0) {
+      varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name);
+    }
+  }
+
+  virtual void visit(const VarDecl *op) {
+    if (!util::contains(localVars, op->var)) {
+      localVars.push_back(op->var);
+    }
+    op->var.accept(this);
+    op->rhs.accept(this);
+  }
+
+  virtual void visit(const For *op) {
+    if (!util::contains(localVars, op->var)) {
+      localVars.push_back(op->var);
+    }
+    op->var.accept(this);
+    op->start.accept(this);
+    op->end.accept(this);
+    op->increment.accept(this);
+    op->contents.accept(this);
+  }
+
+  virtual void visit(const GetProperty *op) {
+    if (!util::contains(inputTensors, op->tensor) &&
+        !util::contains(outputTensors, op->tensor)) {
+      // Don't create header unpacking code for temporaries
+      return;
+    }
+
+    if (varMap.count(op) == 0) {
+      auto key =
+              tuple<Expr,TensorProperty,int,int>(op->tensor,op->property,
+                                                 (size_t)op->mode,
+                                                 (size_t)op->index);
+      if (canonicalPropertyVar.count(key) > 0) {
+        varMap[op] = canonicalPropertyVar[key];
+      } else {
+        auto unique_name = codeGen->genUniqueName(op->name);
+        canonicalPropertyVar[key] = unique_name;
+        varMap[op] = unique_name;
+        varDecls[op] = unique_name;
+        if (util::contains(outputTensors, op->tensor)) {
+          outputProperties[key] = unique_name;
+        }
+      }
+    }
+  }
+};
+
+
+// Finds all for loops tagged ParallelUnit::CPUSpmd (stored in threadFors) and
+// all LoopKind::Init loops (stored in initFors). Also tracks which variables
+// are in scope and which ones must be passed as parameters to the function.
+class CodeGen_ISPC::FunctionCollector : public IRVisitor {
+public:
+  vector<Stmt> threadFors; // contents is device function
+  vector<Stmt> initFors;  // for loops to initialize statements
+  map<Expr, string, ExprCompare> scopeMap;
+
+  // the variables to pass to each device function
+  vector<vector<pair<string, Expr>>> functionParameters;
+  vector<pair<string, Expr>> currentParameters; // keep as vector so code generation is deterministic
+  set<Expr> currentParameterSet;
+
+  set<Expr> variablesDeclaredInKernel;
+
+  vector<pair<string, Expr>> threadIDVars;
+  vector<pair<string, Expr>> blockIDVars;
+  vector<pair<string, Expr>> warpIDVars;
+  vector<Expr> numThreads;
+  vector<Expr> numWarps;
+
+  CodeGen_ISPC *codeGen;
+  // copy inputs and outputs into the map
+  FunctionCollector(vector<Expr> inputs, vector<Expr> outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen)  {
+    inDeviceFunction = false;
+    for (auto v: inputs) {
+      auto var = v.as<Var>();
+      taco_iassert(var) << "Inputs must be vars in codegen";
+      taco_iassert(scopeMap.count(var) == 0) <<
+                                             "Duplicate input found in codegen";
+      scopeMap[var] = var->name;
+    }
+    for (auto v: outputs) {
+      auto var = v.as<Var>();
+      taco_iassert(var) << "Outputs must be vars in codegen";
+      taco_iassert(scopeMap.count(var) == 0) <<
+                                             "Duplicate output found in codegen";
+
+      scopeMap[var] = var->name;
+    }
+  }
+
+protected:
+  bool inDeviceFunction;
+  using IRVisitor::visit;
+
+  virtual void visit(const For *op) {
+    if (op->parallel_unit == ParallelUnit::CPUSpmd) {
+      std::cout << "ParallelUnit::CPUSpmd directive found\n";
+
+      inDeviceFunction = false;
+      op->var.accept(this);
+      inDeviceFunction = true;
+
+      threadFors.push_back(op);
+      std::cout << "scopeMap: [" << scopeMap[op->var] << "], varExpr: [" << op->var << "]\n";
+      threadIDVars.push_back(pair<string, Expr>(scopeMap[op->var], op->var));
+      Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment));
+      numThreads.push_back(blockSize);
+
+    }
+    else if (op->parallel_unit == ParallelUnit::CPUSimd) {
+      std::cout << "************************************************************************** CPUSimd For node\n";
+    }
+    else if (op->kind == LoopKind::Init) {
+      std::cout << "************************************************************************* Init loop kind found\n";
+      initFors.push_back(op);
+    }
+    else{
+      op->var.accept(this);
+    }
+    op->start.accept(this);
+    op->end.accept(this);
+    op->increment.accept(this);
+    op->contents.accept(this);
+  }
+
+  virtual void visit(const Var *op) {
+    if (scopeMap.count(op) == 0) {
+      string name = codeGen->genUniqueName(op->name);
+      if (!inDeviceFunction) {
+        scopeMap[op] = name;
+      }
+    }
+    else if (scopeMap.count(op) == 1 && inDeviceFunction && currentParameterSet.count(op) == 0
+            && (threadIDVars.empty() || op != threadIDVars.back().second)
+            && !variablesDeclaredInKernel.count(op)) {
+      currentParameters.push_back(pair<string, Expr>(scopeMap[op], op));
+      currentParameterSet.insert(op);
+    }
+  }
+
+  virtual void visit(const VarDecl *op) {
+    if (inDeviceFunction) {
+      variablesDeclaredInKernel.insert(op->var);
+    }
+    op->var.accept(this);
+    op->rhs.accept(this);
+  }
+
+  virtual void visit(const GetProperty *op) {
+    if (scopeMap.count(op->tensor) == 0 && !inDeviceFunction) {
+      auto key =
+              tuple<Expr,TensorProperty,int,int>(op->tensor,op->property,
+                                                 (size_t)op->mode,
+                                                 (size_t)op->index);
+      auto unique_name = codeGen->genUniqueName(op->name);
+      scopeMap[op->tensor] = unique_name;
+    }
+    else if (scopeMap.count(op->tensor) == 1 && inDeviceFunction && currentParameterSet.count(op->tensor) == 0) {
+      currentParameters.push_back(pair<string, Expr>(op->tensor.as<Var>()->name, op->tensor));
+      currentParameterSet.insert(op->tensor);
+    }
+  }
+};
+
+
+CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify)
+    : CodeGen_C(dest, dest, outputKind, simplify) {}
+
+CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify)
+    : CodeGen_C(dest, dest2, outputKind, simplify) {}
+
+CodeGen_ISPC::~CodeGen_ISPC() {}
+
+void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) {
+  varMap = {};
+  localVars = {};
+
+  if (isFirst) {
+    // output the headers
+    out << cHeaders;
+
+    if (&out != &out2) {
+      out2 << ispcHeaders;
+    }
+  }
+  out << endl;
+  // generate code for the Stmt
+  std::cout << "Compiling the code\n";
+  stmt.accept(this);
+}
+
+
+
+string CodeGen_ISPC::printCallISPCFunc(const std::string& funcName, map<Expr, string, ExprCompare> varMap,
+                                  vector<const GetProperty*> &sortedProps) {
+  std::stringstream ret;
+  ret << "  ";
+  unordered_set<string> propsAlreadyGenerated;
+
+  ret << "__" << funcName << "(";
+
+
+  for (unsigned long i=0; i < sortedProps.size(); i++) {
+    ret << varMap[sortedProps[i]];
+    if (i != sortedProps.size()-1) {
+      ret << ", ";
+    }
+    propsAlreadyGenerated.insert(varMap[sortedProps[i]]);
+  }
+
+  ret << ");\n";
+  return ret.str();
+}
+
+// sortedProps must already be sorted (via getSortedProps); its order fixes the parameter order.
+void CodeGen_ISPC::printISPCFunc(const Function *func, map<Expr, string, ExprCompare> varMap,
+                                  vector<const GetProperty*> &sortedProps) {
+
+  FunctionCollector functionCollector(func->inputs, func->outputs, this);
+  func->body.accept(&functionCollector);
+
+  vector<Expr> inputs = func->inputs;
+  vector<Expr> outputs = func->outputs;
+  unordered_set<string> propsAlreadyGenerated;
+
+  for (unsigned long i=0; i < sortedProps.size(); i++) {
+    auto prop = sortedProps[i];
+    bool isOutputProp = (find(outputs.begin(), outputs.end(),
+                              prop->tensor) != outputs.end());
+    
+    auto var = prop->tensor.as<Var>();
+    if (var->is_parameter) {
+      if (isOutputProp) {
+        funcVariables << "  " << printTensorProperty(varMap[prop], prop, false) << ";" << endl;
+      } else {
+        break; 
+      }
+    } else {
+      funcVariables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp);
+    }
+    propsAlreadyGenerated.insert(varMap[prop]);
+
+    if (i!=sortedProps.size()-1) {
+      funcVariables << ", ";
+    }
+    if (i%2==0) {
+      funcVariables << "\n\t";
+    }
+  }
+
+  resetUniqueNameCounters();
+
+  // threadFors code generation
+  for (size_t i = 0; i < functionCollector.threadFors.size(); i++) {
+
+    const For *threadloop = to<For>(functionCollector.threadFors[i]);
+    taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd);
+    Stmt function = threadloop->contents;
+    std::cout << "threadloop function: " << function << std::endl;
+
+    out2 << "\nstatic task void __" << func->name << "__ (";
+    out2 << funcVariables.str();
+    out2 << "\n) {\n\n";
+
+    indent++;
+    // output body of the threadloop
+    taskCode = true;
+    print(threadloop);
+    indent--;
+    out2 << "}\n\n";  
+
+  }
+
+  taskCode = false;
+  out2 << "export void __" << func->name << " (";
+  out2 << funcVariables.str();
+  out2 << "\n) {\n\n";
+
+  indent++;
+  // output body
+  print(func->body);
+  indent--;
+  out2 << "}\n";
+  
+}
+
+void CodeGen_ISPC::sendToStream(std::stringstream &stream) {
+  if (is_ISPC_code_stream_enabled()) {
+    this->out2 << stream.str();
+  }
+  else {
+    CodeGen_C::sendToStream(stream);
+  }
+}
+
+void CodeGen_ISPC::visit(const Function* func) {
+  set_ISPC_code_stream_enabled(false);
+
+  // if generating a header, protect the function declaration with a guard
+  if (func->name == "assemble") {
+    if (outputKind == HeaderGen) {
+      out << "#ifndef TACO_GENERATED_" << func->name << "\n";
+      out << "#define TACO_GENERATED_" << func->name << "\n";
+    }
+
+    int numYields = countYields(func);
+    emittingCoroutine = (numYields > 0);
+    funcName = func->name;
+    labelCount = 0;
+
+    resetUniqueNameCounters();
+    FindVars inputVarFinder(func->inputs, {}, this);
+    func->body.accept(&inputVarFinder);
+    FindVars outputVarFinder({}, func->outputs, this);
+    func->body.accept(&outputVarFinder);
+
+    // output function declaration
+    doIndent();
+    out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls);
+
+    // if we're just generating a header, this is all we need to do
+    if (outputKind == HeaderGen) {
+      out << ";\n";
+      out << "#endif\n";
+      return;
+    }
+
+    out << " {\n";
+
+    indent++;
+
+    // find all the vars that are not inputs or outputs and declare them
+    resetUniqueNameCounters();
+    FindVars varFinder(func->inputs, func->outputs, this);
+    func->body.accept(&varFinder);
+    varMap = varFinder.varMap;
+    localVars = varFinder.localVars;
+
+    // Print variable declarations
+    out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
+
+    if (emittingCoroutine) {
+      out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
+          << endl;
+    }
+
+    // output body
+    print(func->body);
+
+    // output repack only if we allocated memory
+    if (checkForAlloc(func))
+      out << endl << printPack(varFinder.outputProperties, func->outputs);
+
+    if (emittingCoroutine) {
+      out << printCoroutineFinish(numYields, funcName);
+    }
+
+    doIndent();
+    out << "return 0;\n";
+    indent--;
+
+    doIndent();
+    out << "}\n";
+    return;
+
+  }
+
+
+  if (outputKind == HeaderGen) {
+    out << "#ifndef TACO_GENERATED_" << func->name << "\n";
+    out << "#define TACO_GENERATED_" << func->name << "\n";
+  }
+
+  int numYields = countYields(func);
+  emittingCoroutine = (numYields > 0);
+  funcName = func->name;
+  labelCount = 0;
+
+  resetUniqueNameCounters();
+  FindVars inputVarFinder(func->inputs, {}, this);
+  func->body.accept(&inputVarFinder);
+  FindVars outputVarFinder({}, func->outputs, this);
+  func->body.accept(&outputVarFinder);
+
+  // output function declaration
+  doIndent();
+  out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls);
+
+  // if we're just generating a header, this is all we need to do
+  if (outputKind == HeaderGen) {
+    out << ";\n";
+    out << "#endif\n";
+    return;
+  }
+
+  out << " {\n";
+
+  indent++;
+
+  // find all the vars that are not inputs or outputs and declare them
+  resetUniqueNameCounters();
+  FindVars varFinder(func->inputs, func->outputs, this);
+  func->body.accept(&varFinder);
+  varMap = varFinder.varMap;
+  localVars = varFinder.localVars;
+
+  // Print variable declarations
+  out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
+
+  sortedProps = {};
+  vector<Expr> inputs = func->inputs;
+  vector<Expr> outputs = func->outputs;
+  getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs);
+  out << printCallISPCFunc(func->name, varFinder.varDecls, sortedProps);
+
+  if (emittingCoroutine) {
+    out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
+        << endl;
+  }
+
+  // output repack only if we allocated memory
+  if (checkForAlloc(func))
+    out << endl << printPack(varFinder.outputProperties, func->outputs);
+
+  if (emittingCoroutine) {
+    out << printCoroutineFinish(numYields, funcName);
+  }
+
+  doIndent();
+  out << "return 0;\n";
+  indent--;
+
+  doIndent();
+  out << "}\n\n";
+
+  set_ISPC_code_stream_enabled(true);
+  printISPCFunc(func, varFinder.varDecls, sortedProps);
+  set_ISPC_code_stream_enabled(false);
+
+}
+
+void CodeGen_ISPC::visit(const VarDecl* op) {
+  // std::stringstream stream;
+  if (is_ISPC_code_stream_enabled()) {
+    if (emittingCoroutine) {
+      doIndent();
+      op->var.accept(this);
+      parentPrecedence = Precedence::TOP;
+      stream2 << " = ";
+      op->rhs.accept(this);
+      stream2 << ";";
+      stream2 << endl;
+    } else {
+      IRPrinter::visit(op);
+    }
+  }
+  else {
+    CodeGen_C::visit(op);   
+  }
+
+  // sendToStream(stream);
+}
+
+void CodeGen_ISPC::visit(const Yield* op) {
+  printYield(op, localVars, varMap, labelCount, funcName);
+}
+
+// For Vars, we replace their names with the generated name,
+// since we match by reference (not name)
+void CodeGen_ISPC::visit(const Var* op) {
+  if (is_ISPC_code_stream_enabled()) {
+    taco_iassert(varMap.count(op) > 0) <<
+        "Var " << op->name << " not found in varMap";
+    if (emittingCoroutine) {
+  //    out << "TACO_DEREF(";
+    }
+    out2 << varMap[op];
+    if (emittingCoroutine) {
+  //    out << ")";
+    }
+  }
+  else {
+    CodeGen_C::visit(op);
+  }
+}
+
+static string genVectorizePragma(int width) {
+  stringstream ret;
+  ret << "#pragma clang loop interleave(enable) ";
+  if (!width)
+    ret << "vectorize(enable)";
+  else
+    ret << "vectorize_width(" << width << ")";
+
+  return ret.str();
+}
+
+// static string getParallelizePragma(LoopKind kind) {
+//   stringstream ret;
+//   ret << "#pragma omp parallel for schedule";
+//   switch (kind) {
+//     case LoopKind::Static:
+//       ret << "(static, 1)";
+//       break;
+//     case LoopKind::Dynamic:
+//       ret << "(dynamic, 1)";
+//       break;
+//     case LoopKind::Runtime:
+//       ret << "(runtime)";
+//       break;
+//     case LoopKind::Static_Chunked:
+//       ret << "(static)";
+//       break;
+//     default:
+//       break;
+//   }
+//   return ret.str();
+// }
+
+// static string getUnrollPragma(size_t unrollFactor) {
+//   return "#pragma unroll " + std::to_string(unrollFactor);
+// }
+
+static string getAtomicPragma() {
+  return "#pragma omp atomic";  // OpenMP atomic directive; visit(Store) prints it before atomic stores
+}
+
+// Prints a For loop. With the ISPC code stream disabled the loop is printed
+// by CodeGen_C. Otherwise the emitted form depends on the loop kind:
+//   - Mul_Thread outside task code: only a launch[4] call of the
+//     "<funcName>__" function is written, and nothing else.
+//   - Mul_Thread inside task code: [start, end) is divided into taskCount
+//     chunks (task 0 also takes the remainder) and a for loop over this
+//     task's chunk is written.
+//   - Foreach: a `foreach (var = start ... end)` loop is written.
+//   - any other kind: a `for (var = start; var < end; var += inc)` loop.
+void CodeGen_ISPC::visit(const For* op) {
+  if (!is_ISPC_code_stream_enabled()) {
+    CodeGen_C::visit(op);
+    return;
+  }
+  doIndent();
+
+  const bool isTaskLoop = (op->kind == LoopKind::Mul_Thread);
+  if (isTaskLoop && !taskCode) {
+    // Launch site: emit only the call that launches the task function.
+    out2 << "launch[4] " << printCallISPCFunc(funcName+"__", varMap, sortedProps) << "\n";
+    return;
+  }
+
+  // Writes "(<end> - <start>)", the number of iterations of the loop.
+  auto emitExtent = [&]() {
+    stream2 << "(";
+    op->end.accept(this);
+    stream2 << " - ";
+    op->start.accept(this);
+    stream2 << ")";
+  };
+
+  // Writes the step that follows the loop variable: "++" for a literal
+  // (signed or unsigned) integer 1, " += <increment>" otherwise.
+  auto emitStep = [&]() {
+    const Literal* step = op->increment.as<Literal>();
+    const bool isIntegral = step != nullptr &&
+                            (step->type.isInt() || step->type.isUInt());
+    if (isIntegral && step->equalsScalar(1)) {
+      stream2 << "++";
+      return;
+    }
+    stream2 << " += ";
+    op->increment.accept(this);
+  };
+
+  // Writes a C-style loop header up to (not including) the closing ")".
+  // With useTaskBounds the bounds are the generated `start`/`end` variables
+  // of the task chunk instead of the loop's own start and end expressions.
+  auto emitCountedHeader = [&](const string& forKeyword, bool useTaskBounds) {
+    stream2 << keywordString(forKeyword) << " (";
+    if (!emittingCoroutine) {
+      // Only Int32 and Int64 loop variables get a type keyword.
+      if (op->var.type() == Int32) {
+        stream2 << "int32 ";
+      }
+      else if (op->var.type() == Int64) {
+        stream2 << "int64 ";
+      }
+    }
+    op->var.accept(this);
+    stream2 << " = ";
+    if (useTaskBounds) {
+      stream2 << "start";
+    }
+    else {
+      op->start.accept(this);
+    }
+    stream2 << keywordString("; ");
+    op->var.accept(this);
+    stream2 << " < ";
+    parentPrecedence = BOTTOM;
+    if (useTaskBounds) {
+      stream2 << "end";
+    }
+    else {
+      op->end.accept(this);
+    }
+    stream2 << keywordString("; ");
+    op->var.accept(this);
+    emitStep();
+  };
+
+  if (isTaskLoop) {
+    // chunk_size and modulo are the quotient and remainder of the extent
+    // divided by taskCount.
+    stream2 << "uniform unsigned int chunk_size = ";
+    emitExtent();
+    stream2 << " / taskCount;\n";
+    stream2 << "  uniform unsigned int modulo = ";
+    emitExtent();
+    stream2 << " % taskCount;\n";
+
+    // Task 0 covers the first chunk_size + modulo iterations; every later
+    // task starts modulo further on and covers chunk_size iterations.
+    stream2 << "  uniform unsigned int start = ";
+    op->start.accept(this);
+    stream2 << " + chunk_size * taskIndex;\n"
+            << "  if (taskIndex != 0) {\n"
+            << "    start += modulo;\n"
+            << "  }\n"
+            << "  uniform unsigned int end = start + chunk_size;\n"
+            << "  if (taskIndex == 0) {\n"
+            << "    end += modulo;\n"
+            << "  }\n\n";
+    emitCountedHeader("  for", true);
+  }
+  else if (op->kind == LoopKind::Foreach) {
+    stream2 << keywordString("foreach") << " (";
+    op->var.accept(this);
+    stream2 << " = ";
+    op->start.accept(this);
+    stream2 << keywordString(" ... ");
+    op->end.accept(this);
+  }
+  else {
+    emitCountedHeader("for", false);
+  }
+
+  // Close the header, print the body, then close the loop on its own line.
+  stream2 << ") {\n";
+  op->contents.accept(this);
+  doIndent();
+  stream2 << "}";
+  stream2 << endl;
+}
+
+// Prints a While loop through CodeGen_C. A loop marked Vectorized is first
+// preceded by the clang vectorization pragma, written to `out` after
+// indenting; it is not clear from the clang documentation that while loops
+// are vectorized, but the pragma is emitted anyway.
+void CodeGen_ISPC::visit(const While* op) {
+  const bool wantsVectorHint = (op->kind == LoopKind::Vectorized);
+  if (wantsVectorHint) {
+    doIndent();
+    out << genVectorizePragma(op->vec_width) << "\n";
+  }
+  CodeGen_C::visit(op);
+}
+
+// Prints a tensor property under the name registered for it in varMap: to
+// out2 when the ISPC code stream is enabled, to out otherwise. A property
+// that is missing from varMap trips the internal assertion.
+void CodeGen_ISPC::visit(const GetProperty* op) {
+  taco_iassert(varMap.count(op) > 0) <<
+      "Property " << Expr(op) << " of " << op->tensor << " not found in varMap";
+
+  const std::string& generatedName = varMap[op];
+  if (is_ISPC_code_stream_enabled()) out2 << generatedName;
+  else out << generatedName;
+}
+
+// Prints min as right-nested TACO_MIN(a,TACO_MIN(b,c)) on `stream`; one operand prints bare.
+void CodeGen_ISPC::visit(const Min* op) {
+  const size_t nested = op->operands.size() - 1;  // number of TACO_MIN calls
+  if (nested == 0) {
+    op->operands[0].accept(this);
+    return;
+  }
+  for (size_t i = 0; i < nested; ++i) {
+    stream << "TACO_MIN(";
+    op->operands[i].accept(this);
+    stream << ",";
+  }
+  op->operands.back().accept(this);
+  stream << string(nested, ')');
+}
+
+// Prints max as right-nested TACO_MAX(a,TACO_MAX(b,c)) on `stream`; one operand prints bare.
+void CodeGen_ISPC::visit(const Max* op) {
+  const size_t nested = op->operands.size() - 1;  // number of TACO_MAX calls
+  if (nested == 0) {
+    op->operands[0].accept(this);
+    return;
+  }
+  for (size_t i = 0; i < nested; ++i) {
+    stream << "TACO_MAX(";
+    op->operands[i].accept(this);
+    stream << ",";
+  }
+  op->operands.back().accept(this);
+  stream << string(nested, ')');
+}
+
+// Prints an allocation of op->num_elements elements of the variable's type.
+// With the ISPC code stream disabled the statement is printed by CodeGen_C.
+// On the ISPC code stream the statement has the form
+//   <var> = new <type>[<num_elements>];
+// for a plain allocation. For a reallocation the prefix "realloc(<var>, "
+// and for a cleared allocation the prefix "calloc(1, " replaces "new ".
+//
+// NOTE(review): the realloc and calloc prefixes open a parenthesis that is
+// never closed (the statement still ends in "];"), so those two forms are
+// emitted with unbalanced parentheses -- confirm the intended ISPC output.
+void CodeGen_ISPC::visit(const Allocate* op) {
+  if (!is_ISPC_code_stream_enabled()) {
+    CodeGen_C::visit(op);
+    return;
+  }
+
+  const string elementType = printCType(op->var.type(), false);
+  doIndent();
+  // Left-hand side: the variable that receives the allocation.
+  op->var.accept(this);
+  stream2 << " = ";
+
+  // Allocation prefix, chosen from the realloc and clear flags.
+  if (op->is_realloc) {
+    stream2 << "realloc(";
+    op->var.accept(this);
+    stream2 << ", ";
+  }
+  else {
+    stream2 << (op->clear ? "calloc(1, " : "new ");
+  }
+
+  // Element type and count; the count is printed at multiplication
+  // precedence and the precedence is reset to TOP afterwards.
+  stream2 << elementType << "[";
+  parentPrecedence = MUL;
+  op->num_elements.accept(this);
+  parentPrecedence = TOP;
+  stream2 << "];";
+  stream2 << endl;
+}
+
+void CodeGen_ISPC::visit(const Sqrt* op) {  // prints "sqrt(<operand>)" to stream
+  taco_tassert(op->type.isFloat() && op->type.getNumBits() == 64) <<  // 64-bit floats only
+      "Codegen doesn't currently support non-double sqrt";
+  stream << "sqrt(";
+  op->a.accept(this);
+  stream << ")";
+}
+
+// Prints an assignment. With the ISPC code stream disabled the statement is
+// printed by CodeGen_C. Otherwise, when `simplify` is set and the right-hand
+// side is `lhs + x`, `lhs * x` or `lhs | x`, a compound form is printed (++
+// for a literal integer 1, +=, *=, |=; with use_atomics the += operand is
+// wrapped in reduce_add); anything else is printed as `lhs = rhs`.
+// The statement is written exactly once: it must not also be passed to
+// IRPrinter::visit, which would print the assignment a second time.
+void CodeGen_ISPC::visit(const Assign* op) {
+  if (!is_ISPC_code_stream_enabled()) {
+    CodeGen_C::visit(op);
+    return;
+  }
+
+  doIndent();
+  op->lhs.accept(this);
+  parentPrecedence = Precedence::TOP;
+  bool printed = false;  // set once a compound form has been printed
+  if (simplify) {
+    if (isa<ir::Add>(op->rhs)) {
+      auto add = to<Add>(op->rhs);
+      if (add->a == op->lhs) {
+        const Literal* lit = add->b.as<Literal>();
+        if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
+                              (lit->type.isUInt() && lit->equalsScalar(1)))) {
+          stream2 << "++";
+        }
+        else if (op->use_atomics) {
+          stream2 << " += reduce_add(";
+          add->b.accept(this);
+          stream2 << ")";
+        }
+        else {
+          stream2 << " += ";
+          add->b.accept(this);
+        }
+        printed = true;
+      }
+    }
+    else if (isa<Mul>(op->rhs)) {
+      auto mul = to<Mul>(op->rhs);
+      if (mul->a == op->lhs) {
+        stream2 << " *= ";
+        mul->b.accept(this);
+        printed = true;
+      }
+    }
+    else if (isa<BitOr>(op->rhs)) {
+      auto bitOr = to<BitOr>(op->rhs);
+      if (bitOr->a == op->lhs) {
+        stream2 << " |= ";
+        bitOr->b.accept(this);
+        printed = true;
+      }
+    }
+  }
+  if (!printed) {
+    stream2 << " = ";
+    op->rhs.accept(this);
+  }
+
+  stream2 << ";";
+  stream2 << endl;
+}
+
+// Prints a store. A store flagged use_atomics is preceded by the atomic
+// pragma on its own line, written to stream2 when the ISPC code stream is
+// enabled and to stream otherwise; the store itself is printed by IRPrinter.
+void CodeGen_ISPC::visit(const Store* op) {
+  if (op->use_atomics) {
+    doIndent();
+    if (is_ISPC_code_stream_enabled()) {
+      stream2 << getAtomicPragma() << endl;
+    }
+    else {
+      stream << getAtomicPragma() << endl;
+    }
+  }
+  IRPrinter::visit(op);
+}
+
+}
+}
diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h
new file mode 100644
index 000000000..62d2897ca
--- /dev/null
+++ b/src/codegen/codegen_ispc.h
@@ -0,0 +1,68 @@
+#ifndef TACO_BACKEND_ISPC_H
+#define TACO_BACKEND_ISPC_H
+#include <map>
+#include <vector>
+#include <stdbool.h>
+
+#include "taco/ir/ir.h"
+#include "taco/ir/ir_printer.h"
+#include "codegen_c.h"
+
+namespace taco {
+namespace ir {
+
+/// Code generator for the ISPC backend, built on top of the C code generator.
+class CodeGen_ISPC : public CodeGen_C {
+public:
+  /// Initialize a code generator that generates code to the output stream
+  /// `dest`; the second overload additionally takes a second stream, `dest2`.
+  CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true);
+  CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true);
+  ~CodeGen_ISPC();
+
+  /// Compile a lowered function
+  void compile(Stmt stmt, bool isFirst=false);
+
+  /// Generate shims that unpack an array of pointers representing
+  /// a mix of taco_tensor_t* and scalars into a function call
+  static void generateShim(const Stmt& func, std::stringstream &stream);
+
+protected:
+  using CodeGen_C::visit;
+  // Node printers declared by this backend; other nodes use CodeGen_C's.
+  void visit(const Function*);
+  void visit(const VarDecl*);
+  void visit(const Yield*);
+  void visit(const Var*);
+  void visit(const For*);
+  void visit(const While*);
+  void visit(const GetProperty*);
+  void visit(const Min*);
+  void visit(const Max*);
+  void visit(const Allocate*);
+  void visit(const Sqrt*);
+  void visit(const Store*);
+  void visit(const Assign*);
+
+  Stmt simplifyFunctionBodies(Stmt stmt);
+  std::string printCallISPCFunc(const std::string& funcName, std::map<Expr, std::string, ExprCompare> varMap,
+                                std::vector<const GetProperty*> &sortedProps);
+  void printISPCFunc(const Function *func, std::map<Expr, std::string, ExprCompare> varMap,
+                                  std::vector<const GetProperty*> &sortedProps);
+  /// When false, a Mul_Thread loop is printed as a task launch, not as a loop.
+  bool taskCode = false;
+
+  std::stringstream funcVariables;
+  std::vector<const GetProperty*> sortedProps;  // passed to printCallISPCFunc
+  // Nested helper classes; only declared here.
+  class FindVars;
+  class FunctionCollector;
+
+private:
+  virtual std::string restrictKeyword() const { return "restrict"; }
+  void sendToStream(std::stringstream &stream);
+};
+
+} // namespace ir
+} // namespace taco
+#endif
diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp
index bd0f487b1..6f631d40e 100644
--- a/src/codegen/module.cpp
+++ b/src/codegen/module.cpp
@@ -4,6 +4,7 @@
 #include <fstream>
 #include <dlfcn.h>
 #include <unistd.h>
+// #include </home/min/a/kadhitha/workspace/my_taco/valgrind/callgrind/callgrind.h>
 #if USE_OPENMP
 #include <omp.h>
 #endif
@@ -13,6 +14,7 @@
 #include "taco/util/strings.h"
 #include "taco/util/env.h"
 #include "codegen/codegen_c.h"
+#include "codegen/codegen_ispc.h"
 #include "codegen/codegen_cuda.h"
 #include "taco/cuda.h"
 
@@ -42,6 +44,7 @@ void Module::addFunction(Stmt func) {
 
 void Module::compileToSource(string path, string prefix) {
   if (!moduleFromUserSource) {
+    std::cout << "module not from user source\n";
   
     // create a codegen instance and add all the funcs
     bool didGenRuntime = false;
@@ -50,11 +53,13 @@ void Module::compileToSource(string path, string prefix) {
     header.clear();
     source.str("");
     source.clear();
+    additional_source.str("");
+    additional_source.clear();
 
     taco_tassert(target.arch == Target::C99) <<
         "Only C99 codegen supported currently";
     std::shared_ptr<CodeGen> sourcegen =
-        CodeGen::init_default(source, CodeGen::ImplementationGen);
+        CodeGen::init_default(source, additional_source, CodeGen::ImplementationGen);
     std::shared_ptr<CodeGen> headergen =
             CodeGen::init_default(header, CodeGen::HeaderGen);
 
@@ -68,8 +73,17 @@ void Module::compileToSource(string path, string prefix) {
   ofstream source_file;
   string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c";
   source_file.open(path+prefix+file_ending);
+  if (should_use_ISPC_codegen()) {
+    source_file << "#include \"" << path+prefix+"_ispc.h\"\n";
+  }
   source_file << source.str();
   source_file.close();
+
+  ofstream additional_source_file;
+  string file_ending2 = ".ispc";
+  additional_source_file.open(path+prefix+file_ending2);
+  additional_source_file << additional_source.str();
+  additional_source_file.close();
   
   ofstream header_file;
   header_file.open(path+prefix+".h");
@@ -89,6 +103,9 @@ void writeShims(vector<Stmt> funcs, string path, string prefix) {
     if (should_use_CUDA_codegen()) {
       CodeGen_CUDA::generateShim(func, shims);
     }
+    // else if (should_use_ISPC_codegen()) {
+    //   CodeGen_ISPC::generateShim(func, shims);
+    // }
     else {
       CodeGen_C::generateShim(func, shims);
     }
@@ -98,6 +115,9 @@ void writeShims(vector<Stmt> funcs, string path, string prefix) {
   if (should_use_CUDA_codegen()) {
     shims_file.open(path+prefix+"_shims.cpp");
   }
+  // else if (should_use_ISPC_codegen()) {
+  //   shims_file.open(path+prefix+".c", ios::app);
+  // }
   else {
     shims_file.open(path+prefix+".c", ios::app);
   }
@@ -109,6 +129,7 @@ void writeShims(vector<Stmt> funcs, string path, string prefix) {
 } // anonymous namespace
 
 string Module::compile() {
+  std::cout << "Module::compile\n";
   string prefix = tmpdir+libname;
   string fullpath = prefix + ".so";
   
@@ -123,6 +144,13 @@ string Module::compile() {
     file_ending = ".cu";
     shims_file = prefix + "_shims.cpp";
   }
+  // else if (should_use_ISPC_codegen()) {
+  //   cc = util::getFromEnv("TACO_ISPC", "ispc");
+  //   cflags = util::getFromEnv("TACO_ISPC_FLAGS",
+  //   " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64"
+  //   ) + " ";
+
+  // }
   else {
     cc = util::getFromEnv(target.compiler_env, target.compiler);
     cflags = util::getFromEnv("TACO_CFLAGS",
@@ -137,17 +165,55 @@ string Module::compile() {
   string cmd = cc + " " + cflags + " " +
     prefix + file_ending + " " + shims_file + " " + 
     "-o " + fullpath + " -lm";
+  std::cout << "--------------------------------------------------------------------------------tmpdir: " << tmpdir << std::endl;
+  std::cout << "--------------------------------------------------------------------------------libname: " << libname << std::endl;
+  std::cout << "--------------------------------------------------------------------------------prefix: " << prefix << std::endl;
+  std::cout << "--------------------------------------------------------------------------------fullpath: " << fullpath << std::endl;
+  std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl;
 
   // open the output file & write out the source
   compileToSource(tmpdir, libname);
+
   
   // write out the shims
   writeShims(funcs, tmpdir, libname);
+  for (auto &statement : funcs) {
+    std::cout << "----- statement --------" << std::endl;
+    // std::cout << statement;
+    std::cout << std::endl;
+  }
+  std::cout << tmpdir << std::endl << libname << std::endl;
   
-  // now compile it
-  int err = system(cmd.data());
-  taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
-    << "\nreturned " << err;
+  if (should_use_ISPC_codegen()) {
+    string ispc = util::getFromEnv("TACO_ISPC", "ispc");
+    string ispcflags = util::getFromEnv("TACO_ISPC_FLAGS",
+    " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64"
+    ) + " ";
+    string cmd = ispc + " " + ispcflags + " -o " + prefix + ".ispc.o " + " --emit-obj " + prefix + ".ispc " + "-h " + prefix + "_ispc.h";
+
+    // now compile the ispc file to generate the object file and the ispc header file
+    std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl;
+    int err = system(cmd.data());
+    taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
+      << "\nreturned " << err;
+
+    string ispc_object_file = " " + prefix + ".ispc.o ";
+    string ispc_object_files_for_diff_targets = " " + prefix + ".ispc_* ";
+    cmd = cc + " " + cflags + " " +
+      prefix + file_ending + " " + ispc_object_file + ispc_object_files_for_diff_targets + shims_file + " " + 
+      "-o " + fullpath + " -lm -lrt ";
+
+    // now compile the c file linking the ispc object file. ispc header is added to the top of the c file
+    std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl;
+    err = system(cmd.data());
+    taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
+      << "\nreturned " << err;
+  } else {
+    // now compile it
+    int err = system(cmd.data());
+    taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
+      << "\nreturned " << err;
+  }
 
   // use dlsym() to open the compiled library
   if (lib_handle) {
@@ -168,10 +234,61 @@ string Module::getSource() {
   return source.str();
 }
 
+// Loads shared object `sofile`, replacing any earlier one, and returns symbol `name` from it.
+void* Module::getFuncPtr(std::string& sofile, std::string name) {
+  std::cout << "opening shared object 1\n";
+  if (so_lib_handle) dlclose(so_lib_handle);
+  std::cout << "opening shared object 2\n";
+  so_lib_handle = dlopen(sofile.data(), RTLD_NOW | RTLD_LOCAL);
+  std::cout << "opening shared object : " << sofile << std::endl;
+  taco_uassert(so_lib_handle) << "Failed to load generated code: " << sofile;  // fail here, not in dlsym
+  return dlsym(so_lib_handle, name.data());
+}
+
 void* Module::getFuncPtr(std::string name) {
   return dlsym(lib_handle, name.data());
 }
 
+// Calls function `name` from shared object `sofile` on the packed arguments and returns
+// its result. With USE_OPENMP the taco schedule (static or dynamic) and thread count are
+// applied for the call and the previous OpenMP settings restored afterwards.
+int Module::callFuncPackedRaw(std::string name, std::string& sofile, void** args) {
+  typedef int (*fnptr_t)(void**);
+  static_assert(sizeof(void*) == sizeof(fnptr_t),
+    "Unable to cast dlsym() returned void pointer to function pointer");
+  void* v_func_ptr = getFuncPtr(sofile, name);
+  taco_uassert(v_func_ptr != nullptr) << "Function " << name << " not found in " << sofile;
+  fnptr_t func_ptr;
+  *reinterpret_cast<void**>(&func_ptr) = v_func_ptr;
+#if USE_OPENMP
+  omp_sched_t existingSched;
+  ParallelSchedule tacoSched;
+  int existingChunkSize, tacoChunkSize;
+  int existingNumThreads = omp_get_max_threads();
+  omp_get_schedule(&existingSched, &existingChunkSize);
+  taco_get_parallel_schedule(&tacoSched, &tacoChunkSize);
+  switch (tacoSched) {
+    case ParallelSchedule::Static:
+      omp_set_schedule(omp_sched_static, tacoChunkSize);
+      break;
+    case ParallelSchedule::Dynamic:
+      omp_set_schedule(omp_sched_dynamic, tacoChunkSize);
+      break;
+    default:
+      break;
+  }
+  omp_set_num_threads(taco_get_num_threads());
+#endif
+  std::cout << "calling the function\n";
+  int ret = func_ptr(args);
+  std::cout << "function call completed\n";
+#if USE_OPENMP
+  omp_set_schedule(existingSched, existingChunkSize);
+  omp_set_num_threads(existingNumThreads);
+#endif
+  return ret;
+}
+
 int Module::callFuncPackedRaw(std::string name, void** args) {
   typedef int (*fnptr_t)(void**);
   static_assert(sizeof(void*) == sizeof(fnptr_t),
@@ -200,7 +317,13 @@ int Module::callFuncPackedRaw(std::string name, void** args) {
   omp_set_num_threads(taco_get_num_threads());
 #endif
 
+  std::cout << "calling the function\n";
+  //   CALLGRIND_START_INSTRUMENTATION;
+  // CALLGRIND_TOGGLE_COLLECT;
   int ret = func_ptr(args);
+  //   CALLGRIND_TOGGLE_COLLECT;
+  // CALLGRIND_STOP_INSTRUMENTATION;
+  std::cout << "function call completed\n";
 
 #if USE_OPENMP
   omp_set_schedule(existingSched, existingChunkSize);
diff --git a/src/cuda.cpp b/src/cuda.cpp
index 059c60105..68e49fe98 100644
--- a/src/cuda.cpp
+++ b/src/cuda.cpp
@@ -7,6 +7,25 @@
 
 using namespace std;
 namespace taco {
+// ISPC backend switches. Code generation starts as ISPC_BUILT; the code-stream flag starts off.
+static bool ISPC_codegen_enabled = ISPC_BUILT;
+static bool ISPC_code_stream_enabled = false;
+bool should_use_ISPC_codegen() {
+  return ISPC_codegen_enabled;
+}
+// Returns the flag that CodeGen_ISPC consults to print to its second (ISPC) stream.
+bool is_ISPC_code_stream_enabled() {
+  return ISPC_code_stream_enabled;
+}
+// Turns ISPC code generation on or off.
+void set_ISPC_codegen_enabled(bool enabled) {
+  ISPC_codegen_enabled = enabled;
+}
+// Sets the flag returned by is_ISPC_code_stream_enabled().
+void set_ISPC_code_stream_enabled(bool enabled) {
+  ISPC_code_stream_enabled = enabled;
+}
+
 /// Functions used by taco to interface with CUDA (especially unified memory)
 static bool CUDA_codegen_enabled = CUDA_BUILT;
 static bool CUDA_unified_memory_enabled = CUDA_BUILT;
diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp
index 51fb8770c..2e26460c7 100644
--- a/src/index_notation/index_notation.cpp
+++ b/src/index_notation/index_notation.cpp
@@ -2438,6 +2438,7 @@ bool isConcreteNotation(IndexStmt stmt, std::string* reason) {
   return isConcrete;
 }
 
+// make reduction notation
 Assignment makeReductionNotation(Assignment assignment) {
   IndexExpr expr = assignment.getRhs();
   std::vector<IndexVar> free = assignment.getLhs().getIndexVars();
@@ -2513,7 +2514,10 @@ IndexStmt makeReductionNotation(IndexStmt stmt) {
   return makeReductionNotation(to<Assignment>(stmt));
 }
 
+// make concrete notation
 IndexStmt makeConcreteNotation(IndexStmt stmt) {
+  // std::cout << "concrete notation original assignment: " << stmt << std::endl;
+
   std::string reason;
   taco_iassert(isReductionNotation(stmt, &reason))
       << "Not reduction notation: " << stmt << std::endl << reason;
@@ -2521,6 +2525,7 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) {
 
   // Free variables and reductions covering the whole rhs become top level loops
   vector<IndexVar> freeVars = to<Assignment>(stmt).getFreeVars();
+  std::cout << "free vars: " << freeVars << std::endl;
 
   struct RemoveTopLevelReductions : IndexNotationRewriter {
     using IndexNotationRewriter::visit;
@@ -2535,12 +2540,17 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) {
         topLevelReductions.push_back(reduction.getVar());
         rhs = reduction.getExpr();
       }
+      // std::cout << "top level reductions: " << topLevelReductions << std::endl;
 
       if (rhs != node->rhs) {
-        stmt = Assignment(node->lhs, rhs, Add());
+        stmt = Assignment(node->lhs, rhs, Add()); // write with add
+        int idx = 0;
         for (auto& i : util::reverse(topLevelReductions)) {
+          std::cout << idx << ": " << stmt << std::endl;
+          idx++;
           stmt = forall(i, stmt);
         }
+        std::cout << idx << ": " << stmt << std::endl;
       }
       else {
         stmt = node;
@@ -2548,11 +2558,18 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) {
     }
   };
   stmt = RemoveTopLevelReductions().rewrite(stmt);
+  // std::cout << "after remove top level reductions: " << stmt << std::endl;
 
+  // now we form the stmt in reverse order of freeVars
+  int idx = 0;
   for (auto& i : util::reverse(freeVars)) {
+    std::cout << idx << ": " << stmt << std::endl;
     stmt = forall(i, stmt);
+    idx++;
   }
+  std::cout << idx << ": " << stmt << std::endl;
 
+  std::cout << "replacing reductions with whereas statements\n";
   // Replace other reductions with where and forall statements
   struct ReplaceReductionsWithWheres : IndexNotationRewriter {
     using IndexNotationRewriter::visit;
diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp
index 0b41615ad..d7ee998ae 100644
--- a/src/index_notation/index_notation_printer.cpp
+++ b/src/index_notation/index_notation_printer.cpp
@@ -224,9 +224,9 @@ void IndexNotationPrinter::visit(const YieldNode* op) {
 void IndexNotationPrinter::visit(const ForallNode* op) {
   os << "forall(" << op->indexVar << ", ";
   op->stmt.accept(this);
-  if (op->parallel_unit != ParallelUnit::NotParallel) {
+  // if (op->parallel_unit != ParallelUnit::NotParallel) {
     os << ", " << ParallelUnit_NAMES[(int) op->parallel_unit] << ", " << OutputRaceStrategy_NAMES[(int) op->output_race_strategy];
-  }
+  // }
   os << ")";
 }
 
diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp
index 47fc1dd55..c1d82a9fd 100644
--- a/src/index_notation/transformations.cpp
+++ b/src/index_notation/transformations.cpp
@@ -1,9 +1,16 @@
 #include "taco/index_notation/transformations.h"
 
+#include "lower/iteration_graph.h"
+#include "lower/tensor_path.h"
+#include "taco/cuda.h"
 #include "taco/index_notation/index_notation.h"
+#include "taco/index_notation/index_notation_nodes_abstract.h"
 #include "taco/index_notation/index_notation_rewriter.h"
 #include "taco/index_notation/index_notation_nodes.h"
+#include "taco/index_notation/index_notation_printer.h"
 #include "taco/error/error_messages.h"
+#include "taco/index_notation/intrinsic.h"
+#include "taco/type.h"
 #include "taco/util/collections.h"
 #include "taco/lower/iterator.h"
 #include "taco/lower/merge_lattice.h"
@@ -305,6 +312,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const {
         IndexExpr e = precompute.getExpr();
         IndexVar iw = precompute.getiw();
 
+        // these lines of code look interesting when creating the producer-consumer relationship
         IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}}));
         IndexStmt producer = forall(iw, Assignment(ws(iw), replace(e, {{i,iw}}), 
                                                    assign.getOperator()));
@@ -592,7 +600,10 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
     std::string reason = "";
 
     IndexStmt rewriteParallel(IndexStmt stmt) {
+      std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
+      // std::cout << stmt << std::endl;
       provGraph = ProvenanceGraph(stmt);
+      std::cout << "2 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
 
       const auto reductionVars = getReductionVars(stmt);
       reductionIndexVars.clear();
@@ -607,15 +618,22 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
       tensorVars = createIRTensorVars(stmt);
 
       assembledByUngroupedInsert.clear();
+      std::cout << "3 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
       for (const auto& result : getAssembledByUngroupedInsertion(stmt)) {
         assembledByUngroupedInsert.push_back(tensorVars[result]);
       }
 
+      std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
+      // std::cout << stmt << std::endl;
       return rewrite(stmt);
     }
 
     void visit(const ForallNode* node) {
+      std::cout << "transformations.cpp void visit(const ForallNode* node)\n";
+      std::cout << "node: \n" << node << std::endl;
       Forall foralli(node);
+      std::cout << "foralli: \n" << foralli << std::endl;
+      std::cout << "before stmt update stmt: \n" << stmt << std::endl;
       IndexVar i = parallelize.geti();
 
       definedIndexVars.insert(foralli.getIndexVar());
@@ -632,6 +650,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
         Iterators iterators(foralli, tensorVars);
         MergeLattice lattice = MergeLattice::make(foralli, iterators, provGraph, 
                                                   definedIndexVars);
+        std::cout << "iter: " << i << ", lattice: \n" << lattice << std::endl;
 
         // Precondition 2: No coiteration of modes (i.e., merge lattice has 
         //                 only one iterator)
@@ -660,6 +679,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
         MergeLattice underivedLattice = MergeLattice::make(underivedForall, 
                                                            iterators, provGraph, 
                                                            definedIndexVars);
+        std::cout << "iter: " << i << ", underivedLattice: \n" << underivedLattice << std::endl;
 
         // Precondition 3: Every result iterator must have insert capability
         for (Iterator iterator : underivedLattice.results()) {
@@ -721,6 +741,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
             // build consumer that writes from temporary to output, mark consumer as parallel reduction
             ParallelUnit reductionUnit = ParallelUnit::CPUThreadGroupReduction;
             if (should_use_CUDA_codegen()) {
+              std::cout << "should_use_CUDA_codegen() true\n";
               if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
                 reductionUnit = ParallelUnit::GPUWarpReduction;
               }
@@ -728,6 +749,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
                 reductionUnit = ParallelUnit::GPUBlockReduction;
               }
             }
+            else {
+              std::cout << "should_use_CUDA_codegen() false\n";
+            }
             IndexStmt consumer = forall(i, Assignment(assignment->lhs, w(i), assignment->op), reductionUnit, OutputRaceStrategy::ParallelReduction);
             precomputed_stmt = where(consumer, producer);
           }
@@ -746,8 +770,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
           return;
         }
 
-
+        std::cout << "updated stmt: \n";
         stmt = forall(i, foralli.getStmt(), parallelize.getParallelUnit(), parallelize.getOutputRaceStrategy(), foralli.getUnrollFactor());
+        std::cout << stmt << std::endl;
         return;
       }
 
@@ -1181,6 +1206,7 @@ std::ostream& operator<<(std::ostream& os,
 
 IndexStmt parallelizeOuterLoop(IndexStmt stmt) {
   // get outer ForAll
+  std::cout << "get outer ForAll ----------------- \n";
   Forall forall;
   bool matched = false;
   match(stmt,
@@ -1215,7 +1241,19 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt) {
     }
     return parallelized256;
   }
+  else if (should_use_ISPC_codegen()) {
+    std::cout << "outer loop parallelization for ISPC codegen\n";
+    // IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces).apply(stmt, &reason);
+    // if (parallelized == IndexStmt()) {
+    //   // can't parallelize
+    //   return stmt;
+    // }
+    // return parallelized;
+
+    return stmt;
+  }
   else {
+    std::cout << "outer loop parallelization for CPU codgen index statement\n";
     IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces).apply(stmt, &reason);
     if (parallelized == IndexStmt()) {
       // can't parallelize
@@ -1274,6 +1312,7 @@ static vector<IndexVar>
 topologicallySort(map<IndexVar,set<IndexVar>> hardDeps,
                   map<IndexVar,multiset<IndexVar>> softDeps,
                   vector<IndexVar> originalOrder) {
+  std::cout << "originalOrder: " << std::endl;
   vector<IndexVar> sortedVars;
   unsigned long countVars = originalOrder.size();
   while (sortedVars.size() < countVars) {
@@ -1295,6 +1334,9 @@ topologicallySort(map<IndexVar,set<IndexVar>> hardDeps,
     }
 
     // No free var found there is a cycle
+    std::cout << "this is where the assert fails\n";
+    std::cout << "freeVarPos: " << freeVarPos << std::endl;
+    std::cout << "limit: " << std::numeric_limits<size_t>::max() << std::endl;
     taco_iassert(freeVarPos != std::numeric_limits<size_t>::max())
         << "Cycles in iteration graphs must be resolved, through transpose, "
         << "before the expression is passed to the topological sorting "
@@ -1320,8 +1362,674 @@ topologicallySort(map<IndexVar,set<IndexVar>> hardDeps,
   return sortedVars;
 }
 
+// Tests whether the last entry of tensorPaths can be split off ("fissioned")
+// from the back of the expression: the last variable of the result path must
+// occur in no tensor path other than the last one, and must also be the last
+// variable of that last path.
+//
+// Returns false, leaving the output parameters untouched, if that is not the
+// case. Otherwise returns true after updating the output parameters:
+//   removedAccessNode             set to the name of the last path's tensor variable
+//   producerVars                  += sortedAllIndexes without the last result variable
+//   modifiedResultIndexesAccessed += the producer variables found in the result path
+//                                    or in the last tensor path
+//   consumerVars                  += the variables of sortedAllIndexes found in
+//                                    modifiedResultIndexesAccessed or in the last path
+bool checkFromBack(const TensorPath& resultTensorPath,
+                  const vector<TensorPath>& tensorPaths,
+                  string& removedAccessNode,
+                  vector<IndexVar>& producerVars,
+                  vector<IndexVar>& consumerVars,
+                  vector<IndexVar>& modifiedResultIndexesAccessed,
+                  vector<IndexVar>& sortedAllIndexes) {
+
+  std::cout << "check from back function execution\n";
+
+  const std::vector<IndexVar>& resultIndexesVisited = resultTensorPath.getVariables();
+  // back() on an empty vector is undefined and tensorPaths.size()-1 would wrap
+  // around for an empty vector, so there is nothing to split off in these cases.
+  if (resultIndexesVisited.empty() || tensorPaths.empty()) {
+    return false;
+  }
+  IndexVar lastVisitedIndexVar = resultIndexesVisited.back();
+
+  std::cout << "last visited index variable: " << lastVisitedIndexVar << std::endl;
+
+  bool onlyLastTensorContainLastIndexOfOutput = true;
+  bool fissionFromBack = false;
+
+  // Does any tensor path other than the last one also use the last result variable?
+  for (unsigned long i=0; i+1<tensorPaths.size(); i++) { // change tensor paths to recursively use the functionality
+    const TensorPath& otherIndexPaths = tensorPaths.at(i);
+    const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+    cout << "index paths: " << otherIndexPaths << endl;
+
+    for (const auto& index : indexesVisited) {
+      cout << "checking " << index << " " << lastVisitedIndexVar << endl;
+      if (index == lastVisitedIndexVar) {
+        onlyLastTensorContainLastIndexOfOutput = false;
+      }
+    }
+  }
+
+  if (onlyLastTensorContainLastIndexOfOutput) { // last accessed tensorVariable
+    const TensorPath& otherIndexPaths = tensorPaths.back();
+    const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+    cout << "index paths: " << otherIndexPaths << endl;
+
+    cout << "index variable maybe removed from the back\n";
+    // An empty last path cannot end in the last result variable (and back() would be undefined).
+    if (indexesVisited.empty()) {
+      return false;
+    }
+    auto lastTensorLastVisited = indexesVisited.back();
+    cout << "last index last visited " << lastTensorLastVisited << endl;
+
+    if (lastTensorLastVisited == lastVisitedIndexVar) {
+      cout << "we can diffuse from the back\n";
+      fissionFromBack = true;
+      removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName();
+      cout << "removed access node " << removedAccessNode << endl;
+
+      // mark producer accessed index variables
+      for (const auto& indexVar : sortedAllIndexes) {
+        if (indexVar != lastVisitedIndexVar) { // add everything except the last accessed index
+          std::cout << "producer vars: " << indexVar << std::endl;
+          producerVars.push_back(indexVar);
+        }
+      }
+
+      // Index variables of the intermediate result: every variable except the
+      // last result variable that occurs in the result path or in the last path.
+      for (const auto& indexVar : sortedAllIndexes) {
+        if (indexVar != lastVisitedIndexVar) {
+          if (
+            find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar)
+              != resultIndexesVisited.end() ||
+            find(indexesVisited.begin(), indexesVisited.end(), indexVar)
+              != indexesVisited.end()
+          ) {
+            modifiedResultIndexesAccessed.push_back(indexVar);
+          }
+        }
+      }
+
+      std::cout << "printing modifiedResultIndexesAccessed\n";
+      for (auto& idx : modifiedResultIndexesAccessed) {
+        std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl;
+      }
+      std::cout << "printed modifiedResultIndexesAccessed\n";
+
+      // mark consumer accessed index variables
+      for (const auto& indexVar : sortedAllIndexes) {
+        if (
+          find(modifiedResultIndexesAccessed.begin(), modifiedResultIndexesAccessed.end(), indexVar)
+            != modifiedResultIndexesAccessed.end() ||
+          find(indexesVisited.begin(), indexesVisited.end(), indexVar)
+            != indexesVisited.end()
+        ) {
+          std::cout << "consumer var: " << indexVar << std::endl;
+          consumerVars.emplace_back(indexVar);
+        }
+      }
+
+    }
+  }
+
+  return fissionFromBack;
+}
+
+// Tests whether the first entry of tensorPaths can be split off ("fissioned")
+// from the front of the expression: the first variable of the result path must
+// occur in no tensor path other than the first one, and must also be the first
+// variable of that first path.
+//
+// Returns true if so, after storing the name of the first path's tensor variable
+// in removedAccessNode and appending to producerVars, consumerVars and
+// modifiedResultIndexesAccessed; returns false without touching them otherwise.
+bool checkFromFront(const TensorPath& resultTensorPath,
+                  const vector<TensorPath>& tensorPaths,
+                  string& removedAccessNode,
+                  vector<IndexVar>& producerVars,
+                  vector<IndexVar>& consumerVars,
+                  vector<IndexVar>& modifiedResultIndexesAccessed,
+                  vector<IndexVar>& sortedAllIndexes) {
+
+  std::cout << "check from front function execution\n";
+
+  const std::vector<IndexVar>& resultIndexesVisited = resultTensorPath.getVariables();
+  // front() on an empty vector is undefined behavior, so without a result
+  // variable or without any tensor path there is nothing to split off.
+  if (resultIndexesVisited.empty() || tensorPaths.empty()) {
+    return false;
+  }
+  IndexVar firstVisitedIndexVar = resultIndexesVisited.front();
+
+  std::cout << "first fisited index variable: " << firstVisitedIndexVar << std::endl;
+  std::cout << "tensor path size: " << tensorPaths.size() << std::endl;
+
+  bool onlyFirstTensorContainFirstIndexOfOutput = true;
+  bool fissionFromFront = false;
+
+  // Does any tensor path other than the first one also use the first result variable?
+  for (long i=tensorPaths.size()-1; i>0; i--) { // change tensor paths to recursively use the functionality
+    std::cout << "i: " << i << std::endl;
+    const TensorPath& otherIndexPaths = tensorPaths.at(i);
+    const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+    cout << "index paths: " << otherIndexPaths << endl;
+
+    for (const auto& index : indexesVisited) {
+      cout << "checking " << index << " " << firstVisitedIndexVar << endl;
+      if (index == firstVisitedIndexVar) {
+        onlyFirstTensorContainFirstIndexOfOutput = false;
+      }
+    }
+  }
+
+  if (onlyFirstTensorContainFirstIndexOfOutput) { // first accessed tensorVariable
+    const TensorPath& otherIndexPaths = tensorPaths.front();
+    const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+    cout << "index paths: " << otherIndexPaths << endl;
+
+    cout << "index variable maybe removed from the front\n";
+    // An empty first path cannot start with the first result variable (and front() would be undefined).
+    if (indexesVisited.empty()) {
+      return false;
+    }
+    auto firstTensorFirstVisited = indexesVisited.front();
+    cout << "first index first visited " << firstTensorFirstVisited << endl;
+
+    if (firstTensorFirstVisited == firstVisitedIndexVar) {
+      cout << "we can diffuse from the front\n";
+      fissionFromFront = true;
+      removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName();
+      cout << "removed access node " << removedAccessNode << endl;
+
+      // mark producer accessed index variables
+      for (const auto& indexVar : sortedAllIndexes) {
+        if (indexVar != firstVisitedIndexVar) { // add everything except the first accessed index
+          producerVars.emplace_back(indexVar);
+        }
+      }
+
+      // Index variables of the intermediate result: every variable except the
+      // first result variable that occurs in the result path or in the first path.
+      for (const auto& indexVar : sortedAllIndexes) {
+        if (indexVar != firstVisitedIndexVar) {
+          if (
+            find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar)
+              != resultIndexesVisited.end() ||
+            find(indexesVisited.begin(), indexesVisited.end(), indexVar)
+              != indexesVisited.end()
+          ) {
+            modifiedResultIndexesAccessed.push_back(indexVar);
+          }
+        }
+      }
+
+      std::cout << "printing modifiedResultIndexesAccessed\n";
+      for (auto& idx : modifiedResultIndexesAccessed) {
+        std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl;
+      }
+      std::cout << "printed modifiedResultIndexesAccessed\n";
+
+      // mark consumer accessed index variables
+      for (const auto& indexVar : sortedAllIndexes) {
+        if (
+        find(modifiedResultIndexesAccessed.begin(), modifiedResultIndexesAccessed.end(), indexVar)
+          != modifiedResultIndexesAccessed.end() ||
+        find(indexesVisited.begin(), indexesVisited.end(), indexVar) != indexesVisited.end()) {
+          consumerVars.emplace_back(indexVar);
+        }
+      }
+
+    }
+  } else {
+    std::cout << "fission from the front is not possible\n";
+  }
+
+  return fissionFromFront;
+}
+
+
+// Assumption: the user specifies the removable index node and whether the
+// removable expression is taken from the front or from the end.
+
+// Rewrites stmt so that one tensor access is split off the right-hand side of
+// assignment: a producer loop nest computes the product of the remaining
+// accesses into a workspace tensor named "t" + <result name>, and a consumer
+// loop nest combines the workspace with the split-off access into the original
+// left-hand side. The leading loops shared by both nests stay fused around
+// where(consumer, producer).
+//
+// stmt        loop nest to rewrite.
+// assignment  assignment to split. NOTE(review): its accesses are recombined
+//             with `*` only, so presumably the right-hand side has to be a
+//             pure product of accesses -- confirm with the callers.
+// side        "b" splits off the last tensor path (see checkFromBack), "f" the
+//             first one (see checkFromFront); any other value disables the
+//             rewrite.
+// iters       recursion budget: nothing is done for iters < 1, and the producer
+//             is split again with iters-1 when it still contains more than
+//             one multiplication.
+//
+// Returns the rewritten statement, or stmt unchanged if no split is possible.
+IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment,
+  std::string side, int iters) {
+  std::cout << "executing travese operation written by me\n";
+
+  // The recursion budget is used up: leave the statement as it is.
+  if (iters < 1) {
+    return stmt;
+  }
+
+  // Collects the forall index variables in the order the loops are visited,
+  // together with the parallel unit and output race strategy of every loop,
+  // and remembers the assignment found directly inside a forall.
+  struct SortedIndexVars : public IndexNotationVisitor {
+    using IndexNotationVisitor::visit;
+    map <IndexVar, ParallelUnit> forallParallelUnit;
+    map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy;
+    vector<IndexVar> sortedIndexes;
+    Assignment innerBody;
+
+    SortedIndexVars() {};
+
+    void visit(const ForallNode* node) {
+      Forall forallNode(node);
+      IndexVar i = forallNode.getIndexVar();
+      std::cout << forallNode << std::endl;
+
+      sortedIndexes.push_back(i);
+      forallParallelUnit[i] = forallNode.getParallelUnit();
+      forallOutputRaceStrategy[i] = forallNode.getOutputRaceStrategy();
+
+      if (isa<Assignment>(forallNode.getStmt())) {
+        cout << "assignment node found: " << forallNode.getStmt() << endl;
+        innerBody = to<Assignment>(forallNode.getStmt());
+        return; // stop descending once the assignment has been reached
+      }
+
+      IndexNotationVisitor::visit(node);
+    }
+  };
+
+  std::cout << "traversing through the index statement\n";
+  SortedIndexVars sortedIndexVars;
+  stmt.accept(&sortedIndexVars);
+  std::cout << std::endl;
+
+  // Records every visited access in visiting order and, per index variable, the
+  // dimension it indexes and the type of the tensor, once for every occurrence.
+  struct IndexExprBuilder : public IndexNotationVisitor {
+    using IndexNotationVisitor::visit;
+    vector<Access> accessLeftToRight;
+    map<IndexVar, vector<pair<Dimension,Type>>> indexDimensionsMap;
+
+    void visit(const AccessNode* node) {
+      Access accessNode(node);
+      std::cout << "access node: " << accessNode << std::endl;
+      accessLeftToRight.push_back(accessNode);
+
+      TensorVar tensorVar = accessNode.getTensorVar();
+
+      for (unsigned long i=0; i < accessNode.getIndexVars().size(); i++) {
+        auto var = accessNode.getIndexVars()[i];
+
+        // operator[] creates the (empty) entry on first use, so no lookup is needed.
+        indexDimensionsMap[var].emplace_back(
+          tensorVar.getType().getShape().getDimension(i), tensorVar.getType());
+      }
+    }
+  };
+
+  // Collect the accesses and index dimensions of the right-hand side.
+  IndexExpr rhsExpr = assignment.getRhs();
+  Access lhsAccess = to<Access>(assignment.getLhs());
+  std::cout << "right hand side expression: " << rhsExpr << std::endl;
+  IndexExprBuilder indexExprBuilder;
+  rhsExpr.accept(&indexExprBuilder);
+  TensorVar resultVar = lhsAccess.getTensorVar();
+
+  // Debug output: dimensions and types recorded for every index variable.
+  for (const auto& item : indexExprBuilder.indexDimensionsMap) {
+    const auto& indexVar = item.first;
+    cout << "var: " << indexVar << " ";
+    for (const auto& elem : item.second) {
+      cout << elem.first << " " << elem.second << " " ;
+    }
+    cout << endl;
+  }
+
+  // Build the iteration graph of the assignment.
+  IterationGraph iterationGraph = IterationGraph::make(assignment);
+  std::cout << "/*******************************************/\n";
+  std::cout << "/********** ITERATION GRAPH ****************/\n";
+  std::cout << "/*******************************************/\n";
+  std::cout << iterationGraph << std::endl;
+
+  const TensorPath& resultTensorPath = iterationGraph.getResultTensorPath();
+  const std::vector<TensorPath>& tensorPaths = iterationGraph.getTensorPaths();
+
+  // Results of the fission check below.
+  string removedAccessNode;
+  vector<IndexVar> producerVars; // producer accessed index variables
+  vector<IndexVar> consumerVars; // consumer accessed index variables
+  vector<IndexVar> fusedVars;
+  vector<IndexVar> modifiedResultIndexesAccessed;
+  // side selects the end to split at: "b" is the back and "f" the front. Any
+  // other value selects neither, so the statement is returned unchanged below.
+  bool fissionFromBack = (side == "b");
+  if (fissionFromBack) {
+    fissionFromBack = checkFromBack(resultTensorPath, tensorPaths,
+      removedAccessNode, producerVars, consumerVars,
+      modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes
+    );
+  }
+
+  bool fissionFromFront = (side == "f");
+  if (fissionFromFront) {
+    fissionFromFront = checkFromFront(resultTensorPath, tensorPaths,
+      removedAccessNode, producerVars, consumerVars,
+      modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes
+    );
+  }
+
+  // Neither end can be split off: hand the statement back unchanged.
+  if (!fissionFromBack && !fissionFromFront) {
+    cout << "fission operation cannot be performed from the requested side\n";
+    return stmt;
+  }
+
+  // Dimensions of the intermediate result, one per index variable in
+  // modifiedResultIndexesAccessed.
+  vector<Dimension> newAccessDims{};
+  for (const auto& var : modifiedResultIndexesAccessed) {
+    // operator[] would insert an empty entry for an unknown variable and reading
+    // its first element would be undefined, so give up on the rewrite instead.
+    const auto item = indexExprBuilder.indexDimensionsMap.find(var);
+    if (item == indexExprBuilder.indexDimensionsMap.end() || item->second.empty()) {
+      return stmt;
+    }
+    cout << "shared vars: " << var << endl;
+    newAccessDims.emplace_back(item->second[0].first);
+  }
+  // NOTE(review): newAccessVar and newResultAccess are only used for the debug
+  // output here and for producerAssignment below, which is itself only printed.
+  TensorVar newAccessVar(resultVar.getName() + "_inner",
+              Type(resultVar.getType().getDataType(), newAccessDims));
+  cout << "new inner assignment statement: " << modifiedResultIndexesAccessed << std::endl;
+  Access newResultAccess(newAccessVar, modifiedResultIndexesAccessed);
+  cout << "new access variable for iterative apply: " << newResultAccess << std::endl;
+
+  if (fissionFromBack) {
+    std::cout << "fission from the back is possible\n";
+  }
+  if (fissionFromFront) {
+    std::cout << "fission from the front is possible\n";
+  }
+
+  cout << "\n\nProducer accessed index variables\n";
+  for (const auto& var : producerVars) {
+    cout << var << endl;
+  }
+  cout << "\n\nConsumer accessed index variables\n";
+  for (const auto& var : consumerVars) {
+    cout << var << endl;
+  }
+  cout << endl << endl;
+
+  // The fused variables are the leading loops used by both producer and consumer.
+  for (const auto& var : sortedIndexVars.sortedIndexes) {
+    if (find(producerVars.begin(), producerVars.end(), var) != producerVars.end() &&
+    find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()) {
+      fusedVars.emplace_back(var);
+    }
+    else {
+      break;
+    }
+  }
+
+  for (auto& fv : fusedVars) {
+    std::cout << "fusable vars: " << fv << std::endl;
+  }
+
+  // Shared variables are used by both the producer and the consumer but are
+  // not fused; they become the index variables of the workspace.
+  vector<IndexVar> sharedVars;
+  for (const auto& var : sortedIndexVars.sortedIndexes) {
+    if (find(fusedVars.begin(), fusedVars.end(), var) == fusedVars.end() &&
+      find(producerVars.begin(), producerVars.end(), var) != producerVars.end() &&
+      find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()
+    ) {
+      sharedVars.emplace_back(var);
+    }
+  }
+
+  for (auto& sv : sharedVars) {
+    std::cout << "shared vars: " << sv << std::endl;
+  }
+
+  // Dimensions of the workspace, one per shared variable.
+  vector<Dimension> sharedDims{};
+  for (const auto& var : sharedVars) {
+    // Never read the first dimension of a variable that was not recorded.
+    const auto item = indexExprBuilder.indexDimensionsMap.find(var);
+    if (item == indexExprBuilder.indexDimensionsMap.end() || item->second.empty()) {
+      return stmt;
+    }
+    cout << "shared vars: " << var << endl;
+    sharedDims.emplace_back(item->second[0].first);
+  }
+
+  // NOTE(review): intermediateTensor (same type and format as the result) is
+  // only printed; it does not take part in the rewrite.
+  const Type& type = resultTensorPath.getAccess().getTensorVar().getType();
+  const Format& format = resultTensorPath.getAccess().getTensorVar().getFormat();
+  TensorVar intermediateTensor("ws", type, format);
+  cout << intermediateTensor << endl;
+
+  // Workspace tensor "t" + <result name> over the shared dimensions.
+  TensorVar tempVar("t" + resultVar.getName(),
+                Type(resultVar.getType().getDataType(), sharedDims));
+  cout << "tensor order: " << tempVar.getOrder() << endl;
+  cout << "tensor format: " << tempVar.getFormat() << endl;
+  cout << "format order: " << tempVar.getFormat().getOrder() << endl;
+
+  // Access to the workspace, indexed by the shared variables.
+  Access workspace(tempVar, sharedVars);
+  std::cout << "workspace access tensor: " << workspace << std::endl;
+
+  // construct producer expression right hand side
+  cout << "generating producer expression\n";
+  IndexExpr producerExpr;
+  int num_muls = 0;
+  for (const Access& accessNode : indexExprBuilder.accessLeftToRight) {
+    std::cout << "accessNodes: " << accessNode << endl;
+    if (removedAccessNode != accessNode.getTensorVar().getName()) {
+      if (!producerExpr.defined()) {
+        std::cout << "index expression is null";
+        producerExpr = accessNode;
+        std::cout << "producerExpr: " << producerExpr << std::endl;
+      } else {
+        num_muls++;
+        producerExpr = producerExpr * accessNode;
+        std::cout << "producerExpr: " << producerExpr << std::endl;
+      }
+    }
+  }
+  std::cout << producerExpr << std::endl;
+  // Only printed for debugging.
+  Assignment producerAssignment(newResultAccess,
+    producerExpr);
+  std::cout << "new inner assignment statement: " << producerAssignment << std::endl;
+  // The producer writes the workspace, using the operator of the innermost
+  // assignment found in stmt.
+  Assignment producerInnerBody(workspace,
+    producerExpr,
+    sortedIndexVars.innerBody.getOperator()
+  );
+  std::cout << "producerInnerBody: " << producerInnerBody << std::endl;
+
+  // construct consumer expression right hand side
+  // The workspace is the leftmost factor when splitting from the back and the
+  // rightmost factor when splitting from the front.
+  IndexExpr consumerExpr;
+  if (fissionFromBack) {
+    consumerExpr = workspace;
+  }
+  cout << "generating consumer expression: " << consumerExpr << std::endl;
+  for (const Access& accessNode : indexExprBuilder.accessLeftToRight) {
+    std::cout << "accessNodes: " << accessNode << endl;
+    if (removedAccessNode == accessNode.getTensorVar().getName()) {
+      if (!consumerExpr.defined()) {
+        std::cout << "index expression is null";
+        consumerExpr = accessNode;
+        std::cout << "consumerExpr: " << consumerExpr << std::endl;
+      } else {
+        consumerExpr = consumerExpr * accessNode;
+        std::cout << "consumerExpr: " << consumerExpr << std::endl;
+      }
+    }
+  }
+  if (fissionFromFront) {
+    consumerExpr = consumerExpr * workspace;
+  }
+  Assignment consumerInnerBody(lhsAccess,
+    consumerExpr,
+    sortedIndexVars.innerBody.getOperator()
+  );
+
+  cout << "Producer inner body: " << producerInnerBody << endl;
+  cout << "Consumer inner body: " << consumerInnerBody << endl;
+
+  // Rewrites a loop nest for one side of the split: every assignment is
+  // replaced by innerBody, and a forall is kept only if its variable is in
+  // producerConsumerVars and not in fusedVars.
+  struct ProducerConsumerRewriter : public IndexNotationRewriter {
+    using IndexNotationRewriter::visit;
+
+    const vector<IndexVar>& producerConsumerVars;
+    const vector<IndexVar>& fusedVars;
+    IndexStmt innerBody;
+    const map <IndexVar, ParallelUnit> forallParallelUnit;
+    const map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy;
+
+    ProducerConsumerRewriter(const vector<IndexVar>& producerConsumerVars,
+                    const vector<IndexVar>& fusedVars, IndexStmt innerBody,
+                    const map <IndexVar, ParallelUnit>& forallParallelUnit,
+                    const map <IndexVar, OutputRaceStrategy>& forallOutputRaceStrategy)
+        : producerConsumerVars(producerConsumerVars), fusedVars(fusedVars), innerBody(innerBody),
+        forallParallelUnit(forallParallelUnit), forallOutputRaceStrategy(forallOutputRaceStrategy)  {
+    }
+
+    void visit(const ForallNode* node) {
+      Forall foralli(node);
+      IndexVar i = foralli.getIndexVar();
+      cout << "going through var: " << i << endl;
+
+      // Rewrite the loop body first so that stmt holds the rewritten body.
+      stmt = rewrite(foralli.getStmt());
+      cout << "after rewrite statement: " << stmt << endl;
+
+      // omit the index variables in the fusedVar list
+      if (find(fusedVars.begin(), fusedVars.end(), i) == fusedVars.end() &&
+          find(producerConsumerVars.begin(), producerConsumerVars.end(), i) != producerConsumerVars.end()) {
+        stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor());
+      }
+    }
+
+    // Every assignment is replaced by innerBody.
+    void visit (const AssignmentNode* node) {
+      cout << "assignment node: " << node << endl;
+      stmt = innerBody;
+      cout << "producerStmt: " << innerBody << endl;
+      cout << "stmt: " << stmt << endl;
+    }
+
+  };
+  // Producer loop nest: the non-fused producer loops around producerInnerBody.
+  ProducerConsumerRewriter producerRewriter(producerVars, fusedVars,
+              producerInnerBody,
+              sortedIndexVars.forallParallelUnit,
+              sortedIndexVars.forallOutputRaceStrategy);
+  IndexStmt producerStmt = producerRewriter.rewrite(stmt);
+  std::cout << "\nAfter Producer rewriter\n";
+  std::cout << producerStmt << std::endl;
+  // Split the producer again when it contains more than one multiplication.
+  if (num_muls > 1) {
+    producerStmt = loopFusionOverFission(producerStmt, producerInnerBody,
+      side, iters-1);
+  }
+
+  // Consumer loop nest: the non-fused consumer loops around consumerInnerBody.
+  ProducerConsumerRewriter consumerRewriter(consumerVars, fusedVars,
+              consumerInnerBody,
+              sortedIndexVars.forallParallelUnit,
+              sortedIndexVars.forallOutputRaceStrategy);
+  IndexStmt consumerStmt = consumerRewriter.rewrite(stmt);
+  std::cout << "\nAfter Consumer rewriter\n";
+  std::cout << consumerStmt << std::endl;
+
+  // Keeps the foralls over the fused variables and replaces the first forall
+  // over a non-fused variable, together with everything below it, by
+  // where(consumerStmt, producerStmt).
+  struct CombineProducerConsumerRewriter : public IndexNotationRewriter {
+    const vector<IndexVar>& fusedVars;
+    IndexStmt consumerStmt;
+    IndexStmt producerStmt;
+    const map <IndexVar, ParallelUnit> forallParallelUnit;
+    const map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy;
+
+    CombineProducerConsumerRewriter(const vector<IndexVar>& fusedVars,
+      IndexStmt producerStmt, IndexStmt consumerStmt,
+      const map <IndexVar, ParallelUnit>& forallParallelUnit,
+      const map <IndexVar, OutputRaceStrategy>& forallOutputRaceStrategy)
+      : fusedVars(fusedVars), consumerStmt(consumerStmt), producerStmt(producerStmt),
+      forallParallelUnit(forallParallelUnit),
+      forallOutputRaceStrategy(forallOutputRaceStrategy) {}
+
+    using IndexNotationRewriter::visit;
+
+    void visit(const ForallNode* node) {
+      Forall foralli(node);
+      IndexVar i = foralli.getIndexVar();
+      cout << "going through var: " << i << endl;
+
+      // keep the loop over a fused variable around the rewritten body
+      if (find(fusedVars.begin(), fusedVars.end(), i) != fusedVars.end()) {
+        cout << "fused var in stmt\n";
+        stmt = rewrite(foralli.getStmt());
+        cout << "rewritten stmt: " << stmt << endl;
+        stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor());
+      }
+      else {
+        cout << "fused var not in  stmt\n";
+        cout << "producerStmt: " << producerStmt << endl;
+        cout << "consumerStmt: " << consumerStmt << endl;
+        stmt = where(consumerStmt, producerStmt);
+        cout << "where stmt: " << stmt << endl;
+      }
+
+      cout << "after rewrite statement: " << stmt << endl;
+    }
+
+  };
+
+  CombineProducerConsumerRewriter combineRewriter(fusedVars,
+              producerStmt, consumerStmt,
+              sortedIndexVars.forallParallelUnit,
+              sortedIndexVars.forallOutputRaceStrategy);
+  IndexStmt combinedStmt = combineRewriter.rewrite(stmt);
+  std::cout << "\nAfter Combine rewriter\n";
+  std::cout << combinedStmt << std::endl;
+
+  return combinedStmt;
+}
+
 
 IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
+  std::cout << "executing reorderLoopsTopologically\n";
   // Collect tensorLevelVars which stores the pairs of IndexVar and tensor
   // level that each tensor is accessed at
   struct DAGBuilder : public IndexNotationVisitor {
@@ -1382,8 +2090,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
   };
 
   Iterators iterators(stmt);
+  std::cout << "DAG builder with iterators" << std::endl;
   DAGBuilder dagBuilder(iterators);
   stmt.accept(&dagBuilder);
+  std::cout << "After DAGBuilder\n";
+  std::cout << stmt << std::endl;
 
   // Construct tensor dependencies (sorted list of IndexVars) from tensorLevelVars
   map<string, vector<pair<IndexVar, bool>>> tensorVarOrders;
@@ -1391,6 +2102,7 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
     tensorVarOrders[tensorLevelVar.first] = 
         varOrderFromTensorLevels(tensorLevelVar.second);
   }
+  // hard dependencies
   const auto hardDeps = depsFromVarOrders(tensorVarOrders);
 
   struct CollectSoftDependencies : public IndexNotationVisitor {
@@ -1412,12 +2124,17 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
       }
     }
   };
+  // soft dependencies
   CollectSoftDependencies collectSoftDeps;
   stmt.accept(&collectSoftDeps);
+  std::cout << "After CollectSoftDependencies\n";
+  std::cout << stmt << std::endl;
 
+  // topological sort
   const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, 
                                             dagBuilder.indexVarOriginalOrder);
 
+  // rewrite indexstmt
   // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall
   struct TopoReorderRewriter : public IndexNotationRewriter {
     using IndexNotationRewriter::visit;
@@ -1440,7 +2157,9 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
 
       // first forall must be in collected variables
       taco_iassert(util::contains(sortedVars, i));
+      std::cout << "\ninner body of the statement\n" << innerBody;
       stmt = innerBody;
+      // done in reverse order?
       for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) {
         stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor());
       }
@@ -1450,7 +2169,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
   };
   TopoReorderRewriter rewriter(sortedVars, dagBuilder.innerBody, 
                                dagBuilder.forallParallelUnit, dagBuilder.forallOutputRaceStrategy);
-  return rewriter.rewrite(stmt);
+  IndexStmt stmtChanged = rewriter.rewrite(stmt);
+  std::cout << "After TopoReorderRewriter\n";
+  std::cout << stmtChanged << std::endl;
+
+  return stmtChanged;
 }
 
 IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, 
@@ -1478,6 +2201,7 @@ IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph,
 
     void visit(const ForallNode* node) {
       Forall foralli(node);
+      std::cout << "scalar promote: " << foralli << std::endl;
       IndexVar i = foralli.getIndexVar();
 
       // Don't allow hoisting out of forall's for GPU warp and block reduction
diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp
index a1997a9b7..eddca3f29 100644
--- a/src/ir/ir_printer.cpp
+++ b/src/ir/ir_printer.cpp
@@ -1,6 +1,7 @@
 #include <sstream>
 #include <iostream>
 
+#include "taco/cuda.h"
 #include "taco/ir/ir.h"
 #include "taco/ir/ir_printer.h"
 #include "taco/ir/simplify.h"
@@ -34,7 +35,11 @@ IRPrinter::IRPrinter(ostream &s) : IRPrinter(s, false, false) {
 }
 
 IRPrinter::IRPrinter(ostream &s, bool color, bool simplify)
-    : stream(s), indent(0), color(color), simplify(simplify) {
+    : stream(s), stream2(s), indent(0), color(color), simplify(simplify) { // stream2 is initialized from the same stream s as stream
+}
+
+IRPrinter::IRPrinter(ostream &s, ostream &s2, bool color, bool simplify)
+    : stream(s), stream2(s2), indent(0), color(color), simplify(simplify) { // s2 becomes stream2, which visit(const Literal*) writes to when is_ISPC_code_stream_enabled()
 }
 
 IRPrinter::~IRPrinter() {
@@ -59,79 +64,169 @@ void IRPrinter::print(Stmt stmt) {
 }
 
 void IRPrinter::visit(const Literal* op) {
-  if (color) {
-    stream << blue ;
-  }
-
-  switch (op->type.getKind()) {
-    case Datatype::Bool:
-      stream << op->getValue<bool>();
-    break;
-    case Datatype::UInt8:
-      stream << static_cast<uint16_t>(op->getValue<uint8_t>());
-    break;
-    case Datatype::UInt16:
-      stream << op->getValue<uint16_t>();
-    break;
-    case Datatype::UInt32:
-      stream << op->getValue<uint32_t>();
-    break;
-    case Datatype::UInt64:
-      stream << op->getValue<uint64_t>();
-    break;
-    case Datatype::UInt128:
-      taco_not_supported_yet;
-    break;
-    case Datatype::Int8:
-      stream << static_cast<int16_t>(op->getValue<int8_t>());
-    break;
-    case Datatype::Int16:
-      stream << op->getValue<int16_t>();
-    break;
-    case Datatype::Int32:
-      stream << op->getValue<int32_t>();
-    break;
-    case Datatype::Int64:
-      stream << op->getValue<int64_t>();
-    break;
-    case Datatype::Int128:
-      taco_not_supported_yet;
-    break;
-    case Datatype::Float32:
-      stream << ((op->getValue<float>() != 0.0)
-                 ? util::toString(op->getValue<float>()) : "0.0");
-    break;
-    case Datatype::Float64:
-      stream << ((op->getValue<double>()!=0.0)
-                 ? util::toString(op->getValue<double>()) : "0.0");
-    break;
-    case Datatype::Complex64: {
-      std::complex<float> val = op->getValue<std::complex<float>>();
-      stream << val.real() << " + I*" << val.imag();
-    }
-    break;
-    case Datatype::Complex128: {
-      std::complex<double> val = op->getValue<std::complex<double>>();
-      stream << val.real() << " + I*" << val.imag();
+  if (is_ISPC_code_stream_enabled()) {
+    if (color) {
+        stream2 << blue ;
+      }
+
+      // It seems this is where all the types get printed in the final code generation.
+      // Come up with a way to generate different values if stream2 is used to generate ispc code
+      switch (op->type.getKind()) {
+        case Datatype::Bool:
+          stream2 << op->getValue<bool>();
+        break;
+        case Datatype::UInt8:
+          stream2 << static_cast<uint16_t>(op->getValue<uint8_t>());
+        break;
+        case Datatype::UInt16:
+          stream2 << op->getValue<uint16_t>();
+        break;
+        case Datatype::UInt32:
+          stream2 << op->getValue<uint32_t>();
+        break;
+        case Datatype::UInt64:
+          stream2 << op->getValue<uint64_t>();
+        break;
+        case Datatype::UInt128:
+          taco_not_supported_yet;
+        break;
+        case Datatype::Int8:
+          stream2 << static_cast<int16_t>(op->getValue<int8_t>());
+        break;
+        case Datatype::Int16:
+          stream2 << op->getValue<int16_t>();
+        break;
+        case Datatype::Int32:
+          stream2 << op->getValue<int32_t>();
+        break;
+        case Datatype::Int64:
+          stream2 << op->getValue<int64_t>();
+        break;
+        case Datatype::Int128:
+          taco_not_supported_yet;
+        break;
+        case Datatype::Float32:
+          stream2 << ((op->getValue<float>() != 0.0)
+                    ? util::toString(op->getValue<float>()) : "0.0");
+        break;
+        case Datatype::Float64:
+          stream2 << ((op->getValue<double>()!=0.0)
+                    ? util::toString(op->getValue<double>()) : "0.0");
+        break;
+        case Datatype::Complex64: {
+          std::complex<float> val = op->getValue<std::complex<float>>();
+          stream2 << val.real() << " + I*" << val.imag();
+        }
+        break;
+        case Datatype::Complex128: {
+          std::complex<double> val = op->getValue<std::complex<double>>();
+          stream2 << val.real() << " + I*" << val.imag();
+        }
+        break;
+        case Datatype::Undefined:
+          taco_ierror << "Undefined type in IR";
+        break;
+      }
+
+      if (color) {
+        stream2 << nc;
+      }
     }
-    break;
-    case Datatype::Undefined:
-      taco_ierror << "Undefined type in IR";
-    break;
-  }
 
-  if (color) {
-    stream << nc;
+
+
+  else {
+
+    if (color) {
+        stream << blue ;
+      }
+
+      // It seems this is where all the types get printed in the final code generation.
+      // Come up with a way to generate different values if stream2 is used to generate ispc code
+      switch (op->type.getKind()) {
+        case Datatype::Bool:
+          stream << op->getValue<bool>();
+        break;
+        case Datatype::UInt8:
+          stream << static_cast<uint16_t>(op->getValue<uint8_t>());
+        break;
+        case Datatype::UInt16:
+          stream << op->getValue<uint16_t>();
+        break;
+        case Datatype::UInt32:
+          stream << op->getValue<uint32_t>();
+        break;
+        case Datatype::UInt64:
+          stream << op->getValue<uint64_t>();
+        break;
+        case Datatype::UInt128:
+          taco_not_supported_yet;
+        break;
+        case Datatype::Int8:
+          stream << static_cast<int16_t>(op->getValue<int8_t>());
+        break;
+        case Datatype::Int16:
+          stream << op->getValue<int16_t>();
+        break;
+        case Datatype::Int32:
+          stream << op->getValue<int32_t>();
+        break;
+        case Datatype::Int64:
+          stream << op->getValue<int64_t>();
+        break;
+        case Datatype::Int128:
+          taco_not_supported_yet;
+        break;
+        case Datatype::Float32:
+          stream << ((op->getValue<float>() != 0.0)
+                    ? util::toString(op->getValue<float>()) : "0.0");
+        break;
+        case Datatype::Float64:
+          stream << ((op->getValue<double>()!=0.0)
+                    ? util::toString(op->getValue<double>()) : "0.0");
+        break;
+        case Datatype::Complex64: {
+          std::complex<float> val = op->getValue<std::complex<float>>();
+          stream << val.real() << " + I*" << val.imag();
+        }
+        break;
+        case Datatype::Complex128: {
+          std::complex<double> val = op->getValue<std::complex<double>>();
+          stream << val.real() << " + I*" << val.imag();
+        }
+        break;
+        case Datatype::Undefined:
+          taco_ierror << "Undefined type in IR";
+        break;
+      }
+
+      if (color) {
+        stream << nc;
+      }
+
+    
   }
+  
 }
 
 void IRPrinter::visit(const Var* op) {
-  if (varNames.contains(op)) {
-    stream << varNames.get(op);
+  if (is_ISPC_code_stream_enabled()) {
+    if (varNames.contains(op)) {
+      stream2 << varNames.get(op);
+    }
+    else {
+      stream2 << op->name;
+    }
   }
   else {
-    stream << op->name;
+    if (varNames.contains(op)) {
+      stream << varNames.get(op);
+    }
+    else {
+      stream << op->name;
+    }
   }
+
 }
 
 void IRPrinter::visit(const Neg* op) {
@@ -238,51 +333,101 @@ void IRPrinter::visit(const Cast* op) {
 }
 
 void IRPrinter::visit(const Call* op) {
-  stream << op->func << "(";
-  parentPrecedence = Precedence::CALL;
-  acceptJoin(this, stream, op->args, ", ");
-  stream << ")";
+  if (!is_ISPC_code_stream_enabled()) {
+    stream << op->func << "(";
+    parentPrecedence = Precedence::CALL;
+    acceptJoin(this, stream, op->args, ", ");
+    stream << ")";
+  } else {
+    // statically added function to the ispc file has __ in the front
+    stream2 << "__" << op->func << "(";
+    parentPrecedence = Precedence::CALL;
+    acceptJoin(this, stream2, op->args, ", ");
+    stream2 << ")";
+  }
 }
 
 void IRPrinter::visit(const IfThenElse* op) {
   taco_iassert(op->cond.defined());
   taco_iassert(op->then.defined());
   doIndent();
-  stream << keywordString("if ");
-  stream << "(";
-  parentPrecedence = Precedence::TOP;
-  op->cond.accept(this);
-  stream << ")";
+  if (is_ISPC_code_stream_enabled()) {
+    stream2 << keywordString("if ");
+    stream2 << "(";
+    parentPrecedence = Precedence::TOP;
+    op->cond.accept(this);
+    stream2 << ")";
+
+    Stmt scopedStmt = Stmt(to<Scope>(op->then)->scopedStmt);
+    if (isa<Block>(scopedStmt)) {
+      stream2 << " {" << endl;
+      op->then.accept(this);
+      doIndent();
+      stream2 << "}";
+    }
+    else if (isa<Assign>(scopedStmt)) {
+      int tmp = indent;
+      indent = 0;
+      stream2 << " ";
+      scopedStmt.accept(this);
+      indent = tmp;
+    }
+    else {
+      stream2 << endl;
+      op->then.accept(this);
+    }
 
-  Stmt scopedStmt = Stmt(to<Scope>(op->then)->scopedStmt);
-  if (isa<Block>(scopedStmt)) {
-    stream << " {" << endl;
-    op->then.accept(this);
-    doIndent();
-    stream << "}";
-  }
-  else if (isa<Assign>(scopedStmt)) {
-    int tmp = indent;
-    indent = 0;
-    stream << " ";
-    scopedStmt.accept(this);
-    indent = tmp;
+    if (op->otherwise.defined()) {
+      stream2 << "\n";
+      doIndent();
+      stream2 << keywordString("else");
+      stream2 << " {\n";
+      op->otherwise.accept(this);
+      doIndent();
+      stream2 << "}";
+    }
+    stream2 << endl;    
   }
+
+
   else {
-    stream << endl;
-    op->then.accept(this);
-  }
+    stream << keywordString("if ");
+    stream << "(";
+    parentPrecedence = Precedence::TOP;
+    op->cond.accept(this);
+    stream << ")";
 
-  if (op->otherwise.defined()) {
-    stream << "\n";
-    doIndent();
-    stream << keywordString("else");
-    stream << " {\n";
-    op->otherwise.accept(this);
-    doIndent();
-    stream << "}";
+    Stmt scopedStmt = Stmt(to<Scope>(op->then)->scopedStmt);
+    if (isa<Block>(scopedStmt)) {
+      stream << " {" << endl;
+      op->then.accept(this);
+      doIndent();
+      stream << "}";
+    }
+    else if (isa<Assign>(scopedStmt)) {
+      int tmp = indent;
+      indent = 0;
+      stream << " ";
+      scopedStmt.accept(this);
+      indent = tmp;
+    }
+    else {
+      stream << endl;
+      op->then.accept(this);
+    }
+
+    if (op->otherwise.defined()) {
+      stream << "\n";
+      doIndent();
+      stream << keywordString("else");
+      stream << " {\n";
+      op->otherwise.accept(this);
+      doIndent();
+      stream << "}";
+    }
+    stream << endl;    
   }
-  stream << endl;
+
 }
 
 void IRPrinter::visit(const Case* op) {
@@ -345,12 +490,22 @@ void IRPrinter::visit(const Switch* op) {
 }
 
 void IRPrinter::visit(const Load* op) {
-  parentPrecedence = Precedence::LOAD;
-  op->arr.accept(this);
-  stream << "[";
-  parentPrecedence = Precedence::LOAD;
-  op->loc.accept(this);
-  stream << "]";
+  if (is_ISPC_code_stream_enabled()) {
+    parentPrecedence = Precedence::LOAD;
+    op->arr.accept(this);
+    stream2 << "[";
+    parentPrecedence = Precedence::LOAD;
+    op->loc.accept(this);
+    stream2 << "]";    
+  }
+  else {
+    parentPrecedence = Precedence::LOAD;
+    op->arr.accept(this);
+    stream << "[";
+    parentPrecedence = Precedence::LOAD;
+    op->loc.accept(this);
+    stream << "]";   
+  }
 }
 
 void IRPrinter::visit(const Malloc* op) {
@@ -367,66 +522,149 @@ void IRPrinter::visit(const Sizeof* op) {
 }
 
 void IRPrinter::visit(const Store* op) {
-  doIndent();
-  op->arr.accept(this);
-  stream << "[";
-  parentPrecedence = Precedence::TOP;
-  op->loc.accept(this);
-  stream << "] = ";
-  parentPrecedence = Precedence::TOP;
-  op->data.accept(this);
-  stream << ";";
-  stream << endl;
+  if (is_ISPC_code_stream_enabled()) {
+    doIndent();
+    op->arr.accept(this);
+    stream2 << "[";
+    parentPrecedence = Precedence::TOP;
+    op->loc.accept(this);
+    stream2 << "] = ";
+    parentPrecedence = Precedence::TOP;
+    op->data.accept(this);
+    stream2 << ";";
+    stream2 << endl;
+  }
+  else {
+    doIndent();
+    op->arr.accept(this);
+    stream << "[";
+    parentPrecedence = Precedence::TOP;
+    op->loc.accept(this);
+    stream << "] = ";
+    parentPrecedence = Precedence::TOP;
+    op->data.accept(this);
+    stream << ";";
+    stream << endl;
+  }
+
 }
 
 void IRPrinter::visit(const For* op) {
-  doIndent();
-  stream << keywordString("for") << " (" 
-         << keywordString(util::toString(op->var.type())) << " ";
-  op->var.accept(this);
-  stream << " = ";
-  op->start.accept(this);
-  stream << keywordString("; ");
-  op->var.accept(this);
-  stream << " < ";
-  parentPrecedence = BOTTOM;
-  op->end.accept(this);
-  stream << keywordString("; ");
-  op->var.accept(this);
+  // std::cout << "This is IRPrinter::visit For op method\n";
+  if (is_ISPC_code_stream_enabled()) {
+    doIndent();
+    stream2 << keywordString("for") << " (" 
+          << keywordString(util::toString(op->var.type())) << " ";
+    op->var.accept(this);
+    stream2 << " = ";
+    op->start.accept(this);
+    stream2 << keywordString("; ");
+    op->var.accept(this);
+    stream2 << " < ";
+    parentPrecedence = BOTTOM;
+    op->end.accept(this);
+    stream2 << keywordString("; ");
+    op->var.accept(this);
+
+    auto lit = op->increment.as<Literal>();
+    if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
+                          (lit->type.isUInt() && lit->equalsScalar(1)))) {
+      stream2 << "++";
+    }
+    else {
+      stream2 << " += ";
+      op->increment.accept(this);
+    }
+    stream2 << ") {\n";
 
-  auto lit = op->increment.as<Literal>();
-  if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
-                         (lit->type.isUInt() && lit->equalsScalar(1)))) {
-    stream << "++";
+    op->contents.accept(this);
+    doIndent();
+    stream2 << "}";
+    stream2 << endl;
   }
+  
+  
   else {
-    stream << " += ";
-    op->increment.accept(this);
+    doIndent();
+    stream << keywordString("for") << " (" 
+          << keywordString(util::toString(op->var.type())) << " ";
+    op->var.accept(this);
+    stream << " = ";
+    op->start.accept(this);
+    stream << keywordString("; ");
+    op->var.accept(this);
+    stream << " < ";
+    parentPrecedence = BOTTOM;
+    op->end.accept(this);
+    stream << keywordString("; ");
+    op->var.accept(this);
+
+    auto lit = op->increment.as<Literal>();
+    if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
+                          (lit->type.isUInt() && lit->equalsScalar(1)))) {
+      stream << "++";
+    }
+    else {
+      stream << " += ";
+      op->increment.accept(this);
+    }
+    stream << ") {\n";
+
+    op->contents.accept(this);
+    doIndent();
+    stream << "}";
+    stream << endl;    
   }
-  stream << ") {\n";
 
-  op->contents.accept(this);
-  doIndent();
-  stream << "}";
-  stream << endl;
+}
+
+void IRPrinter::sendToStream(std::stringstream &stream) {
+  if (is_ISPC_code_stream_enabled()) {
+    this->stream2 << stream.str();
+  }
+  else {
+    this->stream << stream.str();
+  }
 }
 
 void IRPrinter::visit(const While* op) {
-  doIndent();
-  stream << keywordString("while ");
-  stream << "(";
-  parentPrecedence = Precedence::TOP;
-  op->cond.accept(this);
-  stream << ")";
-  stream << " {\n";
-  op->contents.accept(this);
-  doIndent();
-  stream << "}";
-  stream << endl;
+  // std::stringstream stream;
+  if (is_ISPC_code_stream_enabled()) {
+    doIndent();
+    stream2 << keywordString("while ");
+    stream2 << "(";
+    parentPrecedence = Precedence::TOP;
+    op->cond.accept(this);
+    stream2 << ")";
+    stream2 << " {\n";
+    op->contents.accept(this);
+    doIndent();
+    stream2 << "}";
+    stream2 << endl;    
+  }
+  else {
+    doIndent();
+    stream << keywordString("while ");
+    stream << "(";
+    parentPrecedence = Precedence::TOP;
+    op->cond.accept(this);
+    stream << ")";
+    stream << " {\n";
+    op->contents.accept(this);
+    doIndent();
+    stream << "}";
+    stream << endl;
+  }
+  // sendToStream(stream);
 }
 
 void IRPrinter::visit(const Block* op) {
-  acceptJoin(this, stream, op->contents, "");
+  if (is_ISPC_code_stream_enabled()) {
+    acceptJoin(this, stream2, op->contents, "");
+  }
+  else {
+    acceptJoin(this, stream, op->contents, "");
+  }
 }
 
 void IRPrinter::visit(const Scope* op) {
@@ -438,85 +676,140 @@ void IRPrinter::visit(const Scope* op) {
 }
 
 void IRPrinter::visit(const Function* op) {
-  stream << keywordString("void ") << op->name;
-  stream << "(";
-  if (op->outputs.size() > 0) stream << "Tensor ";
-  acceptJoin(this, stream, op->outputs, ", Tensor ");
-  if (op->outputs.size() > 0 && op->inputs.size()) stream << ", ";
-  if (op->inputs.size() > 0) stream << "Tensor ";
-  acceptJoin(this, stream, op->inputs, ", Tensor ");
-  stream << ") {" << endl;
+  if (is_ISPC_code_stream_enabled()) {
+    stream2 << keywordString("void ") << op->name;
+    stream2 << "(";
+    if (op->outputs.size() > 0) stream2 << "Tensor ";
+    acceptJoin(this, stream2, op->outputs, ", Tensor ");
+    if (op->outputs.size() > 0 && op->inputs.size()) stream2 << ", ";
+    if (op->inputs.size() > 0) stream2 << "Tensor ";
+    acceptJoin(this, stream2, op->inputs, ", Tensor ");
+    stream2 << ") {" << endl;
+
+    resetNameCounters();
+    op->body.accept(this);
 
-  resetNameCounters();
-  op->body.accept(this);
+    doIndent();
+    stream2 << "}";
+  }
+  else {
+    stream << keywordString("void ") << op->name;
+    stream << "(";
+    if (op->outputs.size() > 0) stream << "Tensor ";
+    acceptJoin(this, stream, op->outputs, ", Tensor ");
+    if (op->outputs.size() > 0 && op->inputs.size()) stream << ", ";
+    if (op->inputs.size() > 0) stream << "Tensor ";
+    acceptJoin(this, stream, op->inputs, ", Tensor ");
+    stream << ") {" << endl;
+
+    resetNameCounters();
+    op->body.accept(this);
+
+    doIndent();
+    stream << "}";
+  }
 
-  doIndent();
-  stream << "}";
 }
 
 void IRPrinter::visit(const VarDecl* op) {
-  doIndent();
-  stream << keywordString(util::toString(op->var.type()));
-  taco_iassert(isa<Var>(op->var));
-  if (to<Var>(op->var)->is_ptr) {
-    stream << "* restrict";
-  }
-  stream << " ";
-  string varName = varNameGenerator.getUniqueName(util::toString(op->var));
-  varNames.insert({op->var, varName});
-  op->var.accept(this);
-  parentPrecedence = Precedence::TOP;
-  stream << " = ";
-  op->rhs.accept(this);
-  stream << ";";
-  stream << endl;
+  if (is_ISPC_code_stream_enabled()) {
+    doIndent();
+    if (op->var.type() == Int32) {
+      stream2 << keywordString("int32");
+    }
+    else if (op->var.type() == Int64) {
+      stream2 << keywordString("int64");
+    } else {
+      stream2 << keywordString(util::toString(op->var.type()));
+    }
+    taco_iassert(isa<Var>(op->var));
+    if (to<Var>(op->var)->is_ptr) {
+      stream2 << "* "; // removed restrict keyword from here
+    }
+    stream2 << " ";
+    string varName = varNameGenerator.getUniqueName(util::toString(op->var));
+    varNames.insert({op->var, varName});
+    op->var.accept(this);
+    parentPrecedence = Precedence::TOP;
+    stream2 << " = ";
+    op->rhs.accept(this);
+    stream2 << ";";
+    stream2 << endl;
+  }
+  else {
+    doIndent();
+    stream << keywordString(util::toString(op->var.type()));
+    taco_iassert(isa<Var>(op->var));
+    if (to<Var>(op->var)->is_ptr) {
+      stream << "* restrict";
+    }
+    stream << " ";
+    string varName = varNameGenerator.getUniqueName(util::toString(op->var));
+    varNames.insert({op->var, varName});
+    op->var.accept(this);
+    parentPrecedence = Precedence::TOP;
+    stream << " = ";
+    op->rhs.accept(this);
+    stream << ";";
+    stream << endl;
+  }
+
 }
 
 void IRPrinter::visit(const Assign* op) {
-  doIndent();
-  op->lhs.accept(this);
-  parentPrecedence = Precedence::TOP;
-  bool printed = false;
-  if (simplify) {
-    if (isa<ir::Add>(op->rhs)) {
-      auto add = to<Add>(op->rhs);
-      if (add->a == op->lhs) {
-        const Literal* lit = add->b.as<Literal>();
-        if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
-                               (lit->type.isUInt() && lit->equalsScalar(1)))) {
-          stream << "++";
+  // Prints `lhs = rhs;` on the stream selected by the ISPC flag.  When
+  // `simplify` is set and rhs is Add/Mul/BitOr with lhs as first operand,
+  // the compound form is printed instead (`++` for adding the integer
+  // literal 1, otherwise `+=`, `*=`, `|=`).  Both modes share this path
+  // so that ISPC output does not silently drop assignments.
+  ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
+  {
+    doIndent();
+    op->lhs.accept(this);
+    parentPrecedence = Precedence::TOP;
+    bool printed = false;
+    if (simplify) {
+      if (isa<ir::Add>(op->rhs)) {
+        auto add = to<Add>(op->rhs);
+        if (add->a == op->lhs) {
+          const Literal* lit = add->b.as<Literal>();
+          if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
+                                (lit->type.isUInt() && lit->equalsScalar(1)))) {
+            out << "++";
+          }
+          else {
+            out << " += ";
+            add->b.accept(this);
+          }
+          printed = true;
         }
-        else {
-          stream << " += ";
-          add->b.accept(this);
+      }
+      else if (isa<Mul>(op->rhs)) {
+        auto mul = to<Mul>(op->rhs);
+        if (mul->a == op->lhs) {
+          out << " *= ";
+          mul->b.accept(this);
+          printed = true;
         }
-        printed = true;
       }
-    }
-    else if (isa<Mul>(op->rhs)) {
-      auto mul = to<Mul>(op->rhs);
-      if (mul->a == op->lhs) {
-        stream << " *= ";
-        mul->b.accept(this);
-        printed = true;
+      else if (isa<BitOr>(op->rhs)) {
+        auto bitOr = to<BitOr>(op->rhs);
+        if (bitOr->a == op->lhs) {
+          out << " |= ";
+          bitOr->b.accept(this);
+          printed = true;
+        }
       }
     }
-    else if (isa<BitOr>(op->rhs)) {
-      auto bitOr = to<BitOr>(op->rhs);
-      if (bitOr->a == op->lhs) {
-        stream << " |= ";
-        bitOr->b.accept(this);
-        printed = true;
-      }
+    if (!printed) {
+      out << " = ";
+      op->rhs.accept(this);
     }
-  }
-  if (!printed) {
-    stream << " = ";
-    op->rhs.accept(this);
+
+    out << ";";
+    out << endl;
   }
 
-  stream << ";";
-  stream << endl;
 }
 
 void IRPrinter::visit(const Yield* op) {
@@ -544,12 +837,22 @@ void IRPrinter::visit(const Allocate* op) {
 }
 
 void IRPrinter::visit(const Free* op) {
-  doIndent();
-  stream << "free(";
-  parentPrecedence = Precedence::TOP;
-  op->var.accept(this);
-  stream << ");";
-  stream << endl;
+  if (is_ISPC_code_stream_enabled()) {
+    doIndent();
+    stream2 << "delete[] ";
+    parentPrecedence = Precedence::TOP;
+    op->var.accept(this);
+    stream2 << ";";
+    stream2 << endl;
+  }
+  else {
+    doIndent();
+    stream << "free(";
+    parentPrecedence = Precedence::TOP;
+    op->var.accept(this);
+    stream << ");";
+    stream << endl;
+  }
 }
 
 void IRPrinter::visit(const Comment* op) {
@@ -559,17 +862,32 @@ void IRPrinter::visit(const Comment* op) {
 }
 
 void IRPrinter::visit(const BlankLine*) {
-  stream << endl;
+  if (is_ISPC_code_stream_enabled()) {
+    stream2 << endl;
+  } 
+  else {
+    stream << endl;
+  }
 }
 
 void IRPrinter::visit(const Continue*) {
   doIndent();
-  stream << "continue;" << endl;
+  if (!is_ISPC_code_stream_enabled()) {
+    stream << "continue;" << endl;
+  }
+  else {
+    stream2 << "continue;" << endl;
+  }
 }
 
 void IRPrinter::visit(const Break*) {
   doIndent();
-  stream << "break;" << endl;
+  if (!is_ISPC_code_stream_enabled()) {
+    stream << "break;" << endl;
+  }
+  else {
+    stream2 << "break;" << endl;
+  }
 }
 
 void IRPrinter::visit(const Print* op) {
@@ -585,7 +903,12 @@ void IRPrinter::visit(const Print* op) {
 }
 
 void IRPrinter::visit(const GetProperty* op) {
-  stream << op->name;
+  if (is_ISPC_code_stream_enabled()) {
+    stream2 << op->name;
+  }
+  else {
+    stream << op->name;
+  }
 }
 
 void IRPrinter::visit(const Sort* op) {
@@ -643,23 +966,47 @@ void IRPrinter::resetNameCounters() {
 }
 
 void IRPrinter::doIndent() {
-  for (int i=0; i<indent; i++)
-    stream << "  ";
+  if (is_ISPC_code_stream_enabled()) {
+    for (int i=0; i<indent; i++)
+      stream2 << "  ";  
+  }
+  else {
+    for (int i=0; i<indent; i++)
+      stream << "  ";
+  }
+
 }
 
 void IRPrinter::printBinOp(Expr a, Expr b, string op, Precedence precedence) {
-  bool parenthesize = needsParentheses(precedence);
-  if (parenthesize) {
-    stream << "(";
+  if (is_ISPC_code_stream_enabled()) {
+    bool parenthesize = needsParentheses(precedence);
+    if (parenthesize) {
+      stream2 << "(";
+    }
+    parentPrecedence = precedence;
+    a.accept(this);
+    stream2 << " " << op << " ";
+    parentPrecedence = precedence;
+    b.accept(this);
+    if (parenthesize) {
+      stream2 << ")";
+    }
   }
-  parentPrecedence = precedence;
-  a.accept(this);
-  stream << " " << op << " ";
-  parentPrecedence = precedence;
-  b.accept(this);
-  if (parenthesize) {
-    stream << ")";
+  else {
+    bool parenthesize = needsParentheses(precedence);
+    if (parenthesize) {
+      stream << "(";
+    }
+    parentPrecedence = precedence;
+    a.accept(this);
+    stream << " " << op << " ";
+    parentPrecedence = precedence;
+    b.accept(this);
+    if (parenthesize) {
+      stream << ")";
+    }
   }
+
 }
 
 bool IRPrinter::needsParentheses(Precedence precedence) {
diff --git a/src/ir/ir_rewriter.cpp b/src/ir/ir_rewriter.cpp
index eed6f2bab..2e4827497 100644
--- a/src/ir/ir_rewriter.cpp
+++ b/src/ir/ir_rewriter.cpp
@@ -292,6 +292,7 @@ void IRRewriter::visit(const Store* op) {
 }
 
 void IRRewriter::visit(const For* op) {
+  // std::cout << "This is IRRewriter::visit(const For* op) method: For: " << op << std::endl;
   Expr var       = rewrite(op->var);
   Expr start     = rewrite(op->start);
   Expr end       = rewrite(op->end);
diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp
index af3dbd775..e7365d6c2 100644
--- a/src/ir_tags.cpp
+++ b/src/ir_tags.cpp
@@ -2,7 +2,7 @@
 
 namespace taco {
 
-const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"};
+const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction", "CPUSimd", "CPUSpmd"};
 const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"};
 const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"};
 const char *AssembleStrategy_NAMES[] = {"Append", "Insert"};
diff --git a/src/lower/iteration_graph.cpp b/src/lower/iteration_graph.cpp
index 77735a8d2..482d84aae 100644
--- a/src/lower/iteration_graph.cpp
+++ b/src/lower/iteration_graph.cpp
@@ -48,6 +48,8 @@ struct IterationGraph::Content {
 IterationGraph::IterationGraph() {
 }
 
+// remember that iteration graph does not have an ordering
+// I got the ordering from topologically reorder index Ryan wrote
 IterationGraph IterationGraph::make(Assignment assignment) {
   TensorVar tensor = assignment.getLhs().getTensorVar();
   IndexExpr expr = assignment.getRhs();
@@ -64,8 +66,16 @@ IterationGraph IterationGraph::make(Assignment assignment) {
     oldToSplitVar.insert({indexVar, indexVar});
   }
 
+  // access nodes of right hand side
   match(expr,
     function<void(const AccessNode*)>([&](const AccessNode* op) {
+      // Debug trace of each right-hand-side access (disabled: no stdout output).
+      // std::cout << "access node: " << op->tensorVar << " <- " << IndexExpr(op) << std::endl;
+      // std::cout << "index var: ";
+      // for (auto indexVar : op->indexVars) {
+      //   std::cout << indexVar << " ";
+      // }
+      // std::cout << std::endl;
       auto type = op->tensorVar.getType();
       taco_iassert((size_t)type.getShape().getOrder() == op->indexVars.size())
           << "Tensor access " << IndexExpr(op) << " but tensor format only has "
diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp
index 0f0c024c5..eb3d8ac3b 100644
--- a/src/lower/iterator.cpp
+++ b/src/lower/iterator.cpp
@@ -569,6 +569,9 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI
                                       ProvenanceGraph provGraph,
                                       const map<TensorVar, Expr> &tensorVars) {
   TensorVar tensorConcrete = access.getTensorVar();
+  // The accessed tensor and its storage format must agree on order; the
+  // assertion below reports both operands if they differ, so no separate
+  // trace output is needed here.
   taco_iassert(tensorConcrete.getOrder() == format.getOrder())
       << tensorConcrete << ", Format" << format;
   Shape shape = tensorConcrete.getType().getShape();
diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp
index b4c9ea710..1355c80a1 100644
--- a/src/lower/lowerer_impl_imperative.cpp
+++ b/src/lower/lowerer_impl_imperative.cpp
@@ -1,4 +1,6 @@
 #include <taco/lower/mode_format_compressed.h>
+#include "taco/cuda.h"
+#include "taco/ir_tags.h"
 #include "taco/lower/lowerer_impl_imperative.h"
 #include "taco/lower/lowerer_impl.h"
 
@@ -26,6 +28,7 @@ class LowererImplImperative::Visitor : public IndexNotationVisitorStrict {
 public:
   Visitor(LowererImplImperative* impl) : impl(impl) {}
   Stmt lower(IndexStmt stmt) {
+    // std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl;
     this->stmt = Stmt();
     impl->accessibleIterators.scope();
     IndexStmtVisitorStrict::visit(stmt);
@@ -135,6 +138,7 @@ static bool returnsTrue(IndexExpr expr) {
     }
 
     void visit(const CastNode* op) {
+      // Only the cast's operand is rewritten; the result is stored in `expr`.
       expr = rewrite(op->a);
     }
 
@@ -200,6 +204,7 @@ static std::set<Expr> hasSparseInserts(IndexStmt stmt, Iterators iterators,
   return ret;
 }
 
+
 Stmt
 LowererImplImperative::lower(IndexStmt stmt, string name,
                    bool assemble, bool compute, bool pack, bool unpack)
@@ -414,6 +419,7 @@ LowererImplImperative::lower(IndexStmt stmt, string name,
 
 Stmt LowererImplImperative::lowerAssignment(Assignment assignment)
 {
+  // std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n";
   taco_iassert(generateAssembleCode() || generateComputeCode());
 
   Stmt computeStmt;
@@ -421,7 +427,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment)
   Expr var = getTensorVar(result);
 
   const bool needComputeAssign = util::contains(needCompute, result);
-
+  // std::cout << "does assignment need compute assign: " << needComputeAssign << std::endl;
   Expr rhs;
   if (needComputeAssign) {
     rhs = lower(assignment.getRhs());
@@ -429,20 +435,51 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment)
 
   // Assignment to scalar variables.
   if (isScalar(result.getType())) {
+    // std::cout << "assignment to scalar variables\n";
     if (needComputeAssign) {
+      // std::cout << "compute assign\n";
       if (!assignment.getOperator().defined()) {
+        // std::cout << "assignment operator is not defined\n";
+        // std::cout << "var: " << var << ", rhs, : " << rhs << std::endl;
         computeStmt = Assign::make(var, rhs);
       }
       else {
         taco_iassert(isa<taco::Add>(assignment.getOperator()));
-        bool useAtomics = markAssignsAtomicDepth > 0 &&
-                          !util::contains(whereTemps, result);
+        // Loop depth recorded for `result` in whereTempsWithLoopDepth.
+        // The map may hold no entry for `result`; dereferencing find()'s end()
+        // iterator would be undefined behaviour, so fall back to loopDepth,
+        // which keeps every loop out of the range tested below.
+        const auto tempDepthIt = whereTempsWithLoopDepth.find(result);
+        const int tempVarInitLoopDepth =
+            (tempDepthIt != whereTempsWithLoopDepth.end()) ? tempDepthIt->second
+                                                           : loopDepth;
+
+        // Treat the update as a reduction when a CPUSimd loop lies deeper than
+        // the recorded depth of the temporary but no deeper than loopDepth.
+        bool reduction = false;
+        for (const auto& depthAndUnit : forUnits) {
+          const int depth = depthAndUnit.first;
+          if (depth <= loopDepth && depth > tempVarInitLoopDepth &&
+              depthAndUnit.second == ParallelUnit::CPUSimd) {
+            reduction = true;
+            break;
+          }
+        }
+
+        // Inside an atomic region, use atomics for results that are not
+        // where-temporaries and for temporaries reduced across a CPUSimd loop.
+        bool useAtomics = markAssignsAtomicDepth > 0 && (!util::contains(whereTemps, result) || reduction);
         computeStmt = compoundAssign(var, rhs, useAtomics, atomicParallelUnit);
+        // std::cout << "computeStatment: " << computeStmt << std::endl;
       }
     }
+    else {
+      // std::cout << "not compute assign\n";
+    }
   }
   // Assignments to tensor variables (non-scalar).
   else {
+    // std::cout << "assignment to tensor variables\n";
     Expr values = getValuesArray(result);
     Expr loc = generateValueLocExpr(assignment.getLhs());
 
@@ -476,6 +513,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment)
     }
 
     if (needComputeAssign && values.defined()) {
+      // std::cout << "assign compute statement\n";
       if (!assignment.getOperator().defined()) {
         computeStmt = Store::make(values, loc, rhs);
       }
@@ -586,19 +624,39 @@ LowererImplImperative::splitAppenderAndInserters(const vector<Iterator>& results
 }
 
 
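+/// Lowers a forall statement into imperative loop IR. On entry the loop
+/// nesting depth (`loopDepth`) is incremented and this forall's parallel unit
+/// is recorded in `forUnits` under that depth; both are undone just before the
+/// final return. NOTE(review): the early return through lowerForallCloned()
+/// does not undo them -- confirm this is intended.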
 Stmt LowererImplImperative::lowerForall(Forall forall)
 {
+  loopDepth++;
+  forUnits.insert(std::pair<int, ParallelUnit>(loopDepth,forall.getParallelUnit()));
+  // std::cout << "doing lowerForall: " << forall << std::endl;
   bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar());
   bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards;
+
+
+  // std::cout << "printing temporary variables with their atomic depths\n";
+  map<TensorVar, int>::iterator itr;
+  for (itr = whereTempsWithLoopDepth.begin(); itr != whereTempsWithLoopDepth.end(); ++itr) {
+    // std::cout << itr->first << "\t" << itr->second << "\n";
+  }
+
+
   if (!ignoreVectorize && forallNeedsUnderivedGuards &&
       (forall.getParallelUnit() == ParallelUnit::CPUVector ||
        forall.getUnrollFactor() > 0)) {
+    // std::cout << "calling lowerForallCloned(forall)\n";
     return lowerForallCloned(forall);
   }
 
+  // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n";
   if (forall.getParallelUnit() != ParallelUnit::NotParallel) {
     inParallelLoopDepth++;
   }
+  // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n";
 
   // Recover any available parents that were not recoverable previously
   vector<Stmt> recoverySteps;
@@ -786,19 +844,23 @@ Stmt LowererImplImperative::lowerForall(Forall forall)
     }
 
     if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) {
+      // std::cout << "calling lowerForallFusedPosition(forall\n";
       loops = lowerForallFusedPosition(forall, iterator, locators,
                                          inserters, appenders, reducedAccesses, recoveryStmt);
     }
     else if (canAccelWithSparseIteration) {
+      // std::cout << "calling lowerForallDenseAcceleration(forall\n";
       loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt);
     }
     // Emit dimension coordinate iteration loop
     else if (iterator.isDimensionIterator()) {
+      // std::cout << "calling lowerForallDimension(forall\n";
       loops = lowerForallDimension(forall, point.locators(),
                                    inserters, appenders, reducedAccesses, recoveryStmt);
     }
     // Emit position iteration loop
     else if (iterator.hasPosIter()) {
+      // std::cout << "calling lowerForallPosition(forall\n";
       loops = lowerForallPosition(forall, iterator, locators,
                                     inserters, appenders, reducedAccesses, recoveryStmt);
     }
@@ -816,6 +878,10 @@ Stmt LowererImplImperative::lowerForall(Forall forall)
     loops = lowerMergeLattice(lattice, underivedAncestors[0],
                               forall.getStmt(), reducedAccesses);
   }
+
+  // std::cout << "printing loops ----------------------------------------------------------------------------------------------\n";
+  // std::cout << loops << std::endl;
+  // std::cout << "loops printed -----------------------------------------------------------------------------------------------\n";
 //  taco_iassert(loops.defined());
 
   if (!generateComputeCode() && !hasStores(loops)) {
@@ -832,6 +898,9 @@ Stmt LowererImplImperative::lowerForall(Forall forall)
     parallelUnitIndexVars.erase(forall.getParallelUnit());
     parallelUnitSizes.erase(forall.getParallelUnit());
   }
+  
+  forUnits.erase(loopDepth);
+  loopDepth--;
   return Block::blanks(preInitValues,
                        temporaryValuesInitFree[0],
                        loops,
@@ -1136,13 +1205,22 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
                                        set<Access> reducedAccesses,
                                        ir::Stmt recoveryStmt)
 {
+  // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n";
+  // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl;
   Expr coordinate = getCoordinateVar(forall.getIndexVar());
 
   if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) {
     markAssignsAtomicDepth++;
+    // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n";
+    // std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl;
     atomicParallelUnit = forall.getParallelUnit();
   }
+  else {
+    // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n";
+  }
 
+  // std::cout << "original forall : " << forall << std::endl;
+  // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl;
   Stmt body = lowerForallBody(coordinate, forall.getStmt(),
                               locators, inserters, appenders, reducedAccesses);
 
@@ -1158,7 +1236,18 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
   std::vector<ir::Expr> bounds = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators);
 
   LoopKind kind = LoopKind::Serial;
-  if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) {
+  if (should_use_ISPC_codegen()) {
+    // std::cout << "Foreach compatible loop\n";
+    if (forall.getParallelUnit() == ParallelUnit::CPUSimd) {
+      kind = LoopKind::Foreach;
+    }
+    else if (forall.getParallelUnit() == ParallelUnit::CPUSpmd 
+            && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction
+    ) {
+      kind = LoopKind::Mul_Thread;
+    }
+  } 
+  else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) {
     kind = LoopKind::Vectorized;
   }
   else if (forall.getParallelUnit() != ParallelUnit::NotParallel
@@ -1166,6 +1255,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
     kind = LoopKind::Runtime;
   }
 
+  // std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n";
   return Block::blanks(For::make(coordinate, bounds[0], bounds[1], 1, body,
                                  kind,
                                  ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()),
@@ -1179,6 +1269,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
                                                  set<Access> reducedAccesses,
                                                  ir::Stmt recoveryStmt)
   {
+    // std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n";
     taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor";
     taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars";
     taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops";
@@ -1204,6 +1295,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
     }
 
     Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar));
+    // std::cout << "original forall : " << forall << std::endl;
+    // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl;
     Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses);
     Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit);
 
@@ -1216,7 +1309,12 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
     Stmt posAppend = generateAppendPositions(appenders);
 
     LoopKind kind = LoopKind::Serial;
-    if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) {
+    if (should_use_ISPC_codegen()) {
+      if (forall.getParallelUnit() == ParallelUnit::CPUSimd) {
+        kind = LoopKind::Foreach;
+      }
+    }
+    else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) {
       kind = LoopKind::Vectorized;
     }
     else if (forall.getParallelUnit() != ParallelUnit::NotParallel
@@ -1224,6 +1322,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
       kind = LoopKind::Runtime;
     }
 
+    // std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n";
     return Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind,
                                          ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(),
                                          ignoreVectorize ? 0 : forall.getUnrollFactor()),
@@ -1247,6 +1346,8 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator
                                       set<Access> reducedAccesses,
                                       ir::Stmt recoveryStmt)
 {
+  // std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl;
+
   Expr coordinate = getCoordinateVar(forall.getIndexVar());
   Stmt declareCoordinate = Stmt();
   Stmt strideGuard = Stmt();
@@ -1278,6 +1379,11 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator
     markAssignsAtomicDepth++;
   }
 
+  // We are inside a forall, e.g. forall(i, forall(j, y(i) += A(i,j) * x(j))).
+  // forall.getStmt() returns the nested IndexStmt, here
+  // forall(j, y(i) += A(i,j) * x(j)), which is lowered below as the loop body.
+  // std::cout << "original forall : " << forall << std::endl;
+  // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl;
   Stmt body = lowerForallBody(coordinate, forall.getStmt(),
                               locators, inserters, appenders, reducedAccesses);
 
@@ -1339,6 +1445,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator
     kind = LoopKind::Runtime;
   }
 
+  // std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl;
   // Loop with preamble and postamble
   return Block::blanks(
                        boundsCompute,
@@ -1357,6 +1464,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite
                                       set<Access> reducedAccesses,
                                       ir::Stmt recoveryStmt)
 {
+  // std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl;
   Expr coordinate = getCoordinateVar(forall.getIndexVar());
   Stmt declareCoordinate = Stmt();
   if (provGraph.isCoordVariable(forall.getIndexVar())) {
@@ -1447,6 +1555,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite
     markAssignsAtomicDepth++;
   }
 
+  // std::cout << "original forall : " << forall << std::endl;
+  // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl;
   Stmt body = lowerForallBody(coordinate, forall.getStmt(),
                               locators, inserters, appenders, reducedAccesses);
 
@@ -1503,6 +1613,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite
            && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) {
     kind = LoopKind::Runtime;
   }
+
+  // std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl;
   // Loop with preamble and postamble
   return Block::blanks(boundsCompute,
                        Block::make(Block::make(searchForUnderivedStart),
@@ -1603,6 +1715,7 @@ Stmt LowererImplImperative::lowerMergePoint(MergeLattice pointLattice,
       ir::Assign::make(indexSetIter.getCoordVar(), indexSetIter.getPosVar())
     );
     // Code to increment both iterator variables.
+    std::cout << "some casting stuff happening\n";
     auto incr = ir::Block::make(
       compoundAssign(iter.getIteratorVar(), ir::Cast::make(Eq::make(iter.getCoordVar(), setMatch), iter.getIteratorVar().type())),
       compoundAssign(indexSetIter.getIteratorVar(), ir::Cast::make(Eq::make(indexSetIter.getCoordVar(), setMatch), indexSetIter.getIteratorVar().type())),
@@ -1765,6 +1878,9 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt,
                                   vector<Iterator> inserters,
                                   vector<Iterator> appenders,
                                   const set<Access>& reducedAccesses) {
+
+  // std::cout << "lowering a forall body----------------------------------------------------\n";
+  
   Stmt initVals = resizeAndInitValues(appenders, reducedAccesses);
 
   // Inserter positions
@@ -1780,6 +1896,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt,
 
   // Code of loop body statement
   Stmt body = lower(stmt);
+  // std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n";
 
   // Code to append coordinates
   Stmt appendCoords = appendCoordinate(appenders, coordinate);
@@ -1797,10 +1914,12 @@ Expr LowererImplImperative::getTemporarySize(Where where) {
   TensorVar temporary = where.getTemporary();
   Dimension temporarySize = temporary.getType().getShape().getDimension(0);
   Access temporaryAccess = getResultAccesses(where.getProducer()).first[0];
+  std::cout << "temporaryAccess: " << temporaryAccess;
   std::vector<IndexVar> indexVars = temporaryAccess.getIndexVars();
 
   if(util::all(indexVars, [&](const IndexVar& var) { return provGraph.isUnderived(var);})) {
     // All index vars underived then use tensor properties to get tensor size
+    std::cout << "All index vars underived then use tensor properties to get tensor size\n";
     taco_iassert(util::contains(dimensions, indexVars[0])) << "Missing " << indexVars[0];
     ir::Expr size = dimensions.at(indexVars[0]);
     for(size_t i = 1; i < indexVars.size(); ++i) {
@@ -1811,16 +1930,19 @@ Expr LowererImplImperative::getTemporarySize(Where where) {
   }
 
   if (temporarySize.isFixed()) {
+    std::cout << "temporary is fixed\n" ;
     return ir::Literal::make(temporarySize.getSize());
   }
 
   if (temporarySize.isIndexVarSized()) {
+    std::cout << "temporary is index var sized\n";
     IndexVar var = temporarySize.getIndexVarSize();
     vector<Expr> bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds,
                                                      indexVarToExprMap, iterators);
     return ir::Sub::make(bounds[1], bounds[0]);
   }
 
+  std::cout << "should this be an error\n"; 
   taco_ierror; // TODO
   return Expr();
 }
@@ -1889,6 +2011,7 @@ vector<Stmt> LowererImplImperative::codeToInitializeDenseAcceleratorArrays(Where
     Expr p = Var::make("p" + temporary.getName(), Int());
     Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType));
 
+    // std::cout << "vector<Stmt> LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl;
     Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial);
     Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop);
     return {inits, freeTemps};
@@ -2090,8 +2213,10 @@ vector<Stmt> LowererImplImperative::codeToInitializeTemporaryParallel(Where wher
 
 vector<Stmt> LowererImplImperative::codeToInitializeTemporary(Where where) {
   TensorVar temporary = where.getTemporary();
+  cout << "temporary found: " << temporary << std::endl;
 
   const bool accelerateDense = canAccelerateDenseTemp(where).first;
+  cout << "accelerateDense: " << accelerateDense << std::endl;
 
   Stmt freeTemporary = Stmt();
   Stmt initializeTemporary = Stmt();
@@ -2102,6 +2227,7 @@ vector<Stmt> LowererImplImperative::codeToInitializeTemporary(Where where) {
     initializeTemporary = Block::make(initializeTemporary, initTempSet);
     tempToBitGuard[temporary] = tempSet;
   } else {
+    cout << "higher order temporary found: " << temporary << std::endl;
     // TODO: Need to support keeping track of initialized elements for
     //       temporaries that don't have sparse accelerator
     taco_iassert(!util::contains(guardedTemps, temporary) || accelerateDense);
@@ -2119,19 +2245,32 @@ vector<Stmt> LowererImplImperative::codeToInitializeTemporary(Where where) {
         needComputeValues(where, temporary)) {
       values = ir::Var::make(temporary.getName(),
                              temporary.getType().getDataType(), true, false);
-      taco_iassert(temporary.getType().getOrder() == 1)
-          << " Temporary order was " << temporary.getType().getOrder();  // TODO
+      std::cout << "values: " << values << std::endl;
+      std::cout << "dataType: " << values.type() << std::endl;
+      
+      // taco_iassert(temporary.getType().getOrder() == 1)
+      //     << " Temporary order was " << temporary.getType().getOrder();  // TODO
+
       Expr size = getTemporarySize(where);
+      std::cout << "temporarySize: " << size << std::endl;
+
 
       // no decl needed for shared memory
       Stmt decl = Stmt();
       if ((isa<Forall>(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) {
         decl = VarDecl::make(values, ir::Literal::make(0));
+        std::cout << "decl statement: " << decl << std::endl;
       }
       Stmt allocate = Allocate::make(values, size);
+      std::cout << "allocate stmt: " << allocate << std::endl;
 
       freeTemporary = Block::make(freeTemporary, Free::make(values));
+      std::cout << "free temp: " << freeTemporary << std::endl;
       initializeTemporary = Block::make(decl, initializeTemporary, allocate);
+      std::cout << "initializeTemporary: " << initializeTemporary << std::endl;
+
+      // taco_iassert(temporary.getType().getOrder() == 1)
+      //     << " Temporary order was " << temporary.getType().getOrder();  // TODO
     }
 
     /// Make a struct object that lowerAssignment and lowerAccess can read
@@ -2144,6 +2283,7 @@ vector<Stmt> LowererImplImperative::codeToInitializeTemporary(Where where) {
 }
 
 Stmt LowererImplImperative::lowerWhere(Where where) {
+  // std::cout << "\n--------------------------------------- lowering where statement: " << where << "\n\n\n";
   TensorVar temporary = where.getTemporary();
   bool accelerateDenseWorkSpace, sortAccelerator;
   std::tie(accelerateDenseWorkSpace, sortAccelerator) =
@@ -2180,6 +2320,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) {
         })
   );
 
+  // std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl;
   Stmt consumer = lower(where.getConsumer());
   if (accelerateDenseWorkSpace && sortAccelerator) {
     // We need to sort the indices array
@@ -2203,11 +2344,13 @@ Stmt LowererImplImperative::lowerWhere(Where where) {
                                 true, false);
     Expr size = getTemporarySize(where);
     Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType()));
+    // std::cout << "Stmt LowererImplImperative::lowerWhere\n";
     Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial);
     initializeTemporary = Block::make(initializeTemporary, loopInit);
   }
 
   whereConsumers.push_back(consumer);
+  // std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl;
   whereTemps.push_back(where.getTemporary());
   captureNextLocatePos = true;
 
@@ -2218,6 +2361,9 @@ Stmt LowererImplImperative::lowerWhere(Where where) {
     restoreAtomicDepth = true;
   }
 
+  whereTempsWithLoopDepth.insert(std::pair<TensorVar, int>(where.getTemporary(), loopDepth));
+
+  // std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl;
   Stmt producer = lower(where.getProducer());
   if (accelerateDenseWorkSpace) {
     const Expr indexListSizeExpr = tempToIndexListSize.at(temporary);
@@ -2225,6 +2371,8 @@ Stmt LowererImplImperative::lowerWhere(Where where) {
     initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary);
   }
 
+  whereTempsWithLoopDepth.erase(where.getTemporary());
+
   if (restoreAtomicDepth) {
     markAssignsAtomicDepth++;
   }
@@ -2334,6 +2482,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) {
                   resultModeOrdering[iter.getMode().getLevel() - 1]);
               Expr pos = iter.getPosVar();
               Stmt initPos = VarDecl::make(pos, iter.locate(locateCoords)[0]);
+              // std::cout << "Stmt LowererImplImperative::lowerAssemble\n";
               insertEdgeLoop = For::make(coords.back(), 0, dim, 1,
                                          Block::make(initPos, insertEdgeLoop));
             } else {
@@ -2371,7 +2520,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) {
         initAssembleStmts.push_back(initValues);
       }
     } else if (zeroInit) {
-      initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize));
+      initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); // init values
     }
   }
   Stmt initAssemble = Block::make(initAssembleStmts);
@@ -2415,6 +2564,7 @@ Stmt LowererImplImperative::lowerMulti(Multi multi) {
 }
 
 Stmt LowererImplImperative::lowerSuchThat(SuchThat suchThat) {
+  // Lower only the wrapped statement and return it inside a Block.
   Stmt stmt = lower(suchThat.getStmt());
   return Block::make(stmt);
 }
@@ -2528,6 +2678,7 @@ Expr LowererImplImperative::lowerSqrt(Sqrt sqrt) {
 
 
 Expr LowererImplImperative::lowerCast(Cast cast) {
+  // Lower the operand, then wrap it in an IR cast to the requested datatype.
   return ir::Cast::make(lower(cast.getA()), cast.getDataType());
 }
 
@@ -2744,7 +2895,7 @@ Stmt LowererImplImperative::initResultArrays(vector<Access> writes,
       // iteration of all the iterators is not full. We can check this by seeing if we can recover a
       // full iterator from our set of iterators.
       Expr size = generateAssembleCode() ? getCapacityVar(tensor) : parentSize;
-      result.push_back(zeroInitValues(tensor, 0, size));
+      result.push_back(zeroInitValues(tensor, 0, size)); // init values
     }
   }
   return result.empty() ? Stmt() : Block::blanks(result);
@@ -2895,7 +3046,7 @@ Stmt LowererImplImperative::initResultArrays(IndexVar var, vector<Access> writes
             util::contains(reducedAccesses, write)) {
           // Zero-initialize values array if might not assign to every element
           // in values array during compute
-          result.push_back(zeroInitValues(tensor, resultParentPos, stride));
+          result.push_back(zeroInitValues(tensor, resultParentPos, stride)); // init values
         }
       }
     }
@@ -2942,6 +3093,7 @@ Stmt LowererImplImperative::resizeAndInitValues(const std::vector<Iterator>& app
 
 
 Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) {
+  // std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n";
   Expr lower = simplify(ir::Mul::make(begin, size));
   Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size));
   Expr p = Var::make("p" + util::toString(tensor), Int());
@@ -2954,6 +3106,11 @@ Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) {
     return ir::VarDecl::make(ir::Var::make("status", Int()),
                                     ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), ir::Mul::make(ir::Sub::make(upper, lower), ir::Literal::make(values.type().getNumBytes()))}, Int()));
   }
+  // std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n";
+  // If generating ISPC code, we keep the LoopKind as Init so that we can initialize it if tasks are used.
+  if (should_use_ISPC_codegen()) {
+    return For::make(p, lower, upper, 1, zeroInit, LoopKind::Init);
+  }
   return For::make(p, lower, upper, 1, zeroInit, parallel);
 }
 
@@ -3235,6 +3392,7 @@ Stmt LowererImplImperative::codeToIncIteratorVars(Expr coordinate, IndexVar coor
   for (auto& iterator : levelIterators) {
     Expr ivar = iterator.getIteratorVar();
     if (iterator.isUnique()) {
+      std::cout << "casting \n";
       Expr increment = iterator.isFull()
                      ? 1
                      : ir::Cast::make(Eq::make(iterator.getCoordVar(),
@@ -3505,6 +3663,7 @@ Expr LowererImplImperative::generateAssembleGuard(IndexExpr expr) {
     }
 
     void visit(const CastNode* node) {
+      // The cast itself is dropped here: only its operand is lowered.
       expr = lower(node->a);
     }
 
diff --git a/src/lower/tensor_path.h b/src/lower/tensor_path.h
index 4f5dc49af..da52fb782 100644
--- a/src/lower/tensor_path.h
+++ b/src/lower/tensor_path.h
@@ -2,6 +2,7 @@
 #define TACO_TENSOR_PATH_H
 
 #include <memory>
+#include <ostream>
 #include <vector>
 
 #include "taco/util/comparable.h"
@@ -47,14 +48,13 @@ class TensorPath : public util::Comparable<TensorPath> {
 
   friend bool operator==(const TensorPath&, const TensorPath&);
   friend bool operator<(const TensorPath&, const TensorPath&);
+  friend std::ostream& operator<<(std::ostream&, const TensorPath&);
 
 private:
   struct Content;
   std::shared_ptr<Content> content;
 };
 
-std::ostream& operator<<(std::ostream&, const TensorPath&);
-
 
 /// A step along a tensor path.
 class TensorPathStep : public util::Comparable<TensorPathStep> {
diff --git a/src/tensor.cpp b/src/tensor.cpp
index fab437ff1..176856196 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -10,6 +10,7 @@
 #include <utility>
 #include <mutex>
 
+#include "../test/util.h"
 #include "taco/cuda.h"
 #include "taco/format.h"
 #include "taco/taco_tensor_t.h"
@@ -278,6 +279,7 @@ static size_t unpackTensorData(const taco_tensor_t& tensorData,
 
 /// Pack coordinates into a data structure given by the tensor format.
 void TensorBase::pack() {
+  std::cout << "TensorBase::Pack() method\n";
   if (!needsPack()) {
     return;
   }
@@ -346,6 +348,7 @@ void TensorBase::pack() {
   taco_iassert((content->coordinateBufferUsed % content->coordinateSize) == 0);
   const size_t numCoordinates = content->coordinateBufferUsed / content->coordinateSize;
 
+  std::cout << "call helperFuncs\n";
   const auto helperFuncs = getHelperFunctions(getFormat(), getComponentType(),
                                               dimensions);
 
@@ -619,10 +622,12 @@ void TensorBase::compile() {
   IndexStmt stmt = makeConcreteNotation(makeReductionNotation(assignment));
   stmt = reorderLoopsTopologically(stmt);
   stmt = insertTemporaries(stmt);
+  std::cout << "calling parallelizeOuterLoop(stmt)\n";
   stmt = parallelizeOuterLoop(stmt);
   compile(stmt, content->assembleWhileCompute);
 }
 void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) {
+  std::cout << "TensorBase::compile\n";
   if (!needsCompile()) {
     return;
   }
@@ -802,6 +807,63 @@ void TensorBase::assemble() {
   }
 }
 
+void TensorBase::compute(std::ofstream& statfile, std::string& sofile) {
+  taco_uassert(!needsCompile()) << error::compute_without_compile;
+  // Benchmarking variant of compute(): there is no needsCompute() early-out
+  // here, so the kernel call below is not skipped when needsCompute() is
+  // false. NOTE(review): `statfile` is not used anywhere in this body.
+  setNeedsCompute(false);
+  // Sync operand tensors if needed.
+  auto operands = getTensors(getAssignment().getRhs());
+  for (auto& operand : operands) {
+    // Both calls are made on every operand before the kernel is launched.
+    operand.second.syncValues();
+    operand.second.removeDependentTensor(*this);
+  }
+
+  auto arguments = packArguments(*this);
+
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", sofile, arguments.data()), 
+      "\nkernel execution time: ", timevalue);
+  // NOTE(review): `time` is presumably consumed by TOOL_BENCHMARK_TIMER2 -- confirm.
+
+  if (content->assembleWhileCompute) {
+    setNeedsAssemble(false);
+    taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]);
+    content->valuesSize = unpackTensorData(*tensorData, *this);
+  }
+}
+
+void TensorBase::compute(std::ofstream& statfile) {
+  taco_uassert(!needsCompile()) << error::compute_without_compile;
+  // Benchmarking variant of compute(): there is no needsCompute() early-out
+  // here, so the kernel call below is not skipped when needsCompute() is
+  // false. NOTE(review): `statfile` is not used anywhere in this body.
+  setNeedsCompute(false);
+  // Sync operand tensors if needed.
+  auto operands = getTensors(getAssignment().getRhs());
+  for (auto& operand : operands) {
+    operand.second.syncValues();
+    operand.second.removeDependentTensor(*this);
+  }
+
+  auto arguments = packArguments(*this);
+
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", arguments.data()), 
+      "\nkernel execution time: ", timevalue);
+  // NOTE(review): `time` is presumably consumed by TOOL_BENCHMARK_TIMER2 -- confirm.
+
+  if (content->assembleWhileCompute) {
+    setNeedsAssemble(false);
+    taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]);
+    content->valuesSize = unpackTensorData(*tensorData, *this);
+  }
+}
+
 void TensorBase::compute() {
   taco_uassert(!needsCompile()) << error::compute_without_compile;
   if (!needsCompute()) {
@@ -816,7 +878,9 @@ void TensorBase::compute() {
   }
 
   auto arguments = packArguments(*this);
+  std::cout << "running the compute function from the shared library\n";
   this->content->module->callFuncPacked("compute", arguments.data());
+  std::cout << "compute function executed\n";
 
   if (content->assembleWhileCompute) {
     setNeedsAssemble(false);
@@ -934,6 +998,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype,
   };
   const auto dims = util::map(dimensions, getDim);
 
+  set_ISPC_code_stream_enabled(false);
   if (format.getOrder() > 0) {
     const Format bufferFormat = COO(format.getOrder(), false, true, false,
                                     format.getModeOrdering());
@@ -951,6 +1016,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype,
     }
 
     // Lower packing and iterator code.
+    std::cout << "1 Lower packing and iterator code\n";
     helperModule->addFunction(lower(packStmt, "pack", true, true));
     helperModule->addFunction(lower(iterateStmt, "iterate", false, true));
   } else {
@@ -964,12 +1030,14 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype,
     IndexVar indexVar;
     IndexStmt assignment = (packedScalar() = bufferVector(indexVar));
     IndexStmt packStmt= makeConcreteNotation(makeReductionNotation(assignment));
+    std::cout << "2 Lower packing and iterator code\n";
     helperModule->addFunction(lower(packStmt, "pack", true, true));
 
     // Define and lower iterator code.
     IndexStmt iterateStmt = Yield({}, packedScalar());
     helperModule->addFunction(lower(iterateStmt, "iterate", false, true));
   }
+  std::cout << "Compiling the helperModule\n";
   helperModule->compile();
 
   helperFunctionsMutex.lock();
diff --git a/taco-uml.wsd b/taco-uml.wsd
new file mode 100644
index 000000000..4b8e39802
--- /dev/null
+++ b/taco-uml.wsd
@@ -0,0 +1,411 @@
+@startuml taco
+scale 1
+
+
+class IntrusivePtr {
+    +T *ptr
+}
+class Uncopyable {}
+
+class IRNode {
+    +virtual void accept(IRVisitorStrict *v) const = 0
+    +virtual IRNodeType type_info() const = 0;
+}
+
+class BaseStmtNode {}
+class BaseExprNode {
+    +Datatype type
+}
+
+class StmtNode {
+    +void accept(IRVisitorStrict *v) const
+}
+class ExprNode {
+    +void accept(IRVisitorStrict *v) const
+}
+
+Uncopyable <|-- IRNode
+IRNode <|-- BaseStmtNode
+IRNode <|-- BaseExprNode
+BaseStmtNode <|-- StmtNode
+BaseExprNode <|-- ExprNode
+
+class IRHandle {
+    +void accept(IRVisitorStrict *v) const
+}
+class Expr {}
+class Stmt {}
+
+IntrusivePtr <|-- IRHandle
+IRHandle <|-- Expr
+IRHandle <|-- Stmt
+
+IRHandle "1" *-- "1" IRNode : contains
+
+
+
+' this class is abstract but plantuml version does not support interface keyword
+interface IRVisitorStrict {
+    +virtual void visit(const IRNode*) const = 0
+}
+
+/' 
+IRVisitor is not an interface or abstract because it 
+has no pure virtual methods
+'/
+class IRVisitor {
+    +virtual void visit(const IRNode*)
+}
+
+class IRRewriter {
+    ' protected fields and methods
+    #Expr expr 
+    #Stmt stmt
+
+    #virtual void visit(const ExprNode* op)
+    #virtual void visit(const StmtNode* op)
+
+    ' public fields and methods
+    +Expr rewrite(Expr)
+    +Stmt rewrite(Stmt)
+}
+class IRPrinter {
+    #std::ostream &stream
+    #std::ostream &stream2
+    #int indent
+    #bool color
+    #bool simplify
+    #enum Precedence
+    #Precedence parentPrecedence = BOTTOM
+    #NameGenerator varNameGenerator
+    #scopedMap<Expr, std::string> varNames
+
+    #void doIndent()
+    #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)
+    #void fewMoreMethods()
+    
+    #virtual void visit(const ExprNode*)
+    #virtual void visit(const StmtNode*)
+
+    +setColor(bool color)
+    +print(Stmt)
+}
+class IRVerifier {}
+
+IRVisitorStrict <|-- IRVisitor
+IRVisitorStrict <|-- IRPrinter
+IRVisitorStrict <|-- IRRewriter
+IRVisitor <|-- IRVerifier
+
+' Inheritance from IRRewriter
+' simplifier for ir::Expr
+class ExpressionSimplifier {}
+IRRewriter <|-- ExpressionSimplifier
+
+' simplifiers for ir::Stmt
+class RemoveRedundantStatements {}
+class RemoveRedundantLoops {}
+class RemoveDuplicateBody {}
+
+IRRewriter <|-- RemoveRedundantStatements
+IRRewriter <|-- RemoveRedundantLoops
+IRRewriter <|-- RemoveDuplicateBody
+
+
+' Inheritance from IRPrinter
+class CodeGen {}
+class CodeGen_C {}
+class CodeGen_CUDA {}
+class CodeGen_ISPC {
+    -class FindVars
+}
+
+class FindVars {}
+
+IRPrinter <|-- CodeGen
+CodeGen <|-- CodeGen_C
+CodeGen <|-- CodeGen_ISPC
+CodeGen <|-- CodeGen_CUDA
+
+IRVisitor <|-- FindVars
+CodeGen_ISPC +-- FindVars
+
+class Manageable {}
+class IndexStmtNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+class IndexExprNode {
+    -virtual void accept(IndexExprVisitorStrict*) const = 0
+}
+
+
+Manageable <|-- IndexStmtNode
+Uncopyable <|-- IndexStmtNode
+Manageable <|-- IndexExprNode
+Uncopyable <|-- IndexExprNode
+
+class IndexStmt {}
+class IndexExpr {}
+
+IntrusivePtr <|-- IndexStmt
+IndexStmt "1" *-- "1" IndexStmtNode
+IntrusivePtr <|-- IndexExpr
+IndexExpr "1" *-- "1" IndexExprNode
+
+
+abstract class IndexExprVisitorStrict {
+    +void visit(const IndexExpr&)
+    +virtual void visit(const AccessNode*) = 0
+    +virtual void visit(const LiteralNode*) = 0
+    +virtual void visit(const NegNode*) = 0
+    +virtual void visit(const AddNode*) = 0
+    +virtual void visit(const SubNode*) = 0
+    +virtual void visit(const MulNode*) = 0
+    +virtual void visit(const DivNode*) = 0
+    +virtual void visit(const SqrtNode*) = 0
+    +virtual void visit(const CastNode*) = 0
+    +virtual void visit(const CallIntrinsicNode*) = 0
+    +virtual void visit(const ReductionNode*) = 0
+}
+abstract class IndexStmtVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AssignmentNode*) = 0
+    +virtual void visit(const YieldNode*) = 0
+    +virtual void visit(const ForallNode*) = 0
+    +virtual void visit(const WhereNode*) = 0
+    +virtual void visit(const SequenceNode*) = 0
+    +virtual void visit(const AssembleNode*) = 0
+    +virtual void visit(const MultiNode*) = 0
+    +virtual void visit(const SuchThatNode*) = 0
+}
+
+abstract class IndexNotationVisitorStrict {}
+class IndexNotationPrinter {
+    +void print(const IndexExpr& expr)
+    +void print(const IndexStmt& expr)
+
+    ' Index Expressions visit()
+    +void visit(const AccessNode* node)
+    +void visit(const LiteralNode* node)
+    + void visit(const NegNode* node)
+    + void visit(const AddNode* node)
+    + void visit(const SubNode* node)
+    + void visit(const MulNode* node)
+    + void visit(const DivNode* node)
+    + void visit(const SqrtNode* node)
+    + void visit(const CastNode* node)
+    + void visit(const CallIntrinsicNode* node)
+    + void visit(const UnaryExprNode* node)
+    + void visit(const BinaryExprNode* node)
+    + void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    + void visit(const AssignmentNode* node)
+    + void visit(const YieldNode* node)
+    + void visit(const ForallNode* node)
+    + void visit(const WhereNode* node)
+    + void visit(const SequenceNode* node)
+    + void visit(const AssembleNode* node)
+    + void visit(const MultiNode* node)
+    + void visit(const SuchThatNode* node)
+}
+class IndexNotationVisitor {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+class Matcher {
+
+}
+
+abstract class IndexExprRewriterStrict {
+    +IndexExpr rewrite(IndexExpr)
+
+    #IndexExpr expr
+
+    #virtual void visit(const AccessNode* op) = 0
+    #virtual void visit(const LiteralNode* op) = 0
+    #virtual void visit(const NegNode* op) = 0
+    #virtual void visit(const SqrtNode* op) = 0
+    #virtual void visit(const AddNode* op) = 0
+    #virtual void visit(const SubNode* op) = 0
+    #virtual void visit(const MulNode* op) = 0
+    #virtual void visit(const DivNode* op) = 0
+    #virtual void visit(const CastNode* op) = 0
+    #virtual void visit(const CallIntrinsicNode* op) = 0
+    #virtual void visit(const ReductionNode* op) = 0
+}
+abstract class IndexStmtRewriterStrict {
+    +IndexStmt rewrite(IndexStmt)
+
+    #IndexStmt stmt
+
+    #virtual void visit(const AssignmentNode* op) = 0
+    #virtual void visit(const YieldNode* op) = 0
+    #virtual void visit(const ForallNode* op) = 0
+    #virtual void visit(const WhereNode* op) = 0
+    #virtual void visit(const SequenceNode* op) = 0
+    #virtual void visit(const AssembleNode* op) = 0
+    #virtual void visit(const MultiNode* op) = 0
+    #virtual void visit(const SuchThatNode* op) = 0
+}
+abstract class IndexNotationRewriterStrict {}
+class IndexNotationRewriter {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+
+
+IndexExprVisitorStrict <|-- IndexNotationVisitorStrict
+IndexStmtVisitorStrict <|-- IndexNotationVisitorStrict
+IndexNotationVisitorStrict <|-- IndexNotationVisitor
+IndexNotationVisitorStrict <|-- IndexNotationPrinter
+IndexNotationVisitor <|-- Matcher
+
+IndexExprVisitorStrict <|-- IndexExprRewriterStrict
+IndexStmtVisitorStrict <|-- IndexStmtRewriterStrict
+IndexExprRewriterStrict <|-- IndexNotationRewriterStrict
+IndexStmtRewriterStrict <|-- IndexNotationRewriterStrict
+
+IndexNotationRewriterStrict <|-- IndexNotationRewriter
+
+' - private
+' # protected
+' ~ package private
+' + public
+
+' {static}
+' {abstract} virtual methods
+
+' lowering part -- conversion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt
+class Lowerer {
+    +std::shared_ptr<LowererImpl> impl;
+}
+abstract class LowererImpl {
+    ' protected fields and methods
+    #class Visitor;
+    #friend class Visitor;
+    #std::shared_ptr<Visitor> visitor;
+
+    #virtual ir::Stmt lower(IndexStmt stmt);
+    #virtual ir::Expr lower(IndexExpr expr);
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr) = 0;
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;
+
+    ' public fields and methods
+    +virtual ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack) = 0;
+}
+
+class LowererImplImperative {
+    ' private fields and methods
+    -class Visitor
+    -friend class Visitor
+    -std::shared_ptr<Visitor> visitor
+    -bool assemble
+    -bool compute
+    -vars a_bunch_of_other_fields
+
+    ' protected fields and methods
+    #virtual ir::Expr lowerExpr(IndexExpr expr);
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt);
+
+    ' public fields and methods
+    +ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack)
+
+}
+note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n  return visitor->lower(stmt);\n}
+
+Uncopyable <|-- LowererImpl
+Lowerer "1" *-- "1" LowererImpl : contains
+
+
+' visitor that does the lowering
+class Visitor {
+    ' private fields and methods
+    -LowererImpl* impl
+    -Expr expr
+    -Stmt stmt
+
+    -void visit(const AssignmentNode* node)
+    -void visit(const YieldNode* node)
+    -void visit(const ForallNode* node) 
+    -void visit(const WhereNode* node) 
+    -void visit(const MultiNode* node) 
+    -void visit(const SuchThatNode* node) 
+    -void visit(const SequenceNode* node) 
+    -void visit(const AssembleNode* node) 
+    -void visit(const AccessNode* node) 
+    -void visit(const LiteralNode* node) 
+    -void visit(const NegNode* node) 
+    -void visit(const AddNode* node) 
+    -void visit(const SubNode* node) 
+    -void visit(const MulNode* node) 
+    -void visit(const DivNode* node) 
+    -void visit(const SqrtNode* node) 
+    -void visit(const CastNode* node) 
+    -void visit(const CallIntrinsicNode* node) 
+    -void visit(const ReductionNode* node) 
+
+    ' public fields and methods
+    +Visitor(LowererImplImperative* impl)
+    +Stmt lower(IndexStmt stmt)
+    +Expr lower(IndexExpr expr)
+}
+
+note bottom of Visitor:   Stmt lower(IndexStmt stmt) {\n  this->stmt = Stmt();\n  impl->accessibleIterators.scope();\n  IndexStmtVisitorStrict::visit(stmt);\n  impl->accessibleIterators.unscope();\n  return this->stmt;\n}
+
+IndexNotationVisitorStrict <|-- Visitor
+LowererImpl "1" +-- "1" Visitor : contains
+Visitor "1" *-- "1" LowererImpl : contains
+
+LowererImpl <|-- LowererImplImperative
+LowererImplImperative "1" +-- "1" Visitor : contains
+Visitor "1" *-- "1" LowererImplImperative : contains
+
+@enduml
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 02464ce26..f4d848de0 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -11,6 +11,7 @@ add_executable(taco-test ${TEST_SOURCES} ${TEST_HEADERS})
 target_link_libraries(taco-test taco-gtest)
 target_link_libraries(taco-test pthread)
 target_link_libraries(taco-test taco)
+target_link_libraries(taco-test papi)
 
 if(${CMAKE_VERSION} VERSION_LESS "3.9.0")
   add_test(NAME taco-test COMMAND taco-test)
diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.c b/test/kernels/mttkrp_gemm/mttkrp_ryan.c
new file mode 100644
index 000000000..9d0536b8c
--- /dev/null
+++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.c
@@ -0,0 +1,177 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial stand-in: always reports thread 0
+int omp_get_max_threads() { return 1; } // serial stand-in: always reports a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over the ints a and b point to: <0, 0 or >0
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); // a - b could overflow
+}
+// Binary search in array[arrayStart..arrayEnd] (assumed sorted ascending). Returns a
+// position holding target if one is probed; otherwise the first position whose entry
+// exceeds target (arrayEnd when every probed entry is smaller; that entry is never read).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // array[lo] < target
+  int hi = arrayEnd;   // treated as >= target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow for large indices, unlike (hi + lo) / 2.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return hi;
+}
+// Binary search in array[arrayStart..arrayEnd] (assumed sorted ascending). Returns a
+// position holding target if one is probed; otherwise the last position whose entry is
+// below target (arrayStart when every probed entry is larger; that entry is never read).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as <= target
+  int hi = arrayEnd;   // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow for large indices, unlike (hi + lo) / 2.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return lo;
+}
+// Allocates a taco_tensor_t for `order` modes and copies the per-mode metadata into it.
+// Each mode gets an array of index-buffer slots (one for dense, two for sparse); the
+// buffers themselves and vals are left for the caller. malloc results are not checked.
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));
+  t->vals          = NULL; // was left indeterminate
+  t->vals_size     = 0;    // was left indeterminate
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // Any non-sparse mode gets one slot, so indices[i] is always set before it is freed.
+    const size_t slots = (mode_types[i] == taco_mode_sparse) ? 2 : 1;
+    t->indices[i] = (uint8_t **) malloc(slots * sizeof(uint8_t *));
+  }
+  return t;
+}
+// Frees the per-mode slot arrays, the metadata arrays and the struct itself.
+// t->vals and whatever buffers the slots point to are not touched.
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  for (int32_t mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+// Allocates the dense value buffer of the result A1845 (dimensions[0] x dimensions[1]
+// doubles) and stores it in A1845->vals; the other operands are unused. Always returns 0.
+int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) {
+  const size_t A18451_dimension = (size_t)(A1845->dimensions[0]);
+  const size_t A18452_dimension = (size_t)(A1845->dimensions[1]);
+  // Size computed in size_t: the former int product could overflow before the multiply by sizeof.
+  double* A1845_vals = (double*)malloc(sizeof(double) * A18451_dimension * A18452_dimension);
+  A1845->vals = (uint8_t*)A1845_vals; // the previous vals pointer is overwritten, not freed
+  return 0;
+}
+
+int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { // fills dense A1845 from matmul_5_5_5 (pos/crd indexed) and dense A1475, A1416; returns 0
+  int A18451_dimension = (int)(A1845->dimensions[0]);
+  int A18452_dimension = (int)(A1845->dimensions[1]);
+  double* restrict A1845_vals = (double*)(A1845->vals);
+  int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]);
+  int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]);
+  int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]);
+  int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]);
+  int* restrict matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]);
+  int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]);
+  double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals);
+  int A14751_dimension = (int)(A1475->dimensions[0]);
+  int A14752_dimension = (int)(A1475->dimensions[1]);
+  double* restrict A1475_vals = (double*)(A1475->vals);
+  int A14161_dimension = (int)(A1416->dimensions[0]);
+  int A14162_dimension = (int)(A1416->dimensions[1]);
+  double* restrict A1416_vals = (double*)(A1416->vals);
+  // Zero the whole output first: the main loop only writes rows i that are stored in matmul_5_5_5.
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1845 = 0; pA1845 < (A18451_dimension * A18452_dimension); pA1845++) {
+    A1845_vals[pA1845] = 0.0;
+  }
+  // A1845(i,j) = sum over k,l of matmul_5_5_5(i,k,l) * A1475(l,j) * A1416(k,j); one stored i per outer iteration.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) {
+    int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5];
+    for (int32_t i1545 = 0; i1545 < A14162_dimension; i1545++) {
+      int32_t i1545A1845 = i1542 * A18452_dimension + i1545;
+      double ti1543A1845_val = 0.0;
+      for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) {
+        int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5];
+        int32_t i1545A1416 = i1543 * A14162_dimension + i1545;
+        for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) {
+          int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5];
+          int32_t i1545A1475 = i1544 * A14752_dimension + i1545;
+          ti1543A1845_val += (matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416];
+        }
+      }
+      A1845_vals[i1545A1845] = ti1543A1845_val;
+    }
+  }
+  return 0;
+}
+#include "mttkrp_ryan.h" // sibling header with the matching 4-argument prototypes; was an absolute path into one developer's home directory
+int _shim_assemble(void** parameterPack) { // unpacks four tensor pointers and forwards them to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // unpacks four tensor pointers and forwards them to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.h b/test/kernels/mttkrp_gemm/mttkrp_ryan.h
new file mode 100644
index 000000000..3d0c06f50
--- /dev/null
+++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial stand-in: always reports thread 0
+int omp_get_max_threads() { return 1; } // serial stand-in: always reports a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over the ints a and b point to: <0, 0 or >0
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); // a - b could overflow
+}
+// Binary search in array[arrayStart..arrayEnd] (assumed sorted ascending). Returns a
+// position holding target if one is probed; otherwise the first position whose entry
+// exceeds target (arrayEnd when every probed entry is smaller; that entry is never read).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // array[lo] < target
+  int hi = arrayEnd;   // treated as >= target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow for large indices, unlike (hi + lo) / 2.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return hi;
+}
+// Binary search in array[arrayStart..arrayEnd] (assumed sorted ascending). Returns a
+// position holding target if one is probed; otherwise the last position whose entry is
+// below target (arrayStart when every probed entry is larger; that entry is never read).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as <= target
+  int hi = arrayEnd;   // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow for large indices, unlike (hi + lo) / 2.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return lo;
+}
+// Allocates a taco_tensor_t for `order` modes and copies the per-mode metadata into it.
+// Each mode gets an array of index-buffer slots (one for dense, two for sparse); the
+// buffers themselves and vals are left for the caller. malloc results are not checked.
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));
+  t->vals          = NULL; // was left indeterminate
+  t->vals_size     = 0;    // was left indeterminate
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // Any non-sparse mode gets one slot, so indices[i] is always set before it is freed.
+    const size_t slots = (mode_types[i] == taco_mode_sparse) ? 2 : 1;
+    t->indices[i] = (uint8_t **) malloc(slots * sizeof(uint8_t *));
+  }
+  return t;
+}
+// Frees the per-mode slot arrays, the metadata arrays and the struct itself.
+// t->vals and whatever buffers the slots point to are not touched.
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  for (int32_t mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416);
+#endif
diff --git a/test/kernels/mttkrp_gemm/taco_default.c b/test/kernels/mttkrp_gemm/taco_default.c
new file mode 100644
index 000000000..edf8cdb16
--- /dev/null
+++ b/test/kernels/mttkrp_gemm/taco_default.c
@@ -0,0 +1,183 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial stand-in: always reports thread 0
+int omp_get_max_threads() { return 1; } // serial stand-in: always reports a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over the ints a and b point to: <0, 0 or >0
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); // a - b could overflow
+}
+// Binary search in array[arrayStart..arrayEnd] (assumed sorted ascending). Returns a
+// position holding target if one is probed; otherwise the first position whose entry
+// exceeds target (arrayEnd when every probed entry is smaller; that entry is never read).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // array[lo] < target
+  int hi = arrayEnd;   // treated as >= target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow for large indices, unlike (hi + lo) / 2.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return hi;
+}
+// Binary search in array[arrayStart..arrayEnd] (assumed sorted ascending). Returns a
+// position holding target if one is probed; otherwise the last position whose entry is
+// below target (arrayStart when every probed entry is larger; that entry is never read).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as <= target
+  int hi = arrayEnd;   // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow for large indices, unlike (hi + lo) / 2.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return lo;
+}
+// Allocates a taco_tensor_t for `order` modes and copies the per-mode metadata into it.
+// Each mode gets an array of index-buffer slots (one for dense, two for sparse); the
+// buffers themselves and vals are left for the caller. malloc results are not checked.
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));
+  t->vals          = NULL; // was left indeterminate
+  t->vals_size     = 0;    // was left indeterminate
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // Any non-sparse mode gets one slot, so indices[i] is always set before it is freed.
+    const size_t slots = (mode_types[i] == taco_mode_sparse) ? 2 : 1;
+    t->indices[i] = (uint8_t **) malloc(slots * sizeof(uint8_t *));
+  }
+  return t;
+}
+// Frees the per-mode slot arrays, the metadata arrays and the struct itself.
+// t->vals and whatever buffers the slots point to are not touched.
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  for (int32_t mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+// Allocates the dense value buffer of the result A1538 (dimensions[0] x dimensions[1]
+// doubles) and stores it in A1538->vals; the other operands are unused. Always returns 0.
+int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) {
+  const size_t A15381_dimension = (size_t)(A1538->dimensions[0]);
+  const size_t A15382_dimension = (size_t)(A1538->dimensions[1]);
+  // Size computed in size_t: the former int product could overflow before the multiply by sizeof.
+  double* A1538_vals = (double*)malloc(sizeof(double) * A15381_dimension * A15382_dimension);
+  A1538->vals = (uint8_t*)A1538_vals; // the previous vals pointer is overwritten, not freed
+  return 0;
+}
+
+int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { // fills dense A1538 from matmul_5_5_5 (pos/crd indexed) and dense A1475, A1416, A1479; returns 0
+  int A15381_dimension = (int)(A1538->dimensions[0]);
+  int A15382_dimension = (int)(A1538->dimensions[1]);
+  double* restrict A1538_vals = (double*)(A1538->vals);
+  int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]);
+  int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]);
+  int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]);
+  int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]);
+  int* restrict matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]);
+  int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]);
+  double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals);
+  int A14751_dimension = (int)(A1475->dimensions[0]);
+  int A14752_dimension = (int)(A1475->dimensions[1]);
+  double* restrict A1475_vals = (double*)(A1475->vals);
+  int A14161_dimension = (int)(A1416->dimensions[0]);
+  int A14162_dimension = (int)(A1416->dimensions[1]);
+  double* restrict A1416_vals = (double*)(A1416->vals);
+  int A14791_dimension = (int)(A1479->dimensions[0]);
+  int A14792_dimension = (int)(A1479->dimensions[1]);
+  double* restrict A1479_vals = (double*)(A1479->vals);
+  // Zero the whole output first: the main loop only writes rows i that are stored in matmul_5_5_5.
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1538 = 0; pA1538 < (A15381_dimension * A15382_dimension); pA1538++) {
+    A1538_vals[pA1538] = 0.0;
+  }
+  // A1538(i,m) = sum over k,l,j of matmul_5_5_5(i,k,l) * A1475(l,j) * A1416(k,j) * A1479(j,m); one stored i per outer iteration.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) {
+    int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5];
+    for (int32_t i1546 = 0; i1546 < A14792_dimension; i1546++) {
+      int32_t i1546A1538 = i1542 * A15382_dimension + i1546;
+      double ti1543A1538_val = 0.0;
+      for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) {
+        int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5];
+        for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) {
+          int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5];
+          for (int32_t i1545 = 0; i1545 < A14791_dimension; i1545++) {
+            int32_t i1545A1475 = i1544 * A14752_dimension + i1545;
+            int32_t i1545A1416 = i1543 * A14162_dimension + i1545;
+            int32_t i1546A1479 = i1545 * A14792_dimension + i1546;
+            ti1543A1538_val += ((matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416]) * A1479_vals[i1546A1479];
+          }
+        }
+      }
+      A1538_vals[i1546A1538] = ti1543A1538_val;
+    }
+  }
+  return 0;
+}
+#include "taco_default.h" // resolved next to this file; was an absolute path into one developer's home directory
+int _shim_assemble(void** parameterPack) { // unpacks five tensor pointers and forwards them to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4]));
+}
+int _shim_compute(void** parameterPack) { // unpacks five tensor pointers and forwards them to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4]));
+}
diff --git a/test/kernels/mttkrp_gemm/taco_default.h b/test/kernels/mttkrp_gemm/taco_default.h
new file mode 100644
index 000000000..54274569e
--- /dev/null
+++ b/test/kernels/mttkrp_gemm/taco_default.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over two ints; signature is qsort/bsearch compatible
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // sign only (-1/0/1): plain subtraction overflowed for far-apart ints
+}
+// Binary search over array[arrayStart..arrayEnd] (inclusive indices; ascending order is
+// assumed by the search logic). Returns arrayStart when array[arrayStart] >= target, the
+// index of an element equal to target when a probe hits one, otherwise the upper end of
+// the final bracket (arrayEnd itself if no probed element was >= target).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // array[lo] < target
+  int hi = arrayEnd;   // treated as the first candidate >= target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return hi;
+}
+// Mirror of taco_binarySearchAfter for array[arrayStart..arrayEnd] (inclusive, ascending
+// order assumed): returns arrayEnd when array[arrayEnd] <= target, the index of an element
+// equal to target when a probe hits one, otherwise the lower end of the final bracket
+// (arrayStart itself if no probed element was <= target).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as the last candidate <= target
+  int hi = arrayEnd;   // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return lo;
+}
+// Allocates a tensor descriptor and deep-copies the `order`-long dimensions, mode_ordering
+// and mode_types arrays into it. Every mode gets an array of index slots: one for
+// taco_mode_dense, two for taco_mode_sparse; any other mode type gets NULL (safe to free).
+// calloc leaves vals, vals_size and all index slots zeroed instead of indeterminate.
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // number of index arrays this storage format needs (0 for an unrecognised type)
+    const size_t slots = (mode_types[i] == taco_mode_dense)  ? 1
+                       : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i] = slots ? (uint8_t **) calloc(slots, sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+// Releases a descriptor built by init_taco_tensor_t: the per-mode index-slot arrays, the
+// copied metadata arrays, and the struct itself. Buffers the slots/vals point at are untouched.
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** slots = t->indices;
+  int mode = 0;
+  while (mode < t->order) free(slots[mode++]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479);
+#endif
diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c
new file mode 100644
index 000000000..a5e031e7a
--- /dev/null
+++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c
@@ -0,0 +1,199 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over two ints; signature is qsort/bsearch compatible
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // sign only (-1/0/1): plain subtraction overflowed for far-apart ints
+}
+// Binary search over array[arrayStart..arrayEnd] (inclusive indices; ascending order is
+// assumed by the search logic). Returns arrayStart when array[arrayStart] >= target, the
+// index of an element equal to target when a probe hits one, otherwise the upper end of
+// the final bracket (arrayEnd itself if no probed element was >= target).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // array[lo] < target
+  int hi = arrayEnd;   // treated as the first candidate >= target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return hi;
+}
+// Mirror of taco_binarySearchAfter for array[arrayStart..arrayEnd] (inclusive, ascending
+// order assumed): returns arrayEnd when array[arrayEnd] <= target, the index of an element
+// equal to target when a probe hits one, otherwise the lower end of the final bracket
+// (arrayStart itself if no probed element was <= target).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as the last candidate <= target
+  int hi = arrayEnd;   // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return lo;
+}
+// Allocates a tensor descriptor and deep-copies the `order`-long dimensions, mode_ordering
+// and mode_types arrays into it. Every mode gets an array of index slots: one for
+// taco_mode_dense, two for taco_mode_sparse; any other mode type gets NULL (safe to free).
+// calloc leaves vals, vals_size and all index slots zeroed instead of indeterminate.
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // number of index arrays this storage format needs (0 for an unrecognised type)
+    const size_t slots = (mode_types[i] == taco_mode_dense)  ? 1
+                       : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i] = slots ? (uint8_t **) calloc(slots, sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+// Releases a descriptor built by init_taco_tensor_t: the per-mode index-slot arrays, the
+// copied metadata arrays, and the struct itself. Buffers the slots/vals point at are untouched.
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** slots = t->indices;
+  int mode = 0;
+  while (mode < t->order) free(slots[mode++]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+// Builds the sparsity structure of output A2531 from cage3's level-1 pos/crd arrays:
+// for each of A1392->dimensions[0] rows, cage3's coordinates are appended to a fresh crd
+// array and the per-row counts are prefix-summed into a fresh pos array. A vals buffer
+// with one (uninitialised) double per stored coordinate is allocated as well. Buffers
+// previously referenced by A2531 are not freed; A1451 is unused. Always returns 0.
+int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) {
+  int* restrict cage32_pos = (int*)(cage3->indices[1][0]);
+  int* restrict cage32_crd = (int*)(cage3->indices[1][1]);
+  int A13921_dimension = (int)(A1392->dimensions[0]);
+
+  // pos holds one offset per row plus a terminator. The generated code hard-coded 6
+  // entries (a 5-row input), overrunning the buffer for any larger row count.
+  int32_t A25312_pos_size = A13921_dimension + 1;
+  int32_t* A25312_pos = (int32_t*)calloc(A25312_pos_size, sizeof(int32_t));
+  int32_t A25312_crd_size = 1048576;
+  int32_t* A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size);
+  int32_t i1468A2531 = 0;
+
+  for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) {
+    int32_t pA25312_begin = i1468A2531;
+
+    for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) {
+      if (A25312_crd_size <= i1468A2531) { // crd is full: double its capacity
+        A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2));
+        A25312_crd_size *= 2;
+      }
+      A25312_crd[i1468A2531] = cage32_crd[i1468cage3];
+      i1468A2531++;
+    }
+
+    A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; // row length for now
+  }
+
+  // inclusive prefix sum turns the per-row lengths into row offsets
+  int32_t csA25312 = 0;
+  for (int32_t pA253120 = 1; pA253120 < A25312_pos_size; pA253120++) {
+    csA25312 += A25312_pos[pA253120];
+    A25312_pos[pA253120] = csA25312;
+  }
+
+  // publish the new buffers on the output tensor
+  A2531->indices[1][0] = (uint8_t*)(A25312_pos);
+  A2531->indices[1][1] = (uint8_t*)(A25312_crd);
+  A2531->vals = (uint8_t*)malloc(sizeof(double) * i1468A2531);
+  return 0;
+}
+
+// SDDMM over cage3's level-1 pos/crd arrays: for every stored entry (row, col) of cage3,
+//   sum_k (cage3_val * A1392[row, k]) * A1451[col, k],  k in [0, A1451->dimensions[1])
+// is written to A2531->vals at the entry's own position in cage3, so A2531's value buffer
+// must follow cage3's layout. Rows are distributed over OpenMP threads. Always returns 0.
+int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) {
+  double* restrict out_vals = (double*)(A2531->vals);
+  int* restrict row_ptr = (int*)(cage3->indices[1][0]);
+  int* restrict col_idx = (int*)(cage3->indices[1][1]);
+  double* restrict sp_vals = (double*)(cage3->vals);
+  double* restrict lhs_vals = (double*)(A1392->vals);
+  double* restrict rhs_vals = (double*)(A1451->vals);
+  int n_rows = (int)(A1392->dimensions[0]);
+  int lhs_stride = (int)(A1392->dimensions[1]);
+  int inner_len = (int)(A1451->dimensions[1]);
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t row = 0; row < n_rows; row++) {
+    for (int32_t p = row_ptr[row]; p < row_ptr[row + 1]; p++) {
+      const int32_t lhs_base = row * lhs_stride;
+      const int32_t rhs_base = col_idx[p] * inner_len;
+      double acc = 0.0;
+      for (int32_t k = 0; k < inner_len; k++) {
+        acc += (sp_vals[p] * lhs_vals[lhs_base + k]) * rhs_vals[rhs_base + k];
+      }
+      out_vals[p] = acc; // indexed by cage3 position, so no counter is shared between rows
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h"
+int _shim_assemble(void** parameterPack) { // type-erased entry point: casts slots 0-3 to taco_tensor_t* and forwards them, in order, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); // assemble()'s status is returned unchanged
+}
+int _shim_compute(void** parameterPack) { // type-erased entry point: casts slots 0-3 to taco_tensor_t* and forwards them, in order, to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); // compute()'s status is returned unchanged
+}
diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h
new file mode 100644
index 000000000..a9d6b760d
--- /dev/null
+++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over two ints; signature is qsort/bsearch compatible
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // sign only (-1/0/1): plain subtraction overflowed for far-apart ints
+}
+// Binary search over array[arrayStart..arrayEnd] (inclusive indices; ascending order is
+// assumed by the search logic). Returns arrayStart when array[arrayStart] >= target, the
+// index of an element equal to target when a probe hits one, otherwise the upper end of
+// the final bracket (arrayEnd itself if no probed element was >= target).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // array[lo] < target
+  int hi = arrayEnd;   // treated as the first candidate >= target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return hi;
+}
+// Mirror of taco_binarySearchAfter for array[arrayStart..arrayEnd] (inclusive, ascending
+// order assumed): returns arrayEnd when array[arrayEnd] <= target, the index of an element
+// equal to target when a probe hits one, otherwise the lower end of the final bracket
+// (arrayStart itself if no probed element was <= target).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as the last candidate <= target
+  int hi = arrayEnd;   // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return lo;
+}
+// Allocates a tensor descriptor and deep-copies the `order`-long dimensions, mode_ordering
+// and mode_types arrays into it. Every mode gets an array of index slots: one for
+// taco_mode_dense, two for taco_mode_sparse; any other mode type gets NULL (safe to free).
+// calloc leaves vals, vals_size and all index slots zeroed instead of indeterminate.
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // number of index arrays this storage format needs (0 for an unrecognised type)
+    const size_t slots = (mode_types[i] == taco_mode_dense)  ? 1
+                       : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i] = slots ? (uint8_t **) calloc(slots, sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+// Releases a descriptor built by init_taco_tensor_t: the per-mode index-slot arrays, the
+// copied metadata arrays, and the struct itself. Buffers the slots/vals point at are untouched.
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** slots = t->indices;
+  int mode = 0;
+  while (mode < t->order) free(slots[mode++]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451);
+#endif
diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so
new file mode 100755
index 000000000..c2c5ca30e
Binary files /dev/null and b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so differ
diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.c b/test/kernels/sddmm_spmm/csr_dense_spmm.c
new file mode 100644
index 000000000..7f710f6c1
--- /dev/null
+++ b/test/kernels/sddmm_spmm/csr_dense_spmm.c
@@ -0,0 +1,190 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over two ints; signature is qsort/bsearch compatible
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // sign only (-1/0/1): plain subtraction overflowed for far-apart ints
+}
+// Binary search over array[arrayStart..arrayEnd] (inclusive indices; ascending order is
+// assumed by the search logic). Returns arrayStart when array[arrayStart] >= target, the
+// index of an element equal to target when a probe hits one, otherwise the upper end of
+// the final bracket (arrayEnd itself if no probed element was >= target).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // array[lo] < target
+  int hi = arrayEnd;   // treated as the first candidate >= target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return hi;
+}
+// Mirror of taco_binarySearchAfter for array[arrayStart..arrayEnd] (inclusive, ascending
+// order assumed): returns arrayEnd when array[arrayEnd] <= target, the index of an element
+// equal to target when a probe hits one, otherwise the lower end of the final bracket
+// (arrayStart itself if no probed element was <= target).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as the last candidate <= target
+  int hi = arrayEnd;   // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return lo;
+}
+// Allocates a tensor descriptor and deep-copies the `order`-long dimensions, mode_ordering
+// and mode_types arrays into it. Every mode gets an array of index slots: one for
+// taco_mode_dense, two for taco_mode_sparse; any other mode type gets NULL (safe to free).
+// calloc leaves vals, vals_size and all index slots zeroed instead of indeterminate.
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // number of index arrays this storage format needs (0 for an unrecognised type)
+    const size_t slots = (mode_types[i] == taco_mode_dense)  ? 1
+                       : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i] = slots ? (uint8_t **) calloc(slots, sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+// Releases a descriptor built by init_taco_tensor_t: the per-mode index-slot arrays, the
+// copied metadata arrays, and the struct itself. Buffers the slots/vals point at are untouched.
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** slots = t->indices;
+  int mode = 0;
+  while (mode < t->order) free(slots[mode++]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+// Allocates output A2535's dense value buffer: dimensions[0] * dimensions[1] uninitialised
+// doubles. The generated code hard-coded 5 rows, too small for taller outputs because
+// compute() zeroes dimensions[0] * dimensions[1] entries. Old vals is not freed. Returns 0.
+int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455) {
+  size_t A25351_dimension = (size_t)(A2535->dimensions[0]);
+  size_t A25352_dimension = (size_t)(A2535->dimensions[1]);
+  A2535->vals = (uint8_t*)malloc(sizeof(double) * A25351_dimension * A25352_dimension);
+  return 0;
+}
+
+int compute(taco_tensor_t *C, taco_tensor_t *A, taco_tensor_t *B) { // C[i,k] = sum over A's stored (i,j) of A_val * B[j,k]; C and B are indexed row-major, A via level-1 pos/crd
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int A1_dimension = (int)(A->dimensions[0]);
+  int* restrict A2_pos = (int*)(A->indices[1][0]); // per-row [start, end) offsets into A2_crd / A_vals
+  int* restrict A2_crd = (int*)(A->indices[1][1]); // column coordinate of each stored entry
+  double* restrict A_vals = (double*)(A->vals);
+  int B1_dimension = (int)(B->dimensions[0]); // unused below
+  int B2_dimension = (int)(B->dimensions[1]);
+  double* restrict B_vals = (double*)(B->vals);
+
+  #pragma omp parallel for schedule(static)
+  for (int32_t pC = 0; pC < (C1_dimension * C2_dimension); pC++) { // clear the whole output before accumulating
+    C_vals[pC] = 0.0;
+  }
+
+  #pragma omp parallel for schedule(dynamic, 1)
+  for (int32_t i0 = 0; i0 < ((A1_dimension + 15) / 16); i0++) { // one parallel iteration per block of 16 rows
+    for (int32_t i1 = 0; i1 < 16; i1++) {
+      int32_t i = i0 * 16 + i1;
+      if (i >= A1_dimension) // last block may be ragged
+        continue;
+
+      for (int32_t jpos0 = A2_pos[i] / 4; jpos0 < ((A2_pos[(i + 1)] + 3) / 4); jpos0++) { // walk row i's entries in 4-aligned groups of positions
+        int32_t jposA = jpos0 * 4; // unused: shadowed by the inner jposA declarations
+        if (jpos0 * 4 < A2_pos[i] || (jpos0 * 4 + 4) + ((jpos0 * 4 + 4) - jpos0 * 4) >= A2_pos[(i + 1)]) { // group may cross row i's bounds (test simplifies to jpos0*4 + 8 >= row end)
+          for (int32_t k = 0; k < B2_dimension; k++) {
+            int32_t kC = i * C2_dimension + k;
+            for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) {
+              int32_t jposA = jpos0 * 4 + jpos1;
+              if (jposA < A2_pos[i] || jposA >= A2_pos[(i + 1)]) // skip positions that belong to a neighbouring row
+                continue;
+
+              int32_t j = A2_crd[jposA];
+              int32_t kB = j * B2_dimension + k;
+              C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB];
+            }
+          }
+        }
+        else { // all 4 positions of the group lie inside row i: no per-position bounds test
+          #pragma clang loop interleave(enable) vectorize(enable)
+          for (int32_t k = 0; k < B2_dimension; k++) {
+            int32_t kC = i * C2_dimension + k;
+            for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) {
+              int32_t jposA = jpos0 * 4 + jpos1;
+              int32_t j = A2_crd[jposA];
+              int32_t kB = j * B2_dimension + k;
+              C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB];
+            }
+          }
+        }
+      }
+    }
+  }
+  return 0; // always reports success
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.h"
+int _shim_assemble(void** parameterPack) { // type-erased entry point: casts slots 0-2 to taco_tensor_t* and forwards them, in order, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); // assemble()'s status is returned unchanged
+}
+int _shim_compute(void** parameterPack) { // type-erased entry point: casts slots 0-2 to taco_tensor_t* and forwards them, in order, to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); // compute()'s status is returned unchanged
+}
diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.h b/test/kernels/sddmm_spmm/csr_dense_spmm.h
new file mode 100644
index 000000000..cf0cf205c
--- /dev/null
+++ b/test/kernels/sddmm_spmm/csr_dense_spmm.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over two ints; signature is qsort/bsearch compatible
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // sign only (-1/0/1): plain subtraction overflowed for far-apart ints
+}
+// Binary search over array[arrayStart..arrayEnd] (inclusive indices; ascending order is
+// assumed by the search logic). Returns arrayStart when array[arrayStart] >= target, the
+// index of an element equal to target when a probe hits one, otherwise the upper end of
+// the final bracket (arrayEnd itself if no probed element was >= target).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // array[lo] < target
+  int hi = arrayEnd;   // treated as the first candidate >= target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return hi;
+}
+// Mirror of taco_binarySearchAfter for array[arrayStart..arrayEnd] (inclusive, ascending
+// order assumed): returns arrayEnd when array[arrayEnd] <= target, the index of an element
+// equal to target when a probe hits one, otherwise the lower end of the final bracket
+// (arrayStart itself if no probed element was <= target).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as the last candidate <= target
+  int hi = arrayEnd;   // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return lo;
+}
+// Allocates a tensor descriptor and deep-copies the `order`-long dimensions, mode_ordering
+// and mode_types arrays into it. Every mode gets an array of index slots: one for
+// taco_mode_dense, two for taco_mode_sparse; any other mode type gets NULL (safe to free).
+// calloc leaves vals, vals_size and all index slots zeroed instead of indeterminate.
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // number of index arrays this storage format needs (0 for an unrecognised type)
+    const size_t slots = (mode_types[i] == taco_mode_dense)  ? 1
+                       : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i] = slots ? (uint8_t **) calloc(slots, sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+// Releases a descriptor built by init_taco_tensor_t: the per-mode index-slot arrays, the
+// copied metadata arrays, and the struct itself. Buffers the slots/vals point at are untouched.
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** slots = t->indices;
+  int mode = 0;
+  while (mode < t->order) free(slots[mode++]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455);
+#endif
diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.so b/test/kernels/sddmm_spmm/csr_dense_spmm.so
new file mode 100755
index 000000000..398362532
Binary files /dev/null and b/test/kernels/sddmm_spmm/csr_dense_spmm.so differ
diff --git a/test/kernels/sddmm_spmm/fused_kernel.c b/test/kernels/sddmm_spmm/fused_kernel.c
new file mode 100644
index 000000000..1572bce5a
--- /dev/null
+++ b/test/kernels/sddmm_spmm/fused_kernel.c
@@ -0,0 +1,183 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over two ints; signature is qsort/bsearch compatible
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // sign only (-1/0/1): plain subtraction overflowed for far-apart ints
+}
+// Binary search over array[arrayStart..arrayEnd] (inclusive indices; ascending order is
+// assumed by the search logic). Returns arrayStart when array[arrayStart] >= target, the
+// index of an element equal to target when a probe hits one, otherwise the upper end of
+// the final bracket (arrayEnd itself if no probed element was >= target).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // array[lo] < target
+  int hi = arrayEnd;   // treated as the first candidate >= target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return hi;
+}
+// Mirror of taco_binarySearchAfter for array[arrayStart..arrayEnd] (inclusive, ascending
+// order assumed): returns arrayEnd when array[arrayEnd] <= target, the index of an element
+// equal to target when a probe hits one, otherwise the lower end of the final bracket
+// (arrayStart itself if no probed element was <= target).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as the last candidate <= target
+  int hi = arrayEnd;   // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 for large indices
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;
+    }
+    if (probe < target) { lo = mid; } else { hi = mid; }
+  }
+  return lo;
+}
+// Allocates a tensor descriptor and deep-copies the `order`-long dimensions, mode_ordering
+// and mode_types arrays into it. Every mode gets an array of index slots: one for
+// taco_mode_dense, two for taco_mode_sparse; any other mode type gets NULL (safe to free).
+// calloc leaves vals, vals_size and all index slots zeroed instead of indeterminate.
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // number of index arrays this storage format needs (0 for an unrecognised type)
+    const size_t slots = (mode_types[i] == taco_mode_dense)  ? 1
+                       : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i] = slots ? (uint8_t **) calloc(slots, sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+// Releases a descriptor built by init_taco_tensor_t: the per-mode index-slot arrays, the
+// copied metadata arrays, and the struct itself. Buffers the slots/vals point at are untouched.
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** slots = t->indices;
+  int mode = 0;
+  while (mode < t->order) free(slots[mode++]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+// Allocates output A1459's dense value buffer: dimensions[0] * dimensions[1] uninitialised
+// doubles. The generated code hard-coded 5 rows, too small for taller outputs because
+// compute() zeroes dimensions[0] * dimensions[1] entries. Old vals is not freed. Returns 0.
+int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) {
+  size_t A14591_dimension = (size_t)(A1459->dimensions[0]);
+  size_t A14592_dimension = (size_t)(A1459->dimensions[1]);
+  A1459->vals = (uint8_t*)malloc(sizeof(double) * A14591_dimension * A14592_dimension);
+  return 0;
+}
+
+// Fused SDDMM + SpMM over B's level-1 pos/crd arrays. The dense, row-major output A1459
+// (dimensions[0] x dimensions[1]) is zeroed; then for each stored entry (row, col) of B
+//   s = sum_k (B_val * A1392[row, k]) * A1451[col, k],  k in [0, A1451->dimensions[1])
+// is formed and s * A1455[col, :] is added into A1459[row, :]. OpenMP iterations each own
+// a block of 16 consecutive rows, so no two iterations write the same row. Returns 0.
+int compute(taco_tensor_t *A1459, taco_tensor_t *B, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) {
+  double* restrict out_vals = (double*)(A1459->vals);
+  int* restrict row_ptr = (int*)(B->indices[1][0]);
+  int* restrict col_idx = (int*)(B->indices[1][1]);
+  double* restrict sp_vals = (double*)(B->vals);
+  double* restrict lhs_vals = (double*)(A1392->vals);
+  double* restrict rhs_vals = (double*)(A1451->vals);
+  double* restrict dense_vals = (double*)(A1455->vals);
+  int out_rows = (int)(A1459->dimensions[0]);
+  int out_cols = (int)(A1459->dimensions[1]);
+  int n_rows = (int)(A1392->dimensions[0]);
+  int lhs_stride = (int)(A1392->dimensions[1]);
+  int inner_len = (int)(A1451->dimensions[1]);
+  int dense_cols = (int)(A1455->dimensions[1]);
+
+  // the main loop accumulates, so the output starts from all zeros
+  #pragma omp parallel for schedule(static)
+  for (int32_t p = 0; p < (out_rows * out_cols); p++) {
+    out_vals[p] = 0.0;
+  }
+
+  // rows are handed out in blocks of 16
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t blk = 0; blk < ((n_rows + 15) / 16); blk++) {
+    for (int32_t lane = 0; lane < 16; lane++) {
+      int32_t row = blk * 16 + lane;
+      if (row >= n_rows) continue; // the last block may be ragged
+
+      for (int32_t pB = row_ptr[row]; pB < row_ptr[row + 1]; pB++) {
+        int32_t col = col_idx[pB];
+        // SDDMM part: scalar for this stored entry
+        double scale = 0.0;
+        for (int32_t k = 0; k < inner_len; k++) {
+          scale += (sp_vals[pB] * lhs_vals[row * lhs_stride + k]) * rhs_vals[col * inner_len + k];
+        }
+        // SpMM part: scatter the scaled A1455 row into the output row
+        for (int32_t j = 0; j < dense_cols; j++) {
+          int32_t dst = row * out_cols + j;
+          out_vals[dst] = out_vals[dst] + scale * dense_vals[col * dense_cols + j];
+        }
+      }
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.h"
+int _shim_assemble(void** parameterPack) { // type-erased entry point: casts slots 0-4 to taco_tensor_t* and forwards them, in order, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); // assemble()'s status is returned unchanged
+}
+int _shim_compute(void** parameterPack) { // type-erased entry point: casts slots 0-4 to taco_tensor_t* and forwards them, in order, to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); // compute()'s status is returned unchanged
+}
diff --git a/test/kernels/sddmm_spmm/fused_kernel.h b/test/kernels/sddmm_spmm/fused_kernel.h
new file mode 100644
index 000000000..e67e5a761
--- /dev/null
+++ b/test/kernels/sddmm_spmm/fused_kernel.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the generated assemble/compute functions
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions (one entry per mode)
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering (one entry per mode)
+  taco_mode_t* mode_types;    // mode storage types (one entry per mode)
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values, held as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num(void) { return 0; }  // serial fallback: the single thread has id 0
+int omp_get_max_threads(void) { return 1; } // serial fallback: exactly one thread is available
+#endif
+int cmp(const void *a, const void *b) { // qsort-compatible comparator; both arguments must point at int values
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); // yields -1/0/1; a plain subtraction overflows for e.g. INT_MIN vs 1
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayStart] >= target) {
+    return arrayStart; // first element already reaches target
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound; // no exact hit: index just above the last element known to be < target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayEnd] <= target) {
+    return arrayEnd; // last element does not exceed target
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound; // no exact hit: index just below the first element known to be > target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor, copying the three order-long input arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zero-filled so vals / vals_size are not left indeterminate
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t**)); // zeroed: a mode type the switch does not handle keeps a null slot that free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially null
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially null
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its per-mode slot arrays; buffers stored in the slots and vals are not freed here
+  if (t == NULL)
+    return; // accept a null descriptor, as free() does
+  for (int i = 0; i < t->order; i++) free(t->indices[i]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455);
+#endif
diff --git a/test/kernels/sddmm_spmm/fused_kernel.so b/test/kernels/sddmm_spmm/fused_kernel.so
new file mode 100755
index 000000000..10619e0ca
Binary files /dev/null and b/test/kernels/sddmm_spmm/fused_kernel.so differ
diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.c b/test/kernels/sddmm_spmm/sddmm_ryan.c
new file mode 100644
index 000000000..760fb5361
--- /dev/null
+++ b/test/kernels/sddmm_spmm/sddmm_ryan.c
@@ -0,0 +1,210 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the generated assemble/compute functions
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions (one entry per mode)
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering (one entry per mode)
+  taco_mode_t* mode_types;    // mode storage types (one entry per mode)
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values, held as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num(void) { return 0; }  // serial fallback: the single thread has id 0
+int omp_get_max_threads(void) { return 1; } // serial fallback: exactly one thread is available
+#endif
+int cmp(const void *a, const void *b) { // qsort-compatible comparator; both arguments must point at int values
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); // yields -1/0/1; a plain subtraction overflows for e.g. INT_MIN vs 1
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayStart] >= target) {
+    return arrayStart; // first element already reaches target
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound; // no exact hit: index just above the last element known to be < target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayEnd] <= target) {
+    return arrayEnd; // last element does not exceed target
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound; // no exact hit: index just below the first element known to be > target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor, copying the three order-long input arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zero-filled so vals / vals_size are not left indeterminate
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t**)); // zeroed: a mode type the switch does not handle keeps a null slot that free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially null
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially null
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its per-mode slot arrays; buffers stored in the slots and vals are not freed here
+  if (t == NULL)
+    return; // accept a null descriptor, as free() does
+  for (int i = 0; i < t->order; i++) free(t->indices[i]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { // copies cage3's pos/crd structure into freshly allocated arrays for A2531 and allocates one value per copied coordinate; A1451 is not read; always returns 0
+  int* restrict A25312_pos = (int*)(A2531->indices[1][0]);
+  int* restrict A25312_crd = (int*)(A2531->indices[1][1]);
+  double* restrict A2531_vals = (double*)(A2531->vals);
+  int* restrict cage32_pos = (int*)(cage3->indices[1][0]);
+  int* restrict cage32_crd = (int*)(cage3->indices[1][1]);
+  int A13921_dimension = (int)(A1392->dimensions[0]);
+  int32_t A25312_pos_size = A13921_dimension + 1; // one entry per row plus a final end offset; the former literal 6 was overrun by the row loop for more than 5 rows
+  A25312_pos = (int32_t*)malloc(sizeof(int32_t) * A25312_pos_size);
+  A25312_pos[0] = 0;
+  for (int32_t pA25312 = 1; pA25312 < A25312_pos_size; pA25312++) {
+    A25312_pos[pA25312] = 0;
+  }
+  int32_t A25312_crd_size = 1048576;
+  A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size);
+  int32_t i1468A2531 = 0;
+
+  for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) {
+    int32_t pA25312_begin = i1468A2531;
+
+    for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) {
+      int32_t i1468 = cage32_crd[i1468cage3];
+      if (A25312_crd_size <= i1468A2531) { // coordinate buffer full: double its capacity
+        A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2));
+        A25312_crd_size *= 2;
+      }
+      A25312_crd[i1468A2531] = i1468;
+      i1468A2531++;
+    }
+
+    A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; // per-row count for now; summed into offsets below
+  }
+
+  int32_t csA25312 = 0;
+  for (int32_t pA253120 = 1; pA253120 < A25312_pos_size; pA253120++) { // running sum turns the per-row counts into end offsets
+    csA25312 += A25312_pos[pA253120];
+    A25312_pos[pA253120] = csA25312;
+  }
+
+  A2531_vals = (double*)malloc(sizeof(double) * i1468A2531);
+
+  A2531->indices[1][0] = (uint8_t*)(A25312_pos);
+  A2531->indices[1][1] = (uint8_t*)(A25312_crd);
+  A2531->vals = (uint8_t*)A2531_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  // Sampled dense-dense product. For every stored entry jB of B, with row i and column j = B2_crd[jB]:
+  //   A_vals[jB] = sum over k in [0, D2_dimension) of (B_vals[jB] * C_vals[i * C2_dimension + k]) * D_vals[j * D2_dimension + k]
+  // Results are written at B's own position jB, so A's value array must have at least as many entries as B stores.
+  // Only A->vals, B's level-1 pos/crd arrays and values, C's two dimensions and values, and D's second
+  // dimension and values are read; the previously declared but unused locals (jA and three dimensions) are gone.
+  // Always returns 0.
+  double* restrict A_vals = (double*)(A->vals);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i0 = 0; i0 < ((C1_dimension + 15) / 16); i0++) { // rows are handed out in blocks of 16
+    for (int32_t i1 = 0; i1 < 16; i1++) {
+      int32_t i = i0 * 16 + i1;
+      if (i >= C1_dimension)
+        continue; // the last block may extend past the final row
+
+      for (int32_t jB = B2_pos[i]; jB < B2_pos[(i + 1)]; jB++) {
+        int32_t j = B2_crd[jB];
+        double tkA_val = 0.0;
+        for (int32_t k = 0; k < D2_dimension; k++) {
+          int32_t kC = i * C2_dimension + k;
+          int32_t kD = j * D2_dimension + k;
+          tkA_val += (B_vals[jB] * C_vals[kC]) * D_vals[kD];
+        }
+        A_vals[jB] = tkA_val;
+        // assumes the B2_pos ranges of different rows do not overlap, so no two iterations write the same slot
+      }
+    }
+  }
+  return 0;
+
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.h"
+int _shim_assemble(void** parameterPack) { // forwards the four packed pointers, in order, to assemble as tensor descriptors
+  return assemble((taco_tensor_t*)parameterPack[0], (taco_tensor_t*)parameterPack[1], (taco_tensor_t*)parameterPack[2], (taco_tensor_t*)parameterPack[3]);
+}
+int _shim_compute(void** parameterPack) { // forwards the four packed pointers, in order, to compute as tensor descriptors
+  return compute((taco_tensor_t*)parameterPack[0], (taco_tensor_t*)parameterPack[1], (taco_tensor_t*)parameterPack[2], (taco_tensor_t*)parameterPack[3]);
+}
diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.h b/test/kernels/sddmm_spmm/sddmm_ryan.h
new file mode 100644
index 000000000..f0f9e372a
--- /dev/null
+++ b/test/kernels/sddmm_spmm/sddmm_ryan.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the generated assemble/compute functions
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions (one entry per mode)
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering (one entry per mode)
+  taco_mode_t* mode_types;    // mode storage types (one entry per mode)
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values, held as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num(void) { return 0; }  // serial fallback: the single thread has id 0
+int omp_get_max_threads(void) { return 1; } // serial fallback: exactly one thread is available
+#endif
+int cmp(const void *a, const void *b) { // qsort-compatible comparator; both arguments must point at int values
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); // yields -1/0/1; a plain subtraction overflows for e.g. INT_MIN vs 1
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayStart] >= target) {
+    return arrayStart; // first element already reaches target
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound; // no exact hit: index just above the last element known to be < target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayEnd] <= target) {
+    return arrayEnd; // last element does not exceed target
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound; // no exact hit: index just below the first element known to be > target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor, copying the three order-long input arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zero-filled so vals / vals_size are not left indeterminate
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t**)); // zeroed: a mode type the switch does not handle keeps a null slot that free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially null
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially null
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its per-mode slot arrays; buffers stored in the slots and vals are not freed here
+  if (t == NULL)
+    return; // accept a null descriptor, as free() does
+  for (int i = 0; i < t->order; i++) free(t->indices[i]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.so b/test/kernels/sddmm_spmm/sddmm_ryan.so
new file mode 100755
index 000000000..c3deae084
Binary files /dev/null and b/test/kernels/sddmm_spmm/sddmm_ryan.so differ
diff --git a/test/kernels/sddmm_spmm/taco_original.c b/test/kernels/sddmm_spmm/taco_original.c
new file mode 100644
index 000000000..4f084ff5e
--- /dev/null
+++ b/test/kernels/sddmm_spmm/taco_original.c
@@ -0,0 +1,166 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the generated assemble/compute functions
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions (one entry per mode)
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering (one entry per mode)
+  taco_mode_t* mode_types;    // mode storage types (one entry per mode)
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values, held as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num(void) { return 0; }  // serial fallback: the single thread has id 0
+int omp_get_max_threads(void) { return 1; } // serial fallback: exactly one thread is available
+#endif
+int cmp(const void *a, const void *b) { // qsort-compatible comparator; both arguments must point at int values
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); // yields -1/0/1; a plain subtraction overflows for e.g. INT_MIN vs 1
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayStart] >= target) {
+    return arrayStart; // first element already reaches target
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound; // no exact hit: index just above the last element known to be < target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayEnd] <= target) {
+    return arrayEnd; // last element does not exceed target
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound; // no exact hit: index just below the first element known to be > target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor, copying the three order-long input arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zero-filled so vals / vals_size are not left indeterminate
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t**)); // zeroed: a mode type the switch does not handle keeps a null slot that free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially null
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially null
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its per-mode slot arrays; buffers stored in the slots and vals are not freed here
+  if (t == NULL)
+    return; // accept a null descriptor, as free() does
+  for (int i = 0; i < t->order; i++) free(t->indices[i]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { // allocates A1463's dense value buffer; the other four tensors are not read; always returns 0
+  int A14631_dimension = (int)(A1463->dimensions[0]); // replaces the literal row count 5, so the buffer follows the tensor's own shape
+  int A14632_dimension = (int)(A1463->dimensions[1]);
+  double* restrict A1463_vals = (double*)(A1463->vals);
+  A1463_vals = (double*)malloc(sizeof(double) * (size_t)A14631_dimension * (size_t)A14632_dimension); // size_t arithmetic: rows * columns is never multiplied in int
+
+  A1463->vals = (uint8_t*)A1463_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { // writes every dense entry of A1463 as a sum over cage3's stored entries in that row; always returns 0
+  double* restrict outVals = (double*)(A1463->vals);
+  double* restrict sparseVals = (double*)(cage3->vals);
+  double* restrict cVals = (double*)(A1392->vals);
+  double* restrict dVals = (double*)(A1451->vals);
+  double* restrict eVals = (double*)(A1455->vals);
+  int* restrict rowPos = (int*)(cage3->indices[1][0]);
+  int* restrict colIdx = (int*)(cage3->indices[1][1]);
+  int outCols = (int)(A1463->dimensions[1]);
+  int rowCount = (int)(A1392->dimensions[0]);
+  int cCols = (int)(A1392->dimensions[1]);
+  int dCols = (int)(A1451->dimensions[1]);
+  int eCols = (int)(A1455->dimensions[1]);
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t row = 0; row < rowCount; row++) {
+    for (int32_t col = 0; col < eCols; col++) {
+      int32_t outAt = row * outCols + col;
+      double sum = 0.0;
+      for (int32_t p = rowPos[row]; p < rowPos[(row + 1)]; p++) { // stored entries of cage3 in this row
+        int32_t j = colIdx[p];
+        int32_t eAt = j * eCols + col;
+        for (int32_t k = 0; k < dCols; k++) {
+          int32_t cAt = row * cCols + k;
+          int32_t dAt = j * dCols + k;
+          sum += ((sparseVals[p] * cVals[cAt]) * dVals[dAt]) * eVals[eAt]; // multiplication order kept so rounding is unchanged
+        }
+      }
+      outVals[outAt] = sum;
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.h"
+int _shim_assemble(void** parameterPack) { // forwards the five packed pointers, in order, to assemble as tensor descriptors
+  return assemble((taco_tensor_t*)parameterPack[0], (taco_tensor_t*)parameterPack[1], (taco_tensor_t*)parameterPack[2], (taco_tensor_t*)parameterPack[3], (taco_tensor_t*)parameterPack[4]);
+}
+int _shim_compute(void** parameterPack) { // forwards the five packed pointers, in order, to compute as tensor descriptors
+  return compute((taco_tensor_t*)parameterPack[0], (taco_tensor_t*)parameterPack[1], (taco_tensor_t*)parameterPack[2], (taco_tensor_t*)parameterPack[3], (taco_tensor_t*)parameterPack[4]);
+}
diff --git a/test/kernels/sddmm_spmm/taco_original.h b/test/kernels/sddmm_spmm/taco_original.h
new file mode 100644
index 000000000..71ce53402
--- /dev/null
+++ b/test/kernels/sddmm_spmm/taco_original.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the generated assemble/compute functions
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions (one entry per mode)
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering (one entry per mode)
+  taco_mode_t* mode_types;    // mode storage types (one entry per mode)
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values, held as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num(void) { return 0; }  // serial fallback: the single thread has id 0
+int omp_get_max_threads(void) { return 1; } // serial fallback: exactly one thread is available
+#endif
+int cmp(const void *a, const void *b) { // qsort-compatible comparator; both arguments must point at int values
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); // yields -1/0/1; a plain subtraction overflows for e.g. INT_MIN vs 1
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayStart] >= target) {
+    return arrayStart; // first element already reaches target
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound; // no exact hit: index just above the last element known to be < target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayEnd] <= target) {
+    return arrayEnd; // last element does not exceed target
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound; // no exact hit: index just below the first element known to be > target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor, copying the three order-long input arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zero-filled so vals / vals_size are not left indeterminate
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t**)); // zeroed: a mode type the switch does not handle keeps a null slot that free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially null
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially null
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its per-mode slot arrays; buffers stored in the slots and vals are not freed here
+  if (t == NULL)
+    return; // accept a null descriptor, as free() does
+  for (int i = 0; i < t->order; i++) free(t->indices[i]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455);
+#endif
diff --git a/test/kernels/sddmm_spmm/taco_original.so b/test/kernels/sddmm_spmm/taco_original.so
new file mode 100755
index 000000000..f50931baa
Binary files /dev/null and b/test/kernels/sddmm_spmm/taco_original.so differ
diff --git a/test/kernels/spmm_gemm/gemm_default.c b/test/kernels/spmm_gemm/gemm_default.c
new file mode 100644
index 000000000..605cc491f
--- /dev/null
+++ b/test/kernels/spmm_gemm/gemm_default.c
@@ -0,0 +1,160 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the generated assemble/compute functions
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions (one entry per mode)
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering (one entry per mode)
+  taco_mode_t* mode_types;    // mode storage types (one entry per mode)
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values, held as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num(void) { return 0; }  // serial fallback: the single thread has id 0
+int omp_get_max_threads(void) { return 1; } // serial fallback: exactly one thread is available
+#endif
+int cmp(const void *a, const void *b) { // qsort-compatible comparator; both arguments must point at int values
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); // yields -1/0/1; a plain subtraction overflows for e.g. INT_MIN vs 1
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayStart] >= target) {
+    return arrayStart; // first element already reaches target
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound; // no exact hit: index just above the last element known to be < target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search over array[arrayStart..arrayEnd] (inclusive), which must be sorted ascending
+  if (array[arrayEnd] <= target) {
+    return arrayEnd; // last element does not exceed target
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound; // no exact hit: index just below the first element known to be > target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor, copying the three order-long input arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zero-filled so vals / vals_size are not left indeterminate
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t**)); // zeroed: a mode type the switch does not handle keeps a null slot that free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially null
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially null
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its per-mode slot arrays; buffers stored in the slots and vals are not freed here
+  if (t == NULL)
+    return; // accept a null descriptor, as free() does
+  for (int i = 0; i < t->order; i++) free(t->indices[i]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { // allocates A2039's dense value buffer (rows * columns doubles); A2035 and A1450 are not read; always returns 0
+  int A20391_dimension = (int)(A2039->dimensions[0]);
+  int A20392_dimension = (int)(A2039->dimensions[1]);
+  double* restrict A2039_vals = (double*)(A2039->vals);
+
+  A2039_vals = (double*)malloc(sizeof(double) * (size_t)A20391_dimension * (size_t)A20392_dimension); // widen before multiplying: the int product overflows for large matrices
+
+  A2039->vals = (uint8_t*)A2039_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { // dense matrix product A2039 = A2035 * A1450 over row-major value arrays; always returns 0
+  double* restrict out = (double*)(A2039->vals);
+  double* restrict lhs = (double*)(A2035->vals);
+  double* restrict rhs = (double*)(A1450->vals);
+  int outRows = (int)(A2039->dimensions[0]); // read but not used by the loops below
+  int outCols = (int)(A2039->dimensions[1]);
+  int lhsRows = (int)(A2035->dimensions[0]);
+  int lhsCols = (int)(A2035->dimensions[1]);
+  int rhsRows = (int)(A1450->dimensions[0]);
+  int rhsCols = (int)(A1450->dimensions[1]);
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t row = 0; row < lhsRows; row++) {
+    for (int32_t col = 0; col < rhsCols; col++) {
+      int32_t outAt = row * outCols + col;
+      double dot = 0.0;
+      for (int32_t k = 0; k < rhsRows; k++) {
+        int32_t lhsAt = row * lhsCols + k;
+        int32_t rhsAt = k * rhsCols + col;
+        dot += lhs[lhsAt] * rhs[rhsAt];
+      }
+      out[outAt] = dot;
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_default.h"
+int _shim_assemble(void** parameterPack) { // forwards the three packed pointers, in order, to assemble as tensor descriptors
+  return assemble((taco_tensor_t*)parameterPack[0], (taco_tensor_t*)parameterPack[1], (taco_tensor_t*)parameterPack[2]);
+}
+int _shim_compute(void** parameterPack) { // forwards the three packed pointers, in order, to compute as tensor descriptors
+  return compute((taco_tensor_t*)parameterPack[0], (taco_tensor_t*)parameterPack[1], (taco_tensor_t*)parameterPack[2]);
+}
diff --git a/test/kernels/spmm_gemm/gemm_default.h b/test/kernels/spmm_gemm/gemm_default.h
new file mode 100644
index 000000000..769514531
--- /dev/null
+++ b/test/kernels/spmm_gemm/gemm_default.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the assemble/compute kernels
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions, one entry per mode
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering, one entry per mode
+  taco_mode_t* mode_types;    // mode storage types, one entry per mode
+  uint8_t***   indices;       // tensor index data (per mode): 1 array slot for a dense mode, 2 for a sparse mode
+  uint8_t*     vals;          // tensor values as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP // compiler did not enable OpenMP: provide single-threaded stand-ins
+int omp_get_thread_num(void) { return 0; }  // the only thread has id 0
+int omp_get_max_threads(void) { return 1; } // at most one thread can run
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over int elements (qsort() callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1 without the signed overflow of a plain subtraction
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the first index whose value exceeds it (arrayEnd if none was found)
+  if (array[arrayStart] >= target) { // nothing in range precedes target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the last index whose value is below it (arrayStart if none was found)
+  if (array[arrayEnd] <= target) { // nothing in range follows target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor holding copies of the three `order`-long input arrays; release it with deinit_taco_tensor_t
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // NULL entries stay safe to free() for a mode the switch below does not handle
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index array slots, initially NULL
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor, its per-mode arrays and the index slot arrays; t->vals is not freed
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) {
+    free(t->indices[mode]); // the slot array only, not the buffers its slots point to
+  }
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450);
+#endif
diff --git a/test/kernels/spmm_gemm/gemm_default.so b/test/kernels/spmm_gemm/gemm_default.so
new file mode 100755
index 000000000..9de7a7933
Binary files /dev/null and b/test/kernels/spmm_gemm/gemm_default.so differ
diff --git a/test/kernels/spmm_gemm/gemm_template.c b/test/kernels/spmm_gemm/gemm_template.c
new file mode 100644
index 000000000..4a4e5faeb
--- /dev/null
+++ b/test/kernels/spmm_gemm/gemm_template.c
@@ -0,0 +1,183 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the assemble/compute kernels
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions, one entry per mode
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering, one entry per mode
+  taco_mode_t* mode_types;    // mode storage types, one entry per mode
+  uint8_t***   indices;       // tensor index data (per mode): 1 array slot for a dense mode, 2 for a sparse mode
+  uint8_t*     vals;          // tensor values as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP // compiler did not enable OpenMP: provide single-threaded stand-ins
+int omp_get_thread_num(void) { return 0; }  // the only thread has id 0
+int omp_get_max_threads(void) { return 1; } // at most one thread can run
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over int elements (qsort() callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1 without the signed overflow of a plain subtraction
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the first index whose value exceeds it (arrayEnd if none was found)
+  if (array[arrayStart] >= target) { // nothing in range precedes target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the last index whose value is below it (arrayStart if none was found)
+  if (array[arrayEnd] <= target) { // nothing in range follows target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor holding copies of the three `order`-long input arrays; release it with deinit_taco_tensor_t
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // NULL entries stay safe to free() for a mode the switch below does not handle
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index array slots, initially NULL
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor, its per-mode arrays and the index slot arrays; t->vals is not freed
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) {
+    free(t->indices[mode]); // the slot array only, not the buffers its slots point to
+  }
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) {
+  // Allocates the value buffer of A2039 (dimensions[0] x dimensions[1] doubles).
+  // A2035 and A1450 are not used. Returns 0, or -1 when the allocation fails.
+  size_t rows = (size_t)(A2039->dimensions[0]);
+  size_t cols = (size_t)(A2039->dimensions[1]);
+  // The element count is formed in size_t; the int product used before could overflow.
+  double* restrict A2039_vals = (double*)malloc(sizeof(double) * rows * cols);
+  A2039->vals = (uint8_t*)A2039_vals; // replaces the old pointer without freeing it
+  return (A2039_vals == NULL && rows * cols != 0) ? -1 : 0;
+}
+
+int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) { // fills A from B and C using loop tiles of 16 in i, j and k; always returns 0
+  int A1_dimension = (int)(A->dimensions[0]);
+  int A2_dimension = (int)(A->dimensions[1]);
+  double* restrict A_vals = (double*)(A->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int B2_dimension = (int)(B->dimensions[1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  // Clear the whole output first: the tiled loops below only ever add into it.
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA = 0; pA < (A1_dimension * A2_dimension); pA++) {
+    A_vals[pA] = 0.0;
+  }
+  // i0/j0/k0 number the tiles (counts rounded up); i1/j1/k1 walk inside a tile and the `continue`s skip the ragged edge.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i0 = 0; i0 < ((B1_dimension + 15) / 16); i0++) {
+    for (int32_t j0 = 0; j0 < ((C1_dimension + 15) / 16); j0++) {
+      for (int32_t k0 = 0; k0 < ((C2_dimension + 15) / 16); k0++) {
+        for (int32_t i1 = 0; i1 < 16; i1++) {
+          int32_t i = i0 * 16 + i1;
+          if (i >= B1_dimension)
+            continue;
+          // NOTE(review): the result is stored at A(i, j), i.e. A(i, j) = B(i, j) * sum_k C(j, k); a matrix product would store at (i, k) instead -- confirm the intended expression.
+          for (int32_t j1 = 0; j1 < 16; j1++) {
+            int32_t j = j0 * 16 + j1;
+            int32_t jB = i * B2_dimension + j;
+            int32_t jA = i * A2_dimension + j;
+            if (j >= C1_dimension)
+              continue;
+            // Partial sum for this k-tile; it is added to A(i, j) once the k1 loop finishes.
+            double tk1A_val = 0.0;
+            for (int32_t k1 = 0; k1 < 16; k1++) {
+              int32_t k = k0 * 16 + k1;
+              int32_t kC = j * C2_dimension + k;
+              if (k >= C2_dimension)
+                continue;
+              // kC walks row j of C; the B factor B_vals[jB] is the same for every k.
+              tk1A_val += B_vals[jB] * C_vals[kC];
+            }
+            A_vals[jA] = A_vals[jA] + tk1A_val;
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_template.h"
+int _shim_assemble(void** parameterPack) { // untyped entry point: parameterPack[0..2] are the taco_tensor_t* arguments of assemble(), in order
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
+int _shim_compute(void** parameterPack) { // untyped entry point: parameterPack[0..2] are the taco_tensor_t* arguments of compute(), in order
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
diff --git a/test/kernels/spmm_gemm/gemm_template.h b/test/kernels/spmm_gemm/gemm_template.h
new file mode 100644
index 000000000..769514531
--- /dev/null
+++ b/test/kernels/spmm_gemm/gemm_template.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the assemble/compute kernels
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions, one entry per mode
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering, one entry per mode
+  taco_mode_t* mode_types;    // mode storage types, one entry per mode
+  uint8_t***   indices;       // tensor index data (per mode): 1 array slot for a dense mode, 2 for a sparse mode
+  uint8_t*     vals;          // tensor values as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP // compiler did not enable OpenMP: provide single-threaded stand-ins
+int omp_get_thread_num(void) { return 0; }  // the only thread has id 0
+int omp_get_max_threads(void) { return 1; } // at most one thread can run
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over int elements (qsort() callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1 without the signed overflow of a plain subtraction
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the first index whose value exceeds it (arrayEnd if none was found)
+  if (array[arrayStart] >= target) { // nothing in range precedes target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the last index whose value is below it (arrayStart if none was found)
+  if (array[arrayEnd] <= target) { // nothing in range follows target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor holding copies of the three `order`-long input arrays; release it with deinit_taco_tensor_t
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // NULL entries stay safe to free() for a mode the switch below does not handle
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index array slots, initially NULL
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor, its per-mode arrays and the index slot arrays; t->vals is not freed
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) {
+    free(t->indices[mode]); // the slot array only, not the buffers its slots point to
+  }
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450);
+#endif
diff --git a/test/kernels/spmm_gemm/gemm_template.so b/test/kernels/spmm_gemm/gemm_template.so
new file mode 100755
index 000000000..2cfcd7ad3
Binary files /dev/null and b/test/kernels/spmm_gemm/gemm_template.so differ
diff --git a/test/kernels/spmv_spmv/spmv_fused.c b/test/kernels/spmv_spmv/spmv_fused.c
new file mode 100644
index 000000000..0964fb8e1
--- /dev/null
+++ b/test/kernels/spmv_spmv/spmv_fused.c
@@ -0,0 +1,178 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the assemble/compute kernels
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions, one entry per mode
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering, one entry per mode
+  taco_mode_t* mode_types;    // mode storage types, one entry per mode
+  uint8_t***   indices;       // tensor index data (per mode): 1 array slot for a dense mode, 2 for a sparse mode
+  uint8_t*     vals;          // tensor values as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP // compiler did not enable OpenMP: provide single-threaded stand-ins
+int omp_get_thread_num(void) { return 0; }  // the only thread has id 0
+int omp_get_max_threads(void) { return 1; } // at most one thread can run
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over int elements (qsort() callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1 without the signed overflow of a plain subtraction
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the first index whose value exceeds it (arrayEnd if none was found)
+  if (array[arrayStart] >= target) { // nothing in range precedes target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the last index whose value is below it (arrayStart if none was found)
+  if (array[arrayEnd] <= target) { // nothing in range follows target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor holding copies of the three `order`-long input arrays; release it with deinit_taco_tensor_t
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // NULL entries stay safe to free() for a mode the switch below does not handle
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index array slots, initially NULL
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor, its per-mode arrays and the index slot arrays; t->vals is not freed
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) {
+    free(t->indices[mode]); // the slot array only, not the buffers its slots point to
+  }
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) {
+  // Allocates A's value buffer (C, v and B are unused); returns 0, or -1 if malloc fails.
+  // The length is A->dimensions[0] rather than a fixed 5, so it follows the tensor passed in.
+  size_t count = (size_t)(A->dimensions[0]);
+  double* restrict A_vals = (double*)malloc(sizeof(double) * count);
+  A->vals = (uint8_t*)A_vals; // replaces the old pointer without freeing it
+  return (A_vals == NULL && count != 0) ? -1 : 0;
+}
+
+int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) {
+  // Evaluates A = B * (C * v) in two passes, going through a temporary vector tA = C * v.
+  // B and C are traversed row by row via their mode-1 pos/crd arrays; A, v and all
+  // values are doubles. A->vals must already hold B->dimensions[0] doubles.
+  // Returns 0 on success, or -1 (A untouched) when the temporary cannot be allocated.
+  double* restrict A_vals = (double*)(A->vals);
+  int* restrict C2_pos = (int*)(C->indices[1][0]);
+  int* restrict C2_crd = (int*)(C->indices[1][1]);
+  double* restrict C_vals = (double*)(C->vals);
+  double* restrict v_vals = (double*)(v->vals);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  // NOTE(review): C's row count is read from B->dimensions[0]; that only matches when
+  // B is square. C->dimensions[0] looks like the intended source -- confirm with the caller.
+  int C1_dimension = (int)(B->dimensions[0]);
+
+  double* restrict tA = (double*)malloc(sizeof(double) * C1_dimension);
+  if (tA == NULL && C1_dimension > 0) {
+    return -1;
+  }
+  // Pass 1: tA[r] = sum of C(r, c) * v[c] over the stored entries of row r of C.
+  // Every tA[r] is assigned here, so no separate zero-fill is needed.
+  for (int32_t i1439 = 0; i1439 < C1_dimension; i1439++) {
+    double ti1440tA_val = 0.0;
+    for (int32_t i1440C = C2_pos[i1439]; i1440C < C2_pos[(i1439 + 1)]; i1440C++) {
+      int32_t i1440 = C2_crd[i1440C];
+      ti1440tA_val += C_vals[i1440C] * v_vals[i1440];
+    }
+    tA[i1439] = ti1440tA_val;
+  }
+  // Pass 2: A[r] = sum of B(r, c) * tA[c] over the stored entries of row r of B.
+  for (int32_t i1438 = 0; i1438 < B1_dimension; i1438++) {
+    double ti1439A_val = 0.0;
+    for (int32_t i1439B = B2_pos[i1438]; i1439B < B2_pos[(i1438 + 1)]; i1439B++) {
+      int32_t i1439 = B2_crd[i1439B];
+      ti1439A_val += B_vals[i1439B] * tA[i1439];
+    }
+    A_vals[i1438] = ti1439A_val;
+  }
+  free(tA);
+
+  A->vals = (uint8_t*)A_vals;
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.h"
+int _shim_assemble(void** parameterPack) { // untyped entry point: parameterPack[0..3] are the taco_tensor_t* arguments of assemble(), in order
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // untyped entry point: parameterPack[0..3] are the taco_tensor_t* arguments of compute(), in order
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/spmv_spmv/spmv_fused.h b/test/kernels/spmv_spmv/spmv_fused.h
new file mode 100644
index 000000000..bc78275ac
--- /dev/null
+++ b/test/kernels/spmv_spmv/spmv_fused.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the assemble/compute kernels
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions, one entry per mode
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering, one entry per mode
+  taco_mode_t* mode_types;    // mode storage types, one entry per mode
+  uint8_t***   indices;       // tensor index data (per mode): 1 array slot for a dense mode, 2 for a sparse mode
+  uint8_t*     vals;          // tensor values as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP // compiler did not enable OpenMP: provide single-threaded stand-ins
+int omp_get_thread_num(void) { return 0; }  // the only thread has id 0
+int omp_get_max_threads(void) { return 1; } // at most one thread can run
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over int elements (qsort() callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1 without the signed overflow of a plain subtraction
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the first index whose value exceeds it (arrayEnd if none was found)
+  if (array[arrayStart] >= target) { // nothing in range precedes target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the last index whose value is below it (arrayStart if none was found)
+  if (array[arrayEnd] <= target) { // nothing in range follows target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor holding copies of the three `order`-long input arrays; release it with deinit_taco_tensor_t
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // NULL entries stay safe to free() for a mode the switch below does not handle
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index array slots, initially NULL
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor, its per-mode arrays and the index slot arrays; t->vals is not freed
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) {
+    free(t->indices[mode]); // the slot array only, not the buffers its slots point to
+  }
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B);
+#endif
diff --git a/test/kernels/spmv_spmv/spmv_fused.so b/test/kernels/spmv_spmv/spmv_fused.so
new file mode 100755
index 000000000..5efd6a4d8
Binary files /dev/null and b/test/kernels/spmv_spmv/spmv_fused.so differ
diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.c b/test/kernels/spmv_spmv/spmv_spmv_default.c
new file mode 100644
index 000000000..dfaa1c4b0
--- /dev/null
+++ b/test/kernels/spmv_spmv/spmv_spmv_default.c
@@ -0,0 +1,157 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // tensor descriptor handed to the assemble/compute kernels
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions, one entry per mode
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering, one entry per mode
+  taco_mode_t* mode_types;    // mode storage types, one entry per mode
+  uint8_t***   indices;       // tensor index data (per mode): 1 array slot for a dense mode, 2 for a sparse mode
+  uint8_t*     vals;          // tensor values as raw bytes
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP // compiler did not enable OpenMP: provide single-threaded stand-ins
+int omp_get_thread_num(void) { return 0; }  // the only thread has id 0
+int omp_get_max_threads(void) { return 1; } // at most one thread can run
+#endif
+int cmp(const void *a, const void *b) { // three-way comparator over int elements (qsort() callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1 without the signed overflow of a plain subtraction
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the first index whose value exceeds it (arrayEnd if none was found)
+  if (array[arrayStart] >= target) { // nothing in range precedes target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search of ascending array[arrayStart..arrayEnd] (inclusive): returns an index holding target, else the last index whose value is below it (arrayStart if none was found)
+  if (array[arrayEnd] <= target) { // nothing in range follows target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint, but cannot overflow like (upper + lower) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a heap descriptor holding copies of the three `order`-long input arrays; release it with deinit_taco_tensor_t
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // NULL entries stay safe to free() for a mode the switch below does not handle
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index array slots, initially NULL
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor, its per-mode arrays and the index slot arrays; t->vals is not freed
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) {
+    free(t->indices[mode]); // the slot array only, not the buffers its slots point to
+  }
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) {
+  // Allocates ref's value buffer (B, C and v are unused); returns 0, or -1 if malloc fails.
+  // The length is ref->dimensions[0] rather than a fixed 5, so it follows the tensor passed in.
+  size_t count = (size_t)(ref->dimensions[0]);
+  double* restrict ref_vals = (double*)malloc(sizeof(double) * count);
+  ref->vals = (uint8_t*)ref_vals; // replaces the old pointer without freeing it
+  return (ref_vals == NULL && count != 0) ? -1 : 0;
+}
+
+int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) { // ref = (B * C) * v evaluated in one fused loop nest; always returns 0
+  double* restrict out = (double*)(ref->vals);
+  double* restrict vec = (double*)(v->vals);
+  double* restrict b_val = (double*)(B->vals);
+  double* restrict c_val = (double*)(C->vals);
+  int* restrict b_pos = (int*)(B->indices[1][0]);
+  int* restrict b_crd = (int*)(B->indices[1][1]);
+  int* restrict c_pos = (int*)(C->indices[1][0]);
+  int* restrict c_crd = (int*)(C->indices[1][1]);
+  int b_rows = (int)(B->dimensions[0]);
+  // Each output entry is written by exactly one iteration of the outer loop, which OpenMP splits across threads.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t row = 0; row < b_rows; row++) {
+    double acc = 0.0;
+    for (int32_t bp = b_pos[row]; bp < b_pos[row + 1]; bp++) {
+      int32_t mid = b_crd[bp]; // coordinate stored in B, used to pick the row of C
+      for (int32_t cp = c_pos[mid]; cp < c_pos[mid + 1]; cp++) {
+        int32_t col = c_crd[cp];
+        acc += (b_val[bp] * c_val[cp]) * vec[col];
+      }
+    }
+    out[row] = acc;
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_spmv_default.h"
+int _shim_assemble(void** parameterPack) { // untyped entry point: parameterPack[0..3] are the taco_tensor_t* arguments of assemble(), in order
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // untyped entry point: parameterPack[0..3] are the taco_tensor_t* arguments of compute(), in order
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.h b/test/kernels/spmv_spmv/spmv_spmv_default.h
new file mode 100644
index 000000000..b53193484
--- /dev/null
+++ b/test/kernels/spmv_spmv/spmv_spmv_default.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of the two ints pointed to: <0, 0 or >0
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // plain a - b can overflow int
+}
+// Binary search of array[arrayStart..arrayEnd] (inclusive, assumed ascending). Returns arrayStart if its value is
+// >= target, a probed index equal to target, else the lowest probed index above target (arrayEnd if none).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // lowerBound + upperBound could overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+// Binary search of array[arrayStart..arrayEnd] (inclusive, assumed ascending). Returns arrayEnd if its value is
+// <= target, a probed index equal to target, else the highest probed index below target (arrayStart if none).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // lowerBound + upperBound could overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor that copies the caller's per-mode metadata.
+  // calloc is used so vals, vals_size and every index slot start as NULL/0 rather than indeterminate.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // dense: one index slot; sparse: two; any other mode type keeps the NULL slot (safe to free)
+    if (t->mode_types[i] == taco_mode_dense) {
+      t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));
+    } else if (t->mode_types[i] == taco_mode_sparse) {
+      t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Release each per-mode slot array first, then the arrays owned by the descriptor,
+  // then the descriptor itself. t->vals and the buffers the slots point to are not freed here.
+  for (int32_t mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v);
+#endif
diff --git a/test/kernels/ttm_ttm/fused copy.c b/test/kernels/ttm_ttm/fused copy.c
new file mode 100644
index 000000000..5d40c8aa9
--- /dev/null
+++ b/test/kernels/ttm_ttm/fused copy.c	
@@ -0,0 +1,248 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of the two ints pointed to: <0, 0 or >0
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // plain a - b can overflow int
+}
+// Binary search of array[arrayStart..arrayEnd] (inclusive, assumed ascending). Returns arrayStart if its value is
+// >= target, a probed index equal to target, else the lowest probed index above target (arrayEnd if none).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // lowerBound + upperBound could overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+// Binary search of array[arrayStart..arrayEnd] (inclusive, assumed ascending). Returns arrayEnd if its value is
+// <= target, a probed index equal to target, else the highest probed index below target (arrayStart if none).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // lowerBound + upperBound could overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor that copies the caller's per-mode metadata.
+  // calloc is used so vals, vals_size and every index slot start as NULL/0 rather than indeterminate.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // dense: one index slot; sparse: two; any other mode type keeps the NULL slot (safe to free)
+    if (t->mode_types[i] == taco_mode_dense) {
+      t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));
+    } else if (t->mode_types[i] == taco_mode_sparse) {
+      t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Release each per-mode slot array first, then the arrays owned by the descriptor,
+  // then the descriptor itself. t->vals and the buffers the slots point to are not freed here.
+  for (int32_t mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  int A15321_dimension = (int)(A1532->dimensions[0]);
+  int A15323_dimension = (int)(A1532->dimensions[2]);
+  int* restrict A15322_pos = (int*)(A1532->indices[1][0]);
+  int* restrict A15322_crd = (int*)(A1532->indices[1][1]);
+  double* restrict A1532_vals = (double*)(A1532->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  // Builds A1532's level-2 pos/crd arrays from B's level-2 pos/crd and allocates A1532's value buffer; C and D are not read.
+  A15322_pos = (int32_t*)malloc(sizeof(int32_t) * (A15321_dimension + 1));
+  A15322_pos[0] = 0;
+  for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) {
+    A15322_pos[pA15322] = 0;
+  }
+  int32_t A15322_crd_size = 1048576;
+  A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size);
+  int32_t i1543A1532 = 0;
+  // Visit B's rows in tiles of 16; the guard below skips the padding rows of the last tile.
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+      // NOTE(review): A15322_pos is sized from A1532->dimensions[0] but indexed by B's row below - presumably both match.
+      int32_t pA15322_begin = i1543A1532;
+      // Append this row's coordinates, doubling the crd buffer whenever it is full.
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15322_crd_size <= i1543A1532) {
+          A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2));
+          A15322_crd_size *= 2;
+        }
+        A15322_crd[i1543A1532] = i1543;
+        i1543A1532++;
+      }
+      // Store the row's entry count for now; the running sum below turns counts into offsets.
+      A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin;
+    }
+  }
+  // Running sum converts the per-row counts into pos offsets.
+  int32_t csA15322 = 0;
+  for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) {
+    csA15322 += A15322_pos[pA153220];
+    A15322_pos[pA153220] = csA15322;
+  }
+  // One double per stored coordinate and per index of A1532's third dimension; the buffer is left uninitialised here.
+  A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension));
+  // Publish the new buffers; whatever these slots pointed to before is overwritten, not freed.
+  A1532->indices[1][0] = (uint8_t*)(A15322_pos);
+  A1532->indices[1][1] = (uint8_t*)(A15322_crd);
+  A1532->vals = (uint8_t*)A1532_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  int A15321_dimension = (int)(A1532->dimensions[0]);
+  int A15323_dimension = (int)(A1532->dimensions[2]);
+  int* restrict A15322_pos = (int*)(A1532->indices[1][0]);
+  double* restrict A1532_vals = (double*)(A1532->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+  // For every stored level-2 position p of B: t[j] = sum over B's level-3 entries of B_vals * C_vals,
+  // then A1532_vals[p * A15323_dimension + k] += t[j] * D_vals[j * D2_dimension + k].
+  // Step 1: clear the output values.
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) {
+    A1532_vals[pA1532] = 0.0;
+  }
+  // One D1_dimension-sized scratch row per thread, allocated once for the whole call.
+  double* restrict tA1532_all = 0;
+  tA1532_all = (double*)malloc(sizeof(double) * D1_dimension * omp_get_max_threads());
+  // Step 2: rows of B are grouped into tiles of 16 and the tiles are distributed over the threads.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+      // Slice of the shared scratch buffer that belongs to the executing thread.
+      double* restrict tA1532 = 0;
+      tA1532 = &tA1532_all[D1_dimension*omp_get_thread_num()];
+      // NOTE(review): C is addressed with stride C2_dimension while the scratch index runs to D1_dimension - presumably equal.
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) {
+          tA1532[ptA1532] = 0.0;
+        }
+        for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+          int32_t i1544 = B3_crd[i1544B];
+          for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {
+            int32_t i1545C = i1544 * C2_dimension + i1545;
+            tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C];
+          }
+        }
+        for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {
+          for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) {
+            int32_t i1546A1532 = i1543B * A15323_dimension + i1546;
+            int32_t i1546D = i1545 * D2_dimension + i1546;
+            A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D];
+          }
+        }
+
+      }
+
+
+    }
+
+  }
+  free(tA1532_all);
+
+  A1532->indices[1][0] = (uint8_t*)(A15322_pos);
+  A1532->vals = (uint8_t*)A1532_vals;
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h"
+int _shim_assemble(void** parameterPack) { // forwards parameterPack[0..3], each cast to taco_tensor_t*, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // forwards parameterPack[0..3], each cast to taco_tensor_t*, to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/ttm_ttm/fused.c b/test/kernels/ttm_ttm/fused.c
new file mode 100644
index 000000000..f490913cb
--- /dev/null
+++ b/test/kernels/ttm_ttm/fused.c
@@ -0,0 +1,242 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of the two ints pointed to: <0, 0 or >0
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // plain a - b can overflow int
+}
+// Binary search of array[arrayStart..arrayEnd] (inclusive, assumed ascending). Returns arrayStart if its value is
+// >= target, a probed index equal to target, else the lowest probed index above target (arrayEnd if none).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // lowerBound + upperBound could overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+// Binary search of array[arrayStart..arrayEnd] (inclusive, assumed ascending). Returns arrayEnd if its value is
+// <= target, a probed index equal to target, else the highest probed index below target (arrayStart if none).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // lowerBound + upperBound could overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor that copies the caller's per-mode metadata.
+  // calloc is used so vals, vals_size and every index slot start as NULL/0 rather than indeterminate.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // dense: one index slot; sparse: two; any other mode type keeps the NULL slot (safe to free)
+    if (t->mode_types[i] == taco_mode_dense) {
+      t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));
+    } else if (t->mode_types[i] == taco_mode_sparse) {
+      t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Release each per-mode slot array first, then the arrays owned by the descriptor,
+  // then the descriptor itself. t->vals and the buffers the slots point to are not freed here.
+  for (int32_t mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  int A15321_dimension = (int)(A1532->dimensions[0]);
+  int A15323_dimension = (int)(A1532->dimensions[2]);
+  int* restrict A15322_pos = (int*)(A1532->indices[1][0]);
+  int* restrict A15322_crd = (int*)(A1532->indices[1][1]);
+  double* restrict A1532_vals = (double*)(A1532->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  // Builds A1532's level-2 pos/crd arrays from B's level-2 pos/crd and allocates A1532's value buffer; C and D are not read.
+  A15322_pos = (int32_t*)malloc(sizeof(int32_t) * (A15321_dimension + 1));
+  A15322_pos[0] = 0;
+  for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) {
+    A15322_pos[pA15322] = 0;
+  }
+  int32_t A15322_crd_size = 1048576;
+  A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size);
+  int32_t i1543A1532 = 0;
+  // Visit B's rows in tiles of 16; the guard below skips the padding rows of the last tile.
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+      // NOTE(review): A15322_pos is sized from A1532->dimensions[0] but indexed by B's row below - presumably both match.
+      int32_t pA15322_begin = i1543A1532;
+      // Append this row's coordinates, doubling the crd buffer whenever it is full.
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15322_crd_size <= i1543A1532) {
+          A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2));
+          A15322_crd_size *= 2;
+        }
+        A15322_crd[i1543A1532] = i1543;
+        i1543A1532++;
+      }
+      // Store the row's entry count for now; the running sum below turns counts into offsets.
+      A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin;
+    }
+  }
+  // Running sum converts the per-row counts into pos offsets.
+  int32_t csA15322 = 0;
+  for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) {
+    csA15322 += A15322_pos[pA153220];
+    A15322_pos[pA153220] = csA15322;
+  }
+  // One double per stored coordinate and per index of A1532's third dimension; the buffer is left uninitialised here.
+  A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension));
+  // Publish the new buffers; whatever these slots pointed to before is overwritten, not freed.
+  A1532->indices[1][0] = (uint8_t*)(A15322_pos);
+  A1532->indices[1][1] = (uint8_t*)(A15322_crd);
+  A1532->vals = (uint8_t*)A1532_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  int A15321_dimension = (int)(A1532->dimensions[0]);
+  int A15323_dimension = (int)(A1532->dimensions[2]);
+  int* restrict A15322_pos = (int*)(A1532->indices[1][0]);
+  double* restrict A1532_vals = (double*)(A1532->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+  // For every stored level-2 position p of B: t[j] = sum over B's level-3 entries of B_vals * C_vals,
+  // then A1532_vals[p * A15323_dimension + k] += t[j] * D_vals[j * D2_dimension + k].
+  // Step 1: clear the output values.
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) {
+    A1532_vals[pA1532] = 0.0;
+  }
+  // Step 2: rows of B are grouped into tiles of 16 and the tiles are distributed over the threads.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    // Scratch row shared by all rows of this tile: allocated once per tile instead of once per row.
+    double* restrict tA1532 = (double*)malloc(sizeof(double) * D1_dimension);
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        // (a) reset the scratch row for this stored position
+        for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) {
+          tA1532[ptA1532] = 0.0;
+        }
+        // (b) accumulate B's level-3 entries times the matching row of C into the scratch row
+        for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+          int32_t i1544 = B3_crd[i1544B];
+          for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {
+            int32_t i1545C = i1544 * C2_dimension + i1545;
+            tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C];
+          }
+        }
+        // (c) multiply the scratch row by D and add the result into A1532's values for this position
+        for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {
+          for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) {
+            int32_t i1546A1532 = i1543B * A15323_dimension + i1546;
+            int32_t i1546D = i1545 * D2_dimension + i1546;
+            A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D];
+          }
+        }
+      }
+    }
+    free(tA1532);
+  }
+
+  A1532->indices[1][0] = (uint8_t*)(A15322_pos);
+  A1532->vals = (uint8_t*)A1532_vals;
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h"
+int _shim_assemble(void** parameterPack) { // forwards parameterPack[0..3], each cast to taco_tensor_t*, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // forwards parameterPack[0..3], each cast to taco_tensor_t*, to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/ttm_ttm/fused.h b/test/kernels/ttm_ttm/fused.h
new file mode 100644
index 000000000..d613c8f07
--- /dev/null
+++ b/test/kernels/ttm_ttm/fused.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of the two ints pointed to: <0, 0 or >0
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // plain a - b can overflow int
+}
+// Binary search of array[arrayStart..arrayEnd] (inclusive, assumed ascending). Returns arrayStart if its value is
+// >= target, a probed index equal to target, else the lowest probed index above target (arrayEnd if none).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // lowerBound + upperBound could overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+// Binary search of array[arrayStart..arrayEnd] (inclusive, assumed ascending). Returns arrayEnd if its value is
+// <= target, a probed index equal to target, else the highest probed index below target (arrayStart if none).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // lowerBound + upperBound could overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor that copies the caller's per-mode metadata.
+  // calloc is used so vals, vals_size and every index slot start as NULL/0 rather than indeterminate.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // dense: one index slot; sparse: two; any other mode type keeps the NULL slot (safe to free)
+    if (t->mode_types[i] == taco_mode_dense) {
+      t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));
+    } else if (t->mode_types[i] == taco_mode_sparse) {
+      t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Release each per-mode slot array first, then the arrays owned by the descriptor,
+  // then the descriptor itself. t->vals and the buffers the slots point to are not freed here.
+  for (int32_t mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/ttm_ttm/fused.so b/test/kernels/ttm_ttm/fused.so
new file mode 100755
index 000000000..69c65a1dc
Binary files /dev/null and b/test/kernels/ttm_ttm/fused.so differ
diff --git a/test/kernels/ttm_ttm/gemm.c b/test/kernels/ttm_ttm/gemm.c
new file mode 100644
index 000000000..ee2b24e99
--- /dev/null
+++ b/test/kernels/ttm_ttm/gemm.c
@@ -0,0 +1,181 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of the two ints pointed to: <0, 0 or >0
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // plain a - b can overflow int
+}
+// Binary search of array[arrayStart..arrayEnd] (inclusive, assumed ascending). Returns arrayStart if its value is
+// >= target, a probed index equal to target, else the lowest probed index above target (arrayEnd if none).
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // lowerBound + upperBound could overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+// Binary search of array[arrayStart..arrayEnd] (inclusive, assumed ascending). Returns arrayEnd if its value is
+// <= target, a probed index equal to target, else the highest probed index below target (arrayStart if none).
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // lowerBound + upperBound could overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor that copies the caller's per-mode metadata.
+  // calloc is used so vals, vals_size and every index slot start as NULL/0 rather than indeterminate.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // dense: one index slot; sparse: two; any other mode type keeps the NULL slot (safe to free)
+    if (t->mode_types[i] == taco_mode_dense) {
+      t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));
+    } else if (t->mode_types[i] == taco_mode_sparse) {
+      t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Release each per-mode slot array first, then the arrays owned by the descriptor,
+  // then the descriptor itself. t->vals and the buffers the slots point to are not freed here.
+  for (int32_t mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) {
+  // Allocates an uninitialised value buffer of dimensions[0] * dimensions[1] doubles for A2886.
+  // The pointer previously stored in A2886->vals is replaced without being freed; C and D are unused.
+  const int rows = (int)(A2886->dimensions[0]);
+  const int cols = (int)(A2886->dimensions[1]);
+  const int cells = rows * cols;
+
+  A2886->vals = (uint8_t*)malloc(sizeof(double) * cells);
+  return 0;
+}
+
+int compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) {
+  // Dense product A2886(i,k) = sum over j of C(i,j) * D(j,k), blocked 32 x 32 x 32.
+  // Each value array is addressed as row * (second dimension) + column.
+  const int aRows = (int)(A2886->dimensions[0]);
+  const int aCols = (int)(A2886->dimensions[1]);
+  const int cRows = (int)(C->dimensions[0]);
+  const int cCols = (int)(C->dimensions[1]);
+  const int dRows = (int)(D->dimensions[0]);
+  const int dCols = (int)(D->dimensions[1]);
+  double* restrict out = (double*)(A2886->vals);
+  double* restrict lhs = (double*)(C->vals);
+  double* restrict rhs = (double*)(D->vals);
+
+  // Clear the whole output before accumulating into it.
+  #pragma omp parallel for schedule(static)
+  for (int32_t cell = 0; cell < (aRows * aCols); cell++) {
+    out[cell] = 0.0;
+  }
+
+  // The parallel loop runs over row tiles, so each output row is updated by one iteration only.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t iTile = 0; iTile < ((cRows + 31) / 32); iTile++) {
+    for (int32_t jTile = 0; jTile < ((dRows + 31) / 32); jTile++) {
+      for (int32_t kTile = 0; kTile < ((dCols + 31) / 32); kTile++) {
+        for (int32_t iIn = 0; iIn < 32; iIn++) {
+          const int32_t i = iTile * 32 + iIn;
+          if (i >= cRows) continue;  // padding of the last row tile
+
+          for (int32_t jIn = 0; jIn < 32; jIn++) {
+            const int32_t j = jTile * 32 + jIn;
+            if (j >= dRows) continue;
+            const int32_t cAt = i * cCols + j;
+
+            for (int32_t kIn = 0; kIn < 32; kIn++) {
+              const int32_t k = kTile * 32 + kIn;
+              if (k >= dCols) continue;
+              const int32_t aAt = i * aCols + k;
+              const int32_t dAt = j * dCols + k;
+              out[aAt] = out[aAt] + lhs[cAt] * rhs[dAt];
+            }
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/gemm.h"
+int _shim_assemble(void** parameterPack) { // forwards parameterPack[0..2], each cast to taco_tensor_t*, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
+int _shim_compute(void** parameterPack) { // forwards parameterPack[0..2], each cast to taco_tensor_t*, to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
diff --git a/test/kernels/ttm_ttm/gemm.h b/test/kernels/ttm_ttm/gemm.h
new file mode 100644
index 000000000..20cd2db53
--- /dev/null
+++ b/test/kernels/ttm_ttm/gemm.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage type of one tensor mode (element type of mode_types below)
+typedef struct { // in-memory tensor descriptor; index and value buffers are held as raw byte pointers
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial fallback compiled only when _OPENMP is unset: the single thread has id 0
+int omp_get_max_threads() { return 1; } // serial fallback: reports one available thread
+#endif
+int cmp(const void *a, const void *b) { // three-way int comparator (qsort-style callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1; avoids the signed overflow of a - b
+}
+// Binary search of array[arrayStart..arrayEnd] (presumably ascending): returns the index of target if
+// a probe hits it, else the lowest index holding a larger value; arrayEnd is the unchecked fallback.
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+// Binary search of array[arrayStart..arrayEnd] (presumably ascending): returns the index of target if
+// a probe hits it, else the highest index holding a smaller value; arrayStart is the unchecked fallback.
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a descriptor holding private copies of the three per-mode arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // entries for unhandled mode types stay NULL, which free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially NULL
+        break;
+    }
+  }
+  return t; // NOTE: allocation results are not checked here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Frees the per-mode slot arrays, the per-mode metadata and the descriptor; vals and the buffers inside the slots are not freed.
+  uint8_t*** slots = t->indices;
+  for (int32_t mode = 0; mode < t->order; ++mode) free(slots[mode]);
+  free(slots);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm1_1.c b/test/kernels/ttm_ttm/ttm1_1.c
new file mode 100644
index 000000000..e016491a2
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm1_1.c
@@ -0,0 +1,219 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage type of one tensor mode (element type of mode_types below)
+typedef struct { // in-memory tensor descriptor; index and value buffers are held as raw byte pointers
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial fallback compiled only when _OPENMP is unset: the single thread has id 0
+int omp_get_max_threads() { return 1; } // serial fallback: reports one available thread
+#endif
+int cmp(const void *a, const void *b) { // three-way int comparator (qsort-style callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1; avoids the signed overflow of a - b
+}
+// Binary search of array[arrayStart..arrayEnd] (presumably ascending): returns the index of target if
+// a probe hits it, else the lowest index holding a larger value; arrayEnd is the unchecked fallback.
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+// Binary search of array[arrayStart..arrayEnd] (presumably ascending): returns the index of target if
+// a probe hits it, else the highest index holding a smaller value; arrayStart is the unchecked fallback.
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a descriptor holding private copies of the three per-mode arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // entries for unhandled mode types stay NULL, which free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially NULL
+        break;
+    }
+  }
+  return t; // NOTE: allocation results are not checked here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Frees the per-mode slot arrays, the per-mode metadata and the descriptor; vals and the buffers inside the slots are not freed.
+  uint8_t*** slots = t->indices;
+  for (int32_t mode = 0; mode < t->order; ++mode) free(slots[mode]);
+  free(slots);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) {
+  // Builds new pos/crd arrays for A2398->indices[1] by copying, row by row, the coordinates of
+  // B->indices[1], then allocates A2398->vals with A23983_dimension doubles per copied coordinate.
+  // C is not read. Returns 0 on success, or -1 with A2398 left unmodified if an allocation fails.
+  int A23981_dimension = (int)(A2398->dimensions[0]);
+  int A23983_dimension = (int)(A2398->dimensions[2]);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  // calloc zero-fills the per-row counts, replacing the explicit clearing loop.
+  int32_t* A23982_pos = (int32_t*)calloc((size_t)A23981_dimension + 1, sizeof(int32_t));
+  int32_t A23982_crd_size = 1048576;
+  int32_t* A23982_crd = (int32_t*)malloc(sizeof(int32_t) * A23982_crd_size);
+  if (A23982_pos == NULL || A23982_crd == NULL) { free(A23982_pos); free(A23982_crd); return -1; }
+  int32_t i1543A2398 = 0;
+
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+      int32_t pA23982_begin = i1543A2398;
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        if (A23982_crd_size <= i1543A2398) {
+          // Grow through a temporary so the old buffer is not lost when realloc fails.
+          int32_t* grown = (int32_t*)realloc(A23982_crd, sizeof(int32_t) * ((size_t)A23982_crd_size * 2));
+          if (grown == NULL) { free(A23982_pos); free(A23982_crd); return -1; }
+          A23982_crd = grown;
+          A23982_crd_size *= 2;
+        }
+        A23982_crd[i1543A2398] = B2_crd[i1543B];
+        i1543A2398++;
+      }
+      A23982_pos[i1542 + 1] = i1543A2398 - pA23982_begin; // per-row count, accumulated below
+    }
+  }
+
+  // Turn the per-row counts into cumulative offsets.
+  int32_t csA23982 = 0;
+  for (int32_t pA239820 = 1; pA239820 < (A23981_dimension + 1); pA239820++) {
+    csA23982 += A23982_pos[pA239820];
+    A23982_pos[pA239820] = csA23982;
+  }
+
+  // Widen before multiplying: the int32 product count * dimension can overflow.
+  size_t valsBytes = sizeof(double) * (size_t)i1543A2398 * (size_t)A23983_dimension;
+  double* A2398_vals = (double*)malloc(valsBytes);
+  if (A2398_vals == NULL && valsBytes != 0) { free(A23982_pos); free(A23982_crd); return -1; }
+  A2398->indices[1][0] = (uint8_t*)(A23982_pos);
+  A2398->indices[1][1] = (uint8_t*)(A23982_crd);
+  A2398->vals = (uint8_t*)A2398_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) {
+  // For each position p in B's level-2 arrays and each column n < C2_dimension, stores
+  //   A2398_vals[p * A23983_dimension + n] = sum over q in [B3_pos[p], B3_pos[p+1]) of
+  //   B_vals[q] * C_vals[B3_crd[q] * C2_dimension + n].   Always returns 0.
+  // NOTE(review): output rows are indexed by B's position p, which presumes A2398 was assembled
+  // with the same level-2 layout as B and that A23983_dimension == C2_dimension — confirm.
+  int A23983_dimension = (int)(A2398->dimensions[2]);
+  double* restrict A2398_vals = (double*)(A2398->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+
+  // Blocks of 16 rows are shared among threads; the schedule is taken from the OpenMP runtime setting.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        // 64-bit offsets: the int32 products position * dimension can overflow on large tensors.
+        int64_t outBase = (int64_t)i1543B * A23983_dimension;
+        for (int32_t i1545 = 0; i1545 < C2_dimension; i1545++) {
+          double ti1544A2398_val = 0.0;
+          for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+            int64_t i1545C = (int64_t)B3_crd[i1544B] * C2_dimension + i1545;
+            ti1544A2398_val += B_vals[i1544B] * C_vals[i1545C];
+          }
+          A2398_vals[outBase + i1545] = ti1544A2398_val;
+        }
+      }
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_1.h"
+int _shim_assemble(void** parameterPack) { // type-erased entry point: casts parameterPack[0..2] to tensor pointers and forwards to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
+int _shim_compute(void** parameterPack) { // type-erased entry point: casts parameterPack[0..2] to tensor pointers and forwards to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
diff --git a/test/kernels/ttm_ttm/ttm1_1.h b/test/kernels/ttm_ttm/ttm1_1.h
new file mode 100644
index 000000000..4c631f227
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm1_1.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage type of one tensor mode (element type of mode_types below)
+typedef struct { // in-memory tensor descriptor; index and value buffers are held as raw byte pointers
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial fallback compiled only when _OPENMP is unset: the single thread has id 0
+int omp_get_max_threads() { return 1; } // serial fallback: reports one available thread
+#endif
+int cmp(const void *a, const void *b) { // three-way int comparator (qsort-style callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1; avoids the signed overflow of a - b
+}
+// Binary search of array[arrayStart..arrayEnd] (presumably ascending): returns the index of target if
+// a probe hits it, else the lowest index holding a larger value; arrayEnd is the unchecked fallback.
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+// Binary search of array[arrayStart..arrayEnd] (presumably ascending): returns the index of target if
+// a probe hits it, else the highest index holding a smaller value; arrayStart is the unchecked fallback.
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a descriptor holding private copies of the three per-mode arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // entries for unhandled mode types stay NULL, which free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially NULL
+        break;
+    }
+  }
+  return t; // NOTE: allocation results are not checked here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Frees the per-mode slot arrays, the per-mode metadata and the descriptor; vals and the buffers inside the slots are not freed.
+  uint8_t*** slots = t->indices;
+  for (int32_t mode = 0; mode < t->order; ++mode) free(slots[mode]);
+  free(slots);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm1_1.so b/test/kernels/ttm_ttm/ttm1_1.so
new file mode 100755
index 000000000..911c44fa1
Binary files /dev/null and b/test/kernels/ttm_ttm/ttm1_1.so differ
diff --git a/test/kernels/ttm_ttm/ttm1_2.c b/test/kernels/ttm_ttm/ttm1_2.c
new file mode 100644
index 000000000..b04e23a54
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm1_2.c
@@ -0,0 +1,219 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage type of one tensor mode (element type of mode_types below)
+typedef struct { // in-memory tensor descriptor; index and value buffers are held as raw byte pointers
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial fallback compiled only when _OPENMP is unset: the single thread has id 0
+int omp_get_max_threads() { return 1; } // serial fallback: reports one available thread
+#endif
+int cmp(const void *a, const void *b) { // three-way int comparator (qsort-style callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1; avoids the signed overflow of a - b
+}
+// Binary search of array[arrayStart..arrayEnd] (presumably ascending): returns the index of target if
+// a probe hits it, else the lowest index holding a larger value; arrayEnd is the unchecked fallback.
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+// Binary search of array[arrayStart..arrayEnd] (presumably ascending): returns the index of target if
+// a probe hits it, else the highest index holding a smaller value; arrayStart is the unchecked fallback.
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a descriptor holding private copies of the three per-mode arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // entries for unhandled mode types stay NULL, which free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially NULL
+        break;
+    }
+  }
+  return t; // NOTE: allocation results are not checked here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Frees the per-mode slot arrays, the per-mode metadata and the descriptor; vals and the buffers inside the slots are not freed.
+  uint8_t*** slots = t->indices;
+  for (int32_t mode = 0; mode < t->order; ++mode) free(slots[mode]);
+  free(slots);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) {
+  // Builds new pos/crd arrays for A3056->indices[1] by copying, row by row, the coordinates of
+  // B->indices[1], then allocates A3056->vals with A30563_dimension doubles per copied coordinate.
+  // A2886 is not read. Returns 0 on success, or -1 with A3056 left unmodified if an allocation fails.
+  int A30561_dimension = (int)(A3056->dimensions[0]);
+  int A30563_dimension = (int)(A3056->dimensions[2]);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  // calloc zero-fills the per-row counts, replacing the explicit clearing loop.
+  int32_t* A30562_pos = (int32_t*)calloc((size_t)A30561_dimension + 1, sizeof(int32_t));
+  int32_t A30562_crd_size = 1048576;
+  int32_t* A30562_crd = (int32_t*)malloc(sizeof(int32_t) * A30562_crd_size);
+  if (A30562_pos == NULL || A30562_crd == NULL) { free(A30562_pos); free(A30562_crd); return -1; }
+  int32_t i1543A3056 = 0;
+
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+      int32_t pA30562_begin = i1543A3056;
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        if (A30562_crd_size <= i1543A3056) {
+          // Grow through a temporary so the old buffer is not lost when realloc fails.
+          int32_t* grown = (int32_t*)realloc(A30562_crd, sizeof(int32_t) * ((size_t)A30562_crd_size * 2));
+          if (grown == NULL) { free(A30562_pos); free(A30562_crd); return -1; }
+          A30562_crd = grown;
+          A30562_crd_size *= 2;
+        }
+        A30562_crd[i1543A3056] = B2_crd[i1543B];
+        i1543A3056++;
+      }
+      A30562_pos[i1542 + 1] = i1543A3056 - pA30562_begin; // per-row count, accumulated below
+    }
+  }
+
+  // Turn the per-row counts into cumulative offsets.
+  int32_t csA30562 = 0;
+  for (int32_t pA305620 = 1; pA305620 < (A30561_dimension + 1); pA305620++) {
+    csA30562 += A30562_pos[pA305620];
+    A30562_pos[pA305620] = csA30562;
+  }
+
+  // Widen before multiplying: the int32 product count * dimension can overflow.
+  size_t valsBytes = sizeof(double) * (size_t)i1543A3056 * (size_t)A30563_dimension;
+  double* A3056_vals = (double*)malloc(valsBytes);
+  if (A3056_vals == NULL && valsBytes != 0) { free(A30562_pos); free(A30562_crd); return -1; }
+  A3056->indices[1][0] = (uint8_t*)(A30562_pos);
+  A3056->indices[1][1] = (uint8_t*)(A30562_crd);
+  A3056->vals = (uint8_t*)A3056_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) {
+  // For each position p in B's level-2 arrays and each column n < A28862_dimension, stores
+  //   A3056_vals[p * A30563_dimension + n] = sum over q in [B3_pos[p], B3_pos[p+1]) of
+  //   B_vals[q] * A2886_vals[B3_crd[q] * A28862_dimension + n].   Always returns 0.
+  // NOTE(review): output rows are indexed by B's position p, which presumes A3056 was assembled
+  // with the same level-2 layout as B and that A30563_dimension == A28862_dimension — confirm.
+  int A30563_dimension = (int)(A3056->dimensions[2]);
+  double* restrict A3056_vals = (double*)(A3056->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int A28862_dimension = (int)(A2886->dimensions[1]);
+  double* restrict A2886_vals = (double*)(A2886->vals);
+
+  // Blocks of 16 rows are shared among threads; the schedule is taken from the OpenMP runtime setting.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        // 64-bit offsets: the int32 products position * dimension can overflow on large tensors.
+        int64_t outBase = (int64_t)i1543B * A30563_dimension;
+        for (int32_t i1546 = 0; i1546 < A28862_dimension; i1546++) {
+          double ti1544A3056_val = 0.0;
+          for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+            int64_t i1546A2886 = (int64_t)B3_crd[i1544B] * A28862_dimension + i1546;
+            ti1544A3056_val += B_vals[i1544B] * A2886_vals[i1546A2886];
+          }
+          A3056_vals[outBase + i1546] = ti1544A3056_val;
+        }
+      }
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_2.h"
+int _shim_assemble(void** parameterPack) { // type-erased entry point: casts parameterPack[0..2] to tensor pointers and forwards to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
+int _shim_compute(void** parameterPack) { // type-erased entry point: casts parameterPack[0..2] to tensor pointers and forwards to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
diff --git a/test/kernels/ttm_ttm/ttm1_2.h b/test/kernels/ttm_ttm/ttm1_2.h
new file mode 100644
index 000000000..86ebdb633
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm1_2.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage type of one tensor mode (element type of mode_types below)
+typedef struct { // in-memory tensor descriptor; index and value buffers are held as raw byte pointers
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial fallback compiled only when _OPENMP is unset: the single thread has id 0
+int omp_get_max_threads() { return 1; } // serial fallback: reports one available thread
+#endif
+int cmp(const void *a, const void *b) { // three-way int comparator (qsort-style callback signature)
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // -1/0/1; avoids the signed overflow of a - b
+}
+// Binary search of array[arrayStart..arrayEnd] (presumably ascending): returns the index of target if
+// a probe hits it, else the lowest index holding a larger value; arrayEnd is the unchecked fallback.
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+// Binary search of array[arrayStart..arrayEnd] (presumably ascending): returns the index of target if
+// a probe hits it, else the highest index holding a smaller value; arrayStart is the unchecked fallback.
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // overflow-safe form of (upperBound + lowerBound) / 2
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    } else if (midValue > target) {
+      upperBound = mid;
+    } else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a descriptor holding private copies of the three per-mode arrays
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals/vals_size start as NULL/0 instead of garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // entries for unhandled mode types stay NULL, which free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially NULL
+        break;
+    }
+  }
+  return t; // NOTE: allocation results are not checked here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Frees the per-mode slot arrays, the per-mode metadata and the descriptor; vals and the buffers inside the slots are not freed.
+  uint8_t*** slots = t->indices;
+  for (int32_t mode = 0; mode < t->order; ++mode) free(slots[mode]);
+  free(slots);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm1_2.so b/test/kernels/ttm_ttm/ttm1_2.so
new file mode 100755
index 000000000..c698ec991
Binary files /dev/null and b/test/kernels/ttm_ttm/ttm1_2.so differ
diff --git a/test/kernels/ttm_ttm/ttm2.c b/test/kernels/ttm_ttm/ttm2.c
new file mode 100644
index 000000000..e98f44e35
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm2.c
@@ -0,0 +1,218 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // fallback compiled only when _OPENMP is undefined or zero: always reports thread 0
+int omp_get_max_threads() { return 1; } // fallback compiled only when _OPENMP is undefined or zero: always reports one thread
+#endif
+int cmp(const void *a, const void *b) { // Three-way comparison of the two ints that a and b point to.
+  return *((const int*)a) - *((const int*)b); // <0, 0 or >0; NOTE(review): the int subtraction can overflow for operands of opposite sign and large magnitude
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // Binary search over indices arrayStart..arrayEnd; assumes ascending order (not checked here).
+  if (array[arrayStart] >= target) { // first element already satisfies the bound
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // stop once the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: the upper end of the narrowed interval
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // Binary search over indices arrayStart..arrayEnd; assumes ascending order (not checked here).
+  if (array[arrayEnd] <= target) { // last element already satisfies the bound
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // stop once the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: the lower end of the narrowed interval
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // Allocates a taco_tensor_t and copies the per-mode metadata into it.
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) { // returns the newly allocated tensor
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): none of the malloc results in this function are checked
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize         = csize; // vals and vals_size are not assigned by this function
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) { // NOTE(review): no default case, so indices[i] is left unset for any other mode value
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // room for one index-array pointer
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // room for two index-array pointers
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // Frees t together with its metadata arrays and per-mode slot arrays.
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the slot array only, not the arrays its slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed here
+}
+#endif
+
+int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { // Allocates A2593's indices[1] pos/crd arrays and its value buffer from A2398's indices[1] structure; D is not used. Returns 0.
+  int A25931_dimension = (int)(A2593->dimensions[0]);
+  int A25933_dimension = (int)(A2593->dimensions[2]);
+  int* restrict A25932_pos = (int*)(A2593->indices[1][0]); // this and the next two loaded pointers are replaced by fresh allocations below
+  int* restrict A25932_crd = (int*)(A2593->indices[1][1]);
+  double* restrict A2593_vals = (double*)(A2593->vals);
+  int A23981_dimension = (int)(A2398->dimensions[0]);
+  int* restrict A23982_pos = (int*)(A2398->indices[1][0]);
+  int* restrict A23982_crd = (int*)(A2398->indices[1][1]);
+
+  A25932_pos = (int32_t*)malloc(sizeof(int32_t) * (A25931_dimension + 1)); // dimensions[0] + 1 entries
+  A25932_pos[0] = 0;
+  for (int32_t pA25932 = 1; pA25932 < (A25931_dimension + 1); pA25932++) {
+    A25932_pos[pA25932] = 0; // zero-filled; entries first receive per-row counts and are turned into offsets later
+  }
+  int32_t A25932_crd_size = 1048576; // initial crd capacity in entries; doubled on demand below
+  A25932_crd = (int32_t*)malloc(sizeof(int32_t) * A25932_crd_size);
+  int32_t i1543A2593 = 0; // number of coordinates written to A25932_crd so far
+
+  for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { // rows of A2398 are visited in chunks of 16
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= A23981_dimension)
+        continue; // skip the padding of the last, partial chunk
+
+      int32_t pA25932_begin = i1543A2593;
+
+      for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) {
+        int32_t i1543 = A23982_crd[i1543A2398];
+        if (A25932_crd_size <= i1543A2593) {
+          A25932_crd = (int32_t*)realloc(A25932_crd, sizeof(int32_t) * (A25932_crd_size * 2)); // NOTE(review): realloc result is not checked
+          A25932_crd_size *= 2;
+        }
+        A25932_crd[i1543A2593] = i1543; // coordinate is copied unchanged
+        i1543A2593++;
+      }
+
+      A25932_pos[i1542 + 1] = i1543A2593 - pA25932_begin; // row count; NOTE(review): pos is sized from A2593->dimensions[0] but indexed by A2398 rows - presumably equal, confirm
+    }
+  }
+
+  int32_t csA25932 = 0;
+  for (int32_t pA259320 = 1; pA259320 < (A25931_dimension + 1); pA259320++) { // running sum converts per-row counts into end offsets
+    csA25932 += A25932_pos[pA259320];
+    A25932_pos[pA259320] = csA25932;
+  }
+
+  A2593_vals = (double*)malloc(sizeof(double) * (i1543A2593 * A25933_dimension)); // dimensions[2] doubles per copied coordinate; contents are not initialized here
+
+  A2593->indices[1][0] = (uint8_t*)(A25932_pos); // store the new arrays in the tensor; the pointers previously held there are not freed
+  A2593->indices[1][1] = (uint8_t*)(A25932_crd);
+  A2593->vals = (uint8_t*)A2593_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { // Fills A2593's values: for every stored indices[1] position of A2398, multiplies that position's value row by the row-major matrix in D. Returns 0.
+  int A25931_dimension = (int)(A2593->dimensions[0]); // not used below
+  int A25933_dimension = (int)(A2593->dimensions[2]);
+  double* restrict A2593_vals = (double*)(A2593->vals);
+  int A23981_dimension = (int)(A2398->dimensions[0]);
+  int A23983_dimension = (int)(A2398->dimensions[2]);
+  int* restrict A23982_pos = (int*)(A2398->indices[1][0]);
+  int* restrict A23982_crd = (int*)(A2398->indices[1][1]); // not used below
+  double* restrict A2398_vals = (double*)(A2398->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+//   int32_t i1543A2593 = 0;
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { // chunks of 16 rows are the unit of work shared between OpenMP threads
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= A23981_dimension)
+        continue; // skip the padding of the last, partial chunk
+
+      for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) {
+        for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { // NOTE(review): output rows are strided by A25933_dimension but D2_dimension columns are written - presumably equal, confirm
+        //   int32_t i1546A2593 = i1543A2593 * A25933_dimension + i1546;
+          int32_t i1546A2593 = i1543A2398 * A25933_dimension + i1546; // output slot is derived from the A2398 position instead of the running counter commented out above
+          double ti1545A2593_val = 0.0;
+          for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { // dot product of the position's value row with column i1546 of D
+            int32_t i1545A2398 = i1543A2398 * A23983_dimension + i1545;
+            int32_t i1546D = i1545 * D2_dimension + i1546;
+            ti1545A2593_val += A2398_vals[i1545A2398] * D_vals[i1546D];
+          }
+          A2593_vals[i1546A2593] = ti1545A2593_val; // plain store: any previous content of the slot is overwritten
+        }
+        // i1543A2593++;
+      }
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm2.h"
+int _shim_assemble(void** parameterPack) { // Adapter: casts parameterPack[0..2] to taco_tensor_t* and forwards them to assemble.
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); // assemble's return value is passed through
+}
+int _shim_compute(void** parameterPack) { // Adapter: casts parameterPack[0..2] to taco_tensor_t* and forwards them to compute.
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); // compute's return value is passed through
+}
diff --git a/test/kernels/ttm_ttm/ttm2.h b/test/kernels/ttm_ttm/ttm2.h
new file mode 100644
index 000000000..40f1400d1
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm2.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // fallback compiled only when _OPENMP is undefined or zero: always reports thread 0
+int omp_get_max_threads() { return 1; } // fallback compiled only when _OPENMP is undefined or zero: always reports one thread
+#endif
+int cmp(const void *a, const void *b) { // Three-way comparison of the two ints that a and b point to.
+  return *((const int*)a) - *((const int*)b); // <0, 0 or >0; NOTE(review): the int subtraction can overflow for operands of opposite sign and large magnitude
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // Binary search over indices arrayStart..arrayEnd; assumes ascending order (not checked here).
+  if (array[arrayStart] >= target) { // first element already satisfies the bound
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // stop once the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: the upper end of the narrowed interval
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // Binary search over indices arrayStart..arrayEnd; assumes ascending order (not checked here).
+  if (array[arrayEnd] <= target) { // last element already satisfies the bound
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // stop once the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: the lower end of the narrowed interval
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // Allocates a taco_tensor_t and copies the per-mode metadata into it.
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) { // returns the newly allocated tensor
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): none of the malloc results in this function are checked
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize         = csize; // vals and vals_size are not assigned by this function
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) { // NOTE(review): no default case, so indices[i] is left unset for any other mode value
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // room for one index-array pointer
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // room for two index-array pointers
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // Frees t together with its metadata arrays and per-mode slot arrays.
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the slot array only, not the arrays its slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed here
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm2.so b/test/kernels/ttm_ttm/ttm2.so
new file mode 100755
index 000000000..16a3d2542
Binary files /dev/null and b/test/kernels/ttm_ttm/ttm2.so differ
diff --git a/test/kernels/ttm_ttm/ttm_original copy 2.c b/test/kernels/ttm_ttm/ttm_original copy 2.c
new file mode 100644
index 000000000..cb21b209f
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original copy 2.c	
@@ -0,0 +1,242 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // fallback compiled only when _OPENMP is undefined or zero: always reports thread 0
+int omp_get_max_threads() { return 1; } // fallback compiled only when _OPENMP is undefined or zero: always reports one thread
+#endif
+int cmp(const void *a, const void *b) { // Three-way comparison of the two ints that a and b point to.
+  return *((const int*)a) - *((const int*)b); // <0, 0 or >0; NOTE(review): the int subtraction can overflow for operands of opposite sign and large magnitude
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // Binary search over indices arrayStart..arrayEnd; assumes ascending order (not checked here).
+  if (array[arrayStart] >= target) { // first element already satisfies the bound
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // stop once the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: the upper end of the narrowed interval
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // Binary search over indices arrayStart..arrayEnd; assumes ascending order (not checked here).
+  if (array[arrayEnd] <= target) { // last element already satisfies the bound
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // stop once the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: the lower end of the narrowed interval
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // Allocates a taco_tensor_t and copies the per-mode metadata into it.
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) { // returns the newly allocated tensor
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): none of the malloc results in this function are checked
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize         = csize; // vals and vals_size are not assigned by this function
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) { // NOTE(review): no default case, so indices[i] is left unset for any other mode value
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // room for one index-array pointer
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // room for two index-array pointers
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // Frees t together with its metadata arrays and per-mode slot arrays.
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the slot array only, not the arrays its slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed here
+}
+#endif
+
+int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // Allocates A1537's indices[1] pos/crd arrays and its value buffer from B's indices[1] structure; C and D are not used. Returns 0.
+  int A15371_dimension = (int)(A1537->dimensions[0]);
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  int* restrict A15372_pos = (int*)(A1537->indices[1][0]); // this and the next two loaded pointers are replaced by fresh allocations below
+  int* restrict A15372_crd = (int*)(A1537->indices[1][1]);
+  double* restrict A1537_vals = (double*)(A1537->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); // dimensions[0] + 1 entries
+  A15372_pos[0] = 0;
+  for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) {
+    A15372_pos[pA15372] = 0; // zero-filled; entries first receive per-row counts and are turned into offsets later
+  }
+  int32_t A15372_crd_size = 1048576; // initial crd capacity in entries; doubled on demand below
+  A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size);
+  int32_t i1543A1537 = 0; // number of coordinates written to A15372_crd so far
+
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // rows of B are visited in chunks of 16
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue; // skip the padding of the last, partial chunk
+
+      int32_t pA15372_begin = i1543A1537;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15372_crd_size <= i1543A1537) {
+          A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); // NOTE(review): realloc result is not checked
+          A15372_crd_size *= 2;
+        }
+        A15372_crd[i1543A1537] = i1543; // coordinate is copied unchanged
+        i1543A1537++;
+      }
+
+      A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; // row count; NOTE(review): pos is sized from A1537->dimensions[0] but indexed by B rows - presumably equal, confirm
+    }
+  }
+
+  int32_t csA15372 = 0;
+  for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { // running sum converts per-row counts into end offsets
+    csA15372 += A15372_pos[pA153720];
+    A15372_pos[pA153720] = csA15372;
+  }
+
+  A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); // dimensions[2] doubles per copied coordinate; contents are not initialized here
+
+  A1537->indices[1][0] = (uint8_t*)(A15372_pos); // store the new arrays in the tensor; the pointers previously held there are not freed
+  A1537->indices[1][1] = (uint8_t*)(A15372_crd);
+  A1537->vals = (uint8_t*)A1537_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // Accumulates A1537(p, l) += B(q) * C(k, m) * D(m, l) into A1537's values, where p is a B indices[1] position, q a B indices[2] position below p and k the coordinate stored at q. Returns 0.
+  int A15371_dimension = (int)(A1537->dimensions[0]);
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  int* restrict A15372_pos = (int*)(A1537->indices[1][0]);
+  double* restrict A1537_vals = (double*)(A1537->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+  // int32_t i1543A1537 = 0;
+
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1537 = 0; pA1537 < (A15372_pos[A15371_dimension] * A15373_dimension); pA1537++) {
+    A1537_vals[pA1537] = 0.0; // clear every output slot first; the kernel below accumulates with +=
+  }
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // chunks of 16 rows are the unit of work shared between OpenMP threads
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue; // skip the padding of the last, partial chunk
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+          int32_t i1544 = B3_crd[i1544B];
+          for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) { // D is traversed in 32 x 32 tiles
+            for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) {
+              for (int32_t i1554 = 0; i1554 < 32; i1554++) {
+                int32_t i1545 = i1553 * 32 + i1554;
+                int32_t i1545C = i1544 * C2_dimension + i1545;
+                if (i1545 >= D1_dimension)
+                  continue; // tile padding past the last row of D
+
+                for (int32_t i1556 = 0; i1556 < 32; i1556++) {
+                  int32_t i1546 = i1555 * 32 + i1556;
+                  // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546;
+                  int32_t i1546A1537 = i1543B * A15373_dimension + i1546; // fixed: index by the indices[1] position (was i1544B), matching the zero-fill loop and the retired running counter
+                  int32_t i1546D = i1545 * D2_dimension + i1546;
+                  if (i1546 >= D2_dimension)
+                    continue; // tile padding past the last column of D
+
+                  A1537_vals[i1546A1537] = A1537_vals[i1546A1537] + (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D];
+                }
+              }
+            }
+          }
+        }
+        
+        // i1543A1537++;
+      }
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h"
+int _shim_assemble(void** parameterPack) { // Adapter: casts parameterPack[0..3] to taco_tensor_t* and forwards them to assemble.
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); // assemble's return value is passed through
+}
+int _shim_compute(void** parameterPack) { // Adapter: casts parameterPack[0..3] to taco_tensor_t* and forwards them to compute.
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); // compute's return value is passed through
+}
diff --git a/test/kernels/ttm_ttm/ttm_original copy.c b/test/kernels/ttm_ttm/ttm_original copy.c
new file mode 100644
index 000000000..2db396c0a
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original copy.c	
@@ -0,0 +1,225 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // fallback compiled only when _OPENMP is undefined or zero: always reports thread 0
+int omp_get_max_threads() { return 1; } // fallback compiled only when _OPENMP is undefined or zero: always reports one thread
+#endif
+int cmp(const void *a, const void *b) { // Three-way comparison of the two ints that a and b point to.
+  return *((const int*)a) - *((const int*)b); // <0, 0 or >0; NOTE(review): the int subtraction can overflow for operands of opposite sign and large magnitude
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // Binary search over indices arrayStart..arrayEnd; assumes ascending order (not checked here).
+  if (array[arrayStart] >= target) { // first element already satisfies the bound
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // stop once the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: the upper end of the narrowed interval
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // Binary search over indices arrayStart..arrayEnd; assumes ascending order (not checked here).
+  if (array[arrayEnd] <= target) { // last element already satisfies the bound
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // stop once the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: the lower end of the narrowed interval
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // Allocates a taco_tensor_t and copies the per-mode metadata into it.
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) { // returns the newly allocated tensor
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): none of the malloc results in this function are checked
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize         = csize; // vals and vals_size are not assigned by this function
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) { // NOTE(review): no default case, so indices[i] is left unset for any other mode value
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // room for one index-array pointer
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // room for two index-array pointers
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // Frees t together with its metadata arrays and per-mode slot arrays.
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the slot array only, not the arrays its slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed here
+}
+#endif
+
+int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // Allocates A1537's indices[1] pos/crd arrays and its value buffer from B's indices[1] structure; C and D are not used. Returns 0.
+  int A15371_dimension = (int)(A1537->dimensions[0]);
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  int* restrict A15372_pos = (int*)(A1537->indices[1][0]); // this and the next two loaded pointers are replaced by fresh allocations below
+  int* restrict A15372_crd = (int*)(A1537->indices[1][1]);
+  double* restrict A1537_vals = (double*)(A1537->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); // dimensions[0] + 1 entries
+  A15372_pos[0] = 0;
+  for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) {
+    A15372_pos[pA15372] = 0; // zero-filled; entries first receive per-row counts and are turned into offsets later
+  }
+  int32_t A15372_crd_size = 1048576; // initial crd capacity in entries; doubled on demand below
+  A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size);
+  int32_t i1543A1537 = 0; // number of coordinates written to A15372_crd so far
+
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // rows of B are visited in chunks of 16
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue; // skip the padding of the last, partial chunk
+
+      int32_t pA15372_begin = i1543A1537;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15372_crd_size <= i1543A1537) {
+          A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); // NOTE(review): realloc result is not checked
+          A15372_crd_size *= 2;
+        }
+        A15372_crd[i1543A1537] = i1543; // coordinate is copied unchanged
+        i1543A1537++;
+      }
+
+      A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; // row count; NOTE(review): pos is sized from A1537->dimensions[0] but indexed by B rows - presumably equal, confirm
+    }
+  }
+
+  int32_t csA15372 = 0;
+  for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { // running sum converts per-row counts into end offsets
+    csA15372 += A15372_pos[pA153720];
+    A15372_pos[pA153720] = csA15372;
+  }
+
+  A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); // dimensions[2] doubles per copied coordinate; contents are not initialized here
+
+  A1537->indices[1][0] = (uint8_t*)(A15372_pos); // store the new arrays in the tensor; the pointers previously held there are not freed
+  A1537->indices[1][1] = (uint8_t*)(A15372_crd);
+  A1537->vals = (uint8_t*)A1537_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // Stores A1537(p, l) = sum over q, m of B(q) * C(k, m) * D(m, l), where p is a B indices[1] position, q ranges over the B indices[2] positions below p and k is the coordinate stored at q. Returns 0.
+  int A15371_dimension = (int)(A1537->dimensions[0]); // not used below
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  double* restrict A1537_vals = (double*)(A1537->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]); // not used below
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]); // not used below
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+  // int32_t i1543A1537 = 0;
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // chunks of 16 rows are the unit of work shared between OpenMP threads
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue; // skip the padding of the last, partial chunk
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { // NOTE(review): output rows are strided by A15373_dimension but D2_dimension columns are written - presumably equal, confirm
+          // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546;
+          int32_t i1546A1537 = i1543B * A15373_dimension + i1546; // output slot is derived from the B position instead of the running counter commented out above
+          double ti1544A1537_val = 0.0; // local accumulator for this output slot
+          for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+            int32_t i1544 = B3_crd[i1544B];
+            for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { // NOTE(review): rows of C are strided by C2_dimension but D1_dimension entries are read - presumably equal, confirm
+              int32_t i1545C = i1544 * C2_dimension + i1545;
+              int32_t i1546D = i1545 * D2_dimension + i1546;
+              ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D];
+            }
+          }
+          A1537_vals[i1546A1537] = ti1544A1537_val; // plain store: any previous content of the slot is overwritten
+        }
+        // i1543A1537++;
+      }
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h"
+int _shim_assemble(void** parameterPack) { // Adapter: casts parameterPack[0..3] to taco_tensor_t* and forwards them to assemble.
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); // assemble's return value is passed through
+}
+int _shim_compute(void** parameterPack) { // Adapter: casts parameterPack[0..3] to taco_tensor_t* and forwards them to compute.
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); // compute's return value is passed through
+}
diff --git a/test/kernels/ttm_ttm/ttm_original.c b/test/kernels/ttm_ttm/ttm_original.c
new file mode 100644
index 000000000..ac2674239
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original.c
@@ -0,0 +1,226 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // comparator over two int objects: returns -1, 0 or 1 as *a is less than, equal to or greater than *b
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // unlike *a - *b this cannot overflow for operands far apart
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search in array[arrayStart..arrayEnd] (assumed sorted ascending): index of a probed element equal to target, else the final upper bracket
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum of both bounds is never formed, so it cannot overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search in array[arrayStart..arrayEnd] (assumed sorted ascending): index of a probed element equal to target, else the final lower bracket
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum of both bounds is never formed, so it cannot overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a tensor descriptor and copies order, csize and the three per-mode arrays (each read for `order` entries) into it
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zero-filled so vals / vals_size, which this function never assigns, start as NULL / 0; NOTE(review): allocation failures are not checked anywhere in this function
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // zeroed: a mode type not handled by the switch below leaves a NULL entry, which free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one array slot for a dense mode, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two array slots for a sparse mode, initially NULL
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // releases the descriptor's metadata arrays and t itself; t->vals and the arrays stored inside indices[i][..] are not freed here
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees only the per-mode slot array, not what its slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t must not be used after this call
+}
+#endif
+
+int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // builds A1537's second-level pos/crd arrays as a copy of B's and allocates A1537's value buffer; C and D are not read; always returns 0
+  int A15371_dimension = (int)(A1537->dimensions[0]);
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  int* restrict A15372_pos = (int*)(A1537->indices[1][0]);
+  int* restrict A15372_crd = (int*)(A1537->indices[1][1]);
+  double* restrict A1537_vals = (double*)(A1537->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); // the pointers loaded above are replaced, not freed; NOTE(review): malloc/realloc results are not checked in this function
+  A15372_pos[0] = 0;
+  for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) {
+    A15372_pos[pA15372] = 0;
+  }
+  int32_t A15372_crd_size = 1048576;
+  A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size);
+  int32_t i1543A1537 = 0; // number of coordinates appended to A's crd array so far
+
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // rows of B are visited in tiles of 16
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension) // last tile may be partial
+        continue;
+
+      int32_t pA15372_begin = i1543A1537;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15372_crd_size <= i1543A1537) {
+          A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * 2 * (size_t)A15372_crd_size); // byte count formed in size_t so doubling cannot wrap int32
+          A15372_crd_size *= 2;
+        }
+        A15372_crd[i1543A1537] = i1543;
+        i1543A1537++;
+      }
+
+      A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; // per-row count for now; presumably B1_dimension <= A15371_dimension, otherwise this writes past pos — confirm with caller
+    }
+  }
+
+  int32_t csA15372 = 0;
+  for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { // prefix sum turns the per-row counts into offsets
+    csA15372 += A15372_pos[pA153720];
+    A15372_pos[pA153720] = csA15372;
+  }
+
+  A1537_vals = (double*)malloc(sizeof(double) * ((size_t)i1543A1537 * A15373_dimension)); // element count widened before multiplying so large tensors do not wrap int32; contents are left uninitialized
+
+  A1537->indices[1][0] = (uint8_t*)(A15372_pos);
+  A1537->indices[1][1] = (uint8_t*)(A15372_crd);
+  A1537->vals = (uint8_t*)A1537_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // overwrites A1537's values: for each second-level position of B and each column of D, sums B*C*D over B's third-level entries and D's rows; always returns 0
+  int A15371_dimension = (int)(A1537->dimensions[0]);
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  double* restrict A1537_vals = (double*)(A1537->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+  // int32_t i1543A1537 = 0;
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // rows of B are visited in tiles of 16, one tile per parallel iteration
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension) // last tile may be partial
+        continue;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) {
+          // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546;
+          int64_t i1546A1537 = (int64_t)i1543B * A15373_dimension + i1546; // B's second-level position doubles as A's; formed in 64 bits so position * width cannot wrap int32
+          double ti1544A1537_val = 0.0;
+          for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+            int32_t i1544 = B3_crd[i1544B];
+            for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { // presumably D1_dimension == C2_dimension (the contracted extent) — confirm with caller
+              int64_t i1545C = (int64_t)i1544 * C2_dimension + i1545; // 64-bit flattened index into C
+              int64_t i1546D = (int64_t)i1545 * D2_dimension + i1546; // 64-bit flattened index into D
+              ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D];
+            }
+          }
+          A1537_vals[i1546A1537] = ti1544A1537_val;
+        }
+        // i1543A1537++;
+      }
+    }
+  }
+  return 0;
+}
+
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h"
+int _shim_assemble(void** parameterPack) { // type-erased entry point: casts parameterPack[0..3] to taco_tensor_t* and forwards them, in order, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // type-erased entry point: casts parameterPack[0..3] to taco_tensor_t* and forwards them, in order, to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/ttm_ttm/ttm_original.h b/test/kernels/ttm_ttm/ttm_original.h
new file mode 100644
index 000000000..a27841047
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // comparator over two int objects: returns -1, 0 or 1 as *a is less than, equal to or greater than *b
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // unlike *a - *b this cannot overflow for operands far apart
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search in array[arrayStart..arrayEnd] (assumed sorted ascending): index of a probed element equal to target, else the final upper bracket
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum of both bounds is never formed, so it cannot overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search in array[arrayStart..arrayEnd] (assumed sorted ascending): index of a probed element equal to target, else the final lower bracket
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum of both bounds is never formed, so it cannot overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a tensor descriptor and copies order, csize and the three per-mode arrays (each read for `order` entries) into it
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zero-filled so vals / vals_size, which this function never assigns, start as NULL / 0; NOTE(review): allocation failures are not checked anywhere in this function
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // zeroed: a mode type not handled by the switch below leaves a NULL entry, which free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one array slot for a dense mode, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two array slots for a sparse mode, initially NULL
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // releases the descriptor's metadata arrays and t itself; t->vals and the arrays stored inside indices[i][..] are not freed here
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees only the per-mode slot array, not what its slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t must not be used after this call
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm_original.so b/test/kernels/ttm_ttm/ttm_original.so
new file mode 100755
index 000000000..fa04aed35
Binary files /dev/null and b/test/kernels/ttm_ttm/ttm_original.so differ
diff --git a/test/kernels/ttm_ttm/ttm_original2.c b/test/kernels/ttm_ttm/ttm_original2.c
new file mode 100644
index 000000000..8dd62d6dd
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original2.c
@@ -0,0 +1,229 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // comparator over two int objects: returns -1, 0 or 1 as *a is less than, equal to or greater than *b
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // unlike *a - *b this cannot overflow for operands far apart
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search in array[arrayStart..arrayEnd] (assumed sorted ascending): index of a probed element equal to target, else the final upper bracket
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum of both bounds is never formed, so it cannot overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search in array[arrayStart..arrayEnd] (assumed sorted ascending): index of a probed element equal to target, else the final lower bracket
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum of both bounds is never formed, so it cannot overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a tensor descriptor and copies order, csize and the three per-mode arrays (each read for `order` entries) into it
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zero-filled so vals / vals_size, which this function never assigns, start as NULL / 0; NOTE(review): allocation failures are not checked anywhere in this function
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // zeroed: a mode type not handled by the switch below leaves a NULL entry, which free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one array slot for a dense mode, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two array slots for a sparse mode, initially NULL
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // releases the descriptor's metadata arrays and t itself; t->vals and the arrays stored inside indices[i][..] are not freed here
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees only the per-mode slot array, not what its slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t must not be used after this call
+}
+#endif
+
+int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // builds A1542's second-level pos/crd arrays as a copy of B's and allocates A1542's value buffer; C and D are not read; always returns 0
+  int A15421_dimension = (int)(A1542->dimensions[0]);
+  int A15423_dimension = (int)(A1542->dimensions[2]);
+  int* restrict A15422_pos = (int*)(A1542->indices[1][0]);
+  int* restrict A15422_crd = (int*)(A1542->indices[1][1]);
+  double* restrict A1542_vals = (double*)(A1542->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  A15422_pos = (int32_t*)malloc(sizeof(int32_t) * (A15421_dimension + 1)); // the pointers loaded above are replaced, not freed; NOTE(review): malloc/realloc results are not checked in this function
+  A15422_pos[0] = 0;
+  for (int32_t pA15422 = 1; pA15422 < (A15421_dimension + 1); pA15422++) {
+    A15422_pos[pA15422] = 0;
+  }
+  int32_t A15422_crd_size = 1048576;
+  A15422_crd = (int32_t*)malloc(sizeof(int32_t) * A15422_crd_size);
+  int32_t i1548A1542 = 0; // number of coordinates appended to A's crd array so far
+
+  for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) { // rows of B are visited in tiles of 16
+    for (int32_t i1553 = 0; i1553 < 16; i1553++) {
+      int32_t i1547 = i1552 * 16 + i1553;
+      if (i1547 >= B1_dimension) // last tile may be partial
+        continue;
+
+      int32_t pA15422_begin = i1548A1542;
+
+      for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) {
+        int32_t i1548 = B2_crd[i1548B];
+        if (A15422_crd_size <= i1548A1542) {
+          A15422_crd = (int32_t*)realloc(A15422_crd, sizeof(int32_t) * 2 * (size_t)A15422_crd_size); // byte count formed in size_t so doubling cannot wrap int32
+          A15422_crd_size *= 2;
+        }
+        A15422_crd[i1548A1542] = i1548;
+        i1548A1542++;
+      }
+
+      A15422_pos[i1547 + 1] = i1548A1542 - pA15422_begin; // per-row count for now; presumably B1_dimension <= A15421_dimension, otherwise this writes past pos — confirm with caller
+    }
+  }
+
+  int32_t csA15422 = 0;
+  for (int32_t pA154220 = 1; pA154220 < (A15421_dimension + 1); pA154220++) { // prefix sum turns the per-row counts into offsets
+    csA15422 += A15422_pos[pA154220];
+    A15422_pos[pA154220] = csA15422;
+  }
+
+  A1542_vals = (double*)malloc(sizeof(double) * ((size_t)i1548A1542 * A15423_dimension)); // element count widened before multiplying so large tensors do not wrap int32; contents are left uninitialized
+
+  A1542->indices[1][0] = (uint8_t*)(A15422_pos);
+  A1542->indices[1][1] = (uint8_t*)(A15422_crd);
+  A1542->vals = (uint8_t*)A1542_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // zeroes A1542's values, then accumulates B*C*D into them for every third-level entry of B, row of D and column of D; always returns 0
+  int A15421_dimension = (int)(A1542->dimensions[0]);
+  int A15423_dimension = (int)(A1542->dimensions[2]);
+  int* restrict A15422_pos = (int*)(A1542->indices[1][0]);
+  double* restrict A1542_vals = (double*)(A1542->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+//   int32_t i1548A1542 = 0;
+
+  #pragma omp parallel for schedule(static)
+  for (int64_t pA1542 = 0; pA1542 < ((int64_t)A15422_pos[A15421_dimension] * A15423_dimension); pA1542++) { // 64-bit bound and counter: positions * width can exceed int32 for large tensors
+    A1542_vals[pA1542] = 0.0;
+  }
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) { // rows of B are visited in tiles of 16, one tile per parallel iteration
+    for (int32_t i1553 = 0; i1553 < 16; i1553++) {
+      int32_t i1547 = i1552 * 16 + i1553;
+      if (i1547 >= B1_dimension) // last tile may be partial
+        continue;
+
+      for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) {
+        for (int32_t i1549B = B3_pos[i1548B]; i1549B < B3_pos[(i1548B + 1)]; i1549B++) {
+          int32_t i1549 = B3_crd[i1549B];
+          for (int32_t i1550 = 0; i1550 < D1_dimension; i1550++) { // presumably D1_dimension == C2_dimension (the contracted extent) — confirm with caller
+            int64_t i1550C = (int64_t)i1549 * C2_dimension + i1550; // 64-bit flattened index into C
+            for (int32_t i1551 = 0; i1551 < D2_dimension; i1551++) {
+            //   int32_t i1551A1542 = i1548A1542 * A15423_dimension + i1551;
+              int64_t i1551A1542 = (int64_t)i1548B * A15423_dimension + i1551; // B's second-level position doubles as A's; formed in 64 bits so position * width cannot wrap int32
+              int64_t i1551D = (int64_t)i1550 * D2_dimension + i1551; // 64-bit flattened index into D
+              A1542_vals[i1551A1542] = A1542_vals[i1551A1542] + (B_vals[i1549B] * C_vals[i1550C]) * D_vals[i1551D];
+            }
+          }
+        }
+        // i1548A1542++;
+      }
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original2.h"
+int _shim_assemble(void** parameterPack) { // type-erased entry point: casts parameterPack[0..3] to taco_tensor_t* and forwards them, in order, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // type-erased entry point: casts parameterPack[0..3] to taco_tensor_t* and forwards them, in order, to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/ttm_ttm/ttm_original2.h b/test/kernels/ttm_ttm/ttm_original2.h
new file mode 100644
index 000000000..8a08b4548
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original2.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // comparator over two int objects: returns -1, 0 or 1 as *a is less than, equal to or greater than *b
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // unlike *a - *b this cannot overflow for operands far apart
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // binary search in array[arrayStart..arrayEnd] (assumed sorted ascending): index of a probed element equal to target, else the final upper bracket
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum of both bounds is never formed, so it cannot overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // binary search in array[arrayStart..arrayEnd] (assumed sorted ascending): index of a probed element equal to target, else the final lower bracket
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum of both bounds is never formed, so it cannot overflow int
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a tensor descriptor and copies order, csize and the three per-mode arrays (each read for `order` entries) into it
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zero-filled so vals / vals_size, which this function never assigns, start as NULL / 0; NOTE(review): allocation failures are not checked anywhere in this function
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // zeroed: a mode type not handled by the switch below leaves a NULL entry, which free() accepts
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one array slot for a dense mode, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two array slots for a sparse mode, initially NULL
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // releases the descriptor's metadata arrays and t itself; t->vals and the arrays stored inside indices[i][..] are not freed here
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees only the per-mode slot array, not what its slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t must not be used after this call
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm_original2.so b/test/kernels/ttm_ttm/ttm_original2.so
new file mode 100755
index 000000000..6466a2af2
Binary files /dev/null and b/test/kernels/ttm_ttm/ttm_original2.so differ
diff --git a/test/stats/hadamard-gemm.txt b/test/stats/hadamard-gemm.txt
new file mode 100644
index 000000000..6e730cf50
--- /dev/null
+++ b/test/stats/hadamard-gemm.txt
@@ -0,0 +1,921 @@
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  22.4288 ms
+fused time: 23.1383
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  8.99985 ms
+fused time: 9.71943
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  8.65832 ms
+fused time: 9.33544
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  21.7432 ms
+fused time: 22.466
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  25.8057 ms
+fused time: 26.4891
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  26.7972 ms
+fused time: 27.2892
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  46.4376 ms
+fused time: 47.1315
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  26.8781 ms
+fused time: 27.4325
+
+kernel execution time:  61.7475 ms
+taco reference time: 62.3899
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  25.4837 ms
+fused time: 25.9563
+
+kernel execution time:  15.5567 ms
+sddmm time: 16.2101
+
+kernel execution time:  73.7443 ms
+taco reference time: 74.42
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  24.5312 ms
+fused time: 25.0641
+
+kernel execution time:  14.7877 ms
+hadamard time: 15.4539
+
+kernel execution time:  18.149 ms
+gemm time: 18.7191
+
+kernel execution time:  73.8142 ms
+taco reference time: 74.4567
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 206500, D2_dimension: 128, vals: 26432000
+E1_dimension: 128, E2_dimension: 64, vals: 8192
+
+
+kernel execution time:  36.5794 ms
+fused time: 37.1963
+
+kernel execution time:  31.9277 ms
+hadamard time: 32.6108
+
+kernel execution time:  28.0947 ms
+gemm time: 28.7572
+
+kernel execution time:  203.157 ms
+taco reference time: 203.921
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 206500, D2_dimension: 128, vals: 26432000
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  42.4207 ms
+fused time: 42.9584
+
+kernel execution time:  31.1526 ms
+hadamard time: 31.8623
+
+kernel execution time:  62.6041 ms
+gemm time: 63.199
+
+kernel execution time:  416.714 ms
+taco reference time: 417.403
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 128, vals: 711465728
+D1_dimension: 5558326, D2_dimension: 128, vals: 711465728
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  1265.12 ms
+fused time: 1266.15
+
+kernel execution time:  4815.82 ms
+hadamard time: 4816.95
+
+kernel execution time:  1478.77 ms
+gemm time: 1479.51
+
+kernel execution time:  63618.8 ms
+taco reference time: 63619.9
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 10974, D2_dimension: 128, vals: 1404672
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  4.44366 ms
+fused time: 5.30002
+
+kernel execution time:  1.60353 ms
+hadamard time: 2.06029
+
+kernel execution time:  4.56709 ms
+gemm time: 4.9084
+
+kernel execution time:  52.2837 ms
+taco reference time: 52.7156
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 128, vals: 4661376
+D1_dimension: 36417, D2_dimension: 128, vals: 4661376
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  13.0806 ms
+fused time: 13.6544
+
+kernel execution time:  12.1216 ms
+hadamard time: 12.8046
+
+kernel execution time:  11.8732 ms
+gemm time: 12.47
+
+kernel execution time:  477.422 ms
+taco reference time: 477.987
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 128, vals: 5994880
+D1_dimension: 46835, D2_dimension: 128, vals: 5994880
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  13.6475 ms
+fused time: 14.2071
+
+kernel execution time:  12.1816 ms
+hadamard time: 12.8468
+
+kernel execution time:  14.7018 ms
+gemm time: 15.233
+
+kernel execution time:  251.649 ms
+taco reference time: 252.229
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 128, vals: 7993728
+D1_dimension: 62451, D2_dimension: 128, vals: 7993728
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  20.2137 ms
+fused time: 20.7037
+
+kernel execution time:  19.6828 ms
+hadamard time: 20.2722
+
+kernel execution time:  18.5323 ms
+gemm time: 19.0234
+
+kernel execution time:  415.255 ms
+taco reference time: 415.805
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 128, vals: 10666752
+D1_dimension: 83334, D2_dimension: 128, vals: 10666752
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  28.1295 ms
+fused time: 28.6289
+
+kernel execution time:  28.2393 ms
+hadamard time: 28.8514
+
+kernel execution time:  24.2246 ms
+gemm time: 24.7551
+
+kernel execution time:  597.455 ms
+taco reference time: 598.049
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 128, vals: 15512576
+D1_dimension: 121192, D2_dimension: 128, vals: 15512576
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  49.6444 ms
+fused time: 50.1899
+
+kernel execution time:  45.97 ms
+hadamard time: 46.6381
+
+kernel execution time:  33.5119 ms
+gemm time: 34.0815
+
+kernel execution time:  258.507 ms
+taco reference time: 259.153
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 140874, D2_dimension: 128, vals: 18031872
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  42.1499 ms
+fused time: 42.7069
+
+kernel execution time:  41.9158 ms
+hadamard time: 42.597
+
+kernel execution time:  37.5761 ms
+gemm time: 38.1603
+
+kernel execution time:  748.178 ms
+taco reference time: 748.913
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 128, vals: 21887744
+D1_dimension: 170998, D2_dimension: 128, vals: 21887744
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  32.0664 ms
+fused time: 32.5614
+
+kernel execution time:  27.8304 ms
+hadamard time: 28.5102
+
+kernel execution time:  45.5743 ms
+gemm time: 46.1921
+
+kernel execution time:  97.9936 ms
+taco reference time: 98.6611
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 206500, D2_dimension: 128, vals: 26432000
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  42.0101 ms
+fused time: 42.5555
+
+kernel execution time:  38.2596 ms
+hadamard time: 38.9704
+
+kernel execution time:  55.2502 ms
+gemm time: 55.8132
+
+kernel execution time:  128.93 ms
+taco reference time: 129.615
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 128, vals: 128000640
+D1_dimension: 1000005, D2_dimension: 128, vals: 128000640
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  156.672 ms
+fused time: 157.149
+
+kernel execution time:  108.579 ms
+hadamard time: 109.187
+
+kernel execution time:  266.855 ms
+gemm time: 267.343
+
+kernel execution time:  325.2 ms
+taco reference time: 325.907
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 128, vals: 711465728
+D1_dimension: 5558326, D2_dimension: 128, vals: 711465728
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  1267.69 ms
+fused time: 1268.78
+
+kernel execution time:  1173.34 ms
+hadamard time: 1174.13
+
+kernel execution time:  1502.45 ms
+gemm time: 1503.33
+
+kernel execution time:  12918.1 ms
+taco reference time: 12919.5
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 140874, D2_dimension: 128, vals: 18031872
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 140874, D2_dimension: 128, vals: 18031872
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  44.4685 ms
+fused time: 47.652
+
+kernel execution time:  39.859 ms
+hadamard time: 40.465
+
+kernel execution time:  40.2328 ms
+gemm time: 40.7652
+
+kernel execution time:  770.504 ms
+taco reference time: 771.113
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 128, vals: 21887744
+D1_dimension: 170998, D2_dimension: 128, vals: 21887744
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  32.6037 ms
+fused time: 36.0777
+
+kernel execution time:  27.1815 ms
+hadamard time: 27.8676
+
+kernel execution time:  46.1458 ms
+gemm time: 46.6699
+
+kernel execution time:  97.8299 ms
+taco reference time: 98.5149
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 206500, D2_dimension: 128, vals: 26432000
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  42.3414 ms
+fused time: 46.4717
+
+kernel execution time:  37.0604 ms
+hadamard time: 37.7717
+
+kernel execution time:  55.4753 ms
+gemm time: 56.0538
+
+kernel execution time:  129.339 ms
+taco reference time: 130.028
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 128, vals: 128000640
+D1_dimension: 1000005, D2_dimension: 128, vals: 128000640
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  159.647 ms
+fused time: 164.344
+
+kernel execution time:  110.823 ms
+hadamard time: 111.516
+
+kernel execution time:  268.805 ms
+gemm time: 269.465
+
+kernel execution time:  326.437 ms
+taco reference time: 327.144
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 10974, D2_dimension: 128, vals: 1404672
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  80.3808 ms
+fused time: 82.9372
+
+kernel execution time:  17.8402 ms
+hadamard time: 18.4152
+
+kernel execution time:  127.495 ms
+gemm time: 128.275
+
+kernel execution time:  1763.16 ms
+taco reference time: 1763.78
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 128, vals: 4661376
+D1_dimension: 36417, D2_dimension: 128, vals: 4661376
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  352.899 ms
+fused time: 356.76
+
+kernel execution time:  157.362 ms
+hadamard time: 157.893
+
+kernel execution time:  406.42 ms
+gemm time: 407.203
+
+kernel execution time:  17839.4 ms
+taco reference time: 17840.5
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 128, vals: 5994880
+D1_dimension: 46835, D2_dimension: 128, vals: 5994880
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  360.403 ms
+fused time: 364.207
+
+kernel execution time:  92.7639 ms
+hadamard time: 93.2881
+
+kernel execution time:  519.132 ms
+gemm time: 519.668
+
+kernel execution time:  9767.06 ms
+taco reference time: 9767.66
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 128, vals: 7993728
+D1_dimension: 62451, D2_dimension: 128, vals: 7993728
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  499.64 ms
+fused time: 503.449
+
+kernel execution time:  148.888 ms
+hadamard time: 149.416
+
+kernel execution time:  689.134 ms
+gemm time: 689.652
+
+kernel execution time:  16929 ms
+taco reference time: 16930
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 128, vals: 10666752
+D1_dimension: 83334, D2_dimension: 128, vals: 10666752
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  690.556 ms
+fused time: 694.221
+
+kernel execution time:  230.454 ms
+hadamard time: 230.979
+
+kernel execution time:  922.831 ms
+gemm time: 923.322
+
+kernel execution time:  24781.4 ms
+taco reference time: 24782.4
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 128, vals: 15512576
+D1_dimension: 121192, D2_dimension: 128, vals: 15512576
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  871.577 ms
+fused time: 876.166
+
+kernel execution time:  213.157 ms
+hadamard time: 213.706
+
+kernel execution time:  1342.88 ms
+gemm time: 1343.39
+
+kernel execution time:  10845 ms
+taco reference time: 10846.1
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 140874, D2_dimension: 128, vals: 18031872
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  1074.54 ms
+fused time: 1078.91
+
+kernel execution time:  302.447 ms
+hadamard time: 302.972
+
+kernel execution time:  1560.59 ms
+gemm time: 1561.07
+
+kernel execution time:  32089.4 ms
+taco reference time: 32090.3
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 128, vals: 21887744
+D1_dimension: 170998, D2_dimension: 128, vals: 21887744
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  1034.29 ms
+fused time: 1037.96
+
+kernel execution time:  85.577 ms
+hadamard time: 86.1357
+
+kernel execution time:  1881.63 ms
+gemm time: 1882.13
+
+kernel execution time:  3962.92 ms
+taco reference time: 3963.97
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 206500, D2_dimension: 128, vals: 26432000
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  1241.65 ms
+fused time: 1244.6
+
+kernel execution time:  87.8479 ms
+hadamard time: 88.3878
+
+kernel execution time:  2286.72 ms
+gemm time: 2287.22
+
+kernel execution time:  5303.69 ms
+taco reference time: 5304.69
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 128, vals: 128000640
+D1_dimension: 1000005, D2_dimension: 128, vals: 128000640
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  5642.42 ms
+fused time: 5643.31
+
+kernel execution time:  264.874 ms
+hadamard time: 265.396
+
+kernel execution time:  10966.5 ms
+gemm time: 10967.4
+
+kernel execution time:  12863.7 ms
+taco reference time: 12864.8
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 128, vals: 711465728
+D1_dimension: 5558326, D2_dimension: 128, vals: 711465728
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  35075.5 ms
+fused time: 35079.3
+
+kernel execution time:  3869.9 ms
+hadamard time: 3870.98
+
+kernel execution time:  61504.6 ms
+gemm time: 61505.4
+
+kernel execution time:  245613 ms
+taco reference time: 245614
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  18.3809 ms
+fused time: 19.1229
+
+kernel execution time:  0.635828 ms
+hadamard time: 0.983143
+
+kernel execution time:  30.5122 ms
+gemm time: 30.7819
+
+kernel execution time:  23.6746 ms
+taco reference time: 24.0784
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  3580.2 ms
+fused time: 3581
+
+kernel execution time:  567.762 ms
+hadamard time: 568.301
+
+kernel execution time:  6079.96 ms
+gemm time: 6080.46
+
+kernel execution time:  8129.78 ms
+taco reference time: 8130.38
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  18.4625 ms
+fused time: 19.1824
+
+kernel execution time:  0.520446 ms
+hadamard time: 0.824011
+
+kernel execution time:  30.2097 ms
+gemm time: 30.46
+
+kernel execution time:  23.4681 ms
+taco reference time: 23.826
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  3528.39 ms
+fused time: 3529.23
+
+kernel execution time:  558.625 ms
+hadamard time: 559.16
+
+kernel execution time:  6157.3 ms
+gemm time: 6158.14
+
+kernel execution time:  8131.73 ms
+taco reference time: 8132.69
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  2.27347 ms
+fused time: 2.7115
+
+kernel execution time:  0.180952 ms
+hadamard time: 0.76318
+
+kernel execution time:  2.72672 ms
+gemm time: 3.22211
+
+kernel execution time:  5.227 ms
+taco reference time: 5.75632
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  164.815 ms
+fused time: 165.539
+
+kernel execution time:  96.629 ms
+hadamard time: 97.303
+
+kernel execution time:  202.068 ms
+gemm time: 202.628
+
+kernel execution time:  273.96 ms
+taco reference time: 274.643
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  2.37004 ms
+fused time: 3.11591
+
+kernel execution time:  0.176612 ms
+hadamard time: 0.833621
+
+kernel execution time:  2.08823 ms
+gemm time: 2.59022
+
+kernel execution time:  3.36531 ms
+taco reference time: 4.11087
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 128, E2_dimension: 128, vals: 16384
+
+
+kernel execution time:  19.3307 ms
+fused time: 20.0662
+
+kernel execution time:  0.496176 ms
+hadamard time: 0.931803
+
+kernel execution time:  30.1194 ms
+gemm time: 30.3654
+
+kernel execution time:  23.3946 ms
+taco reference time: 23.7411
diff --git a/test/stats/mttkrp-spmm.txt b/test/stats/mttkrp-spmm.txt
new file mode 100644
index 000000000..fd6226179
--- /dev/null
+++ b/test/stats/mttkrp-spmm.txt
@@ -0,0 +1,1090 @@
+
+ mttkrp-spmm execution 
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 25, vals: 625
+D1_dimension: 25, D2_dimension: 25, vals: 625
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  0.03045 ms
+fused time: 0.870912
+
+kernel execution time:  0.168452 ms
+reference asymptotic blowup time: 0.983003
+
+kernel execution time:  0.015 ms
+mttkrp time: 0.493997
+
+kernel execution time:  0.0267 ms
+spmm time: 0.74405
+
+mttkrp-spmm execution
+
+0.015 0.0267 0.03045 0.168452
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 532924, B2_dimension: 17262471, B3_dimension: 532924, vals: 140126181
+C1_dimension: 17262471, C2_dimension: 25, vals: 431561775
+D1_dimension: 2480308, D2_dimension: 25, vals: 62007700
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  8190.76 ms
+fused time: 8191.78
+
+kernel execution time:  112801 ms
+reference asymptotic blowup time: 112802
+
+kernel execution time:  11198.5 ms
+mttkrp time: 11199.5
+
+kernel execution time:  238.88 ms
+spmm time: 239.385
+
+0.015 0.0267 0.03045 0.168452
+11198.5 238.88 8190.76 112801
+
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 28153045, C2_dimension: 25, vals: 703826125
+D1_dimension: 1607191, D2_dimension: 25, vals: 40179775
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  3951.18 ms
+fused time: 3952.21
+
+kernel execution time:  76964 ms
+reference asymptotic blowup time: 76965.1
+
+kernel execution time:  6212.97 ms
+mttkrp time: 6213.89
+
+kernel execution time:  142.233 ms
+spmm time: 142.726
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 9184, C2_dimension: 25, vals: 229600
+D1_dimension: 28818, D2_dimension: 25, vals: 720450
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  997.696 ms
+fused time: 998.725
+
+kernel execution time:  55544.7 ms
+reference asymptotic blowup time: 55545.9
+
+kernel execution time:  1944.26 ms
+mttkrp time: 1944.75
+
+kernel execution time:  5.40774 ms
+spmm time: 5.8765
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 2143368, C2_dimension: 25, vals: 53584200
+D1_dimension: 25495389, D2_dimension: 25, vals: 637384725
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  16275.3 ms
+fused time: 16276.4
+
+kernel execution time:  325523 ms
+reference asymptotic blowup time: 325525
+
+kernel execution time:  29202.5 ms
+mttkrp time: 29203.5
+
+kernel execution time:  1240.14 ms
+spmm time: 1240.66
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 11374, C2_dimension: 25, vals: 284350
+D1_dimension: 2, D2_dimension: 25, vals: 50
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  400.942 ms
+fused time: 401.47
+
+kernel execution time:  21565.2 ms
+reference asymptotic blowup time: 21566.3
+
+kernel execution time:  1292.53 ms
+mttkrp time: 1293.05
+
+kernel execution time:  72.2856 ms
+spmm time: 72.8001
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 22476, C2_dimension: 25, vals: 561900
+D1_dimension: 23776223, D2_dimension: 25, vals: 594405575
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  1397.54 ms
+fused time: 1398.54
+
+kernel execution time:  39690 ms
+reference asymptotic blowup time: 39691
+
+kernel execution time:  4004.71 ms
+mttkrp time: 4005.68
+
+kernel execution time:  7.97584 ms
+spmm time: 8.44535
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550
+C1_dimension: 23344784, C2_dimension: 25, vals: 583619600
+D1_dimension: 166, D2_dimension: 25, vals: 4150
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  15804.8 ms
+fused time: 15805.9
+
+kernel execution time:  79175 ms
+reference asymptotic blowup time: 79176.1
+
+kernel execution time:  10624.7 ms
+mttkrp time: 10625.6
+
+kernel execution time:  10007.2 ms
+spmm time: 10008.2
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 38954435, B2_dimension: 38955429, B3_dimension: 38954435, vals: 139920770
+C1_dimension: 38955429, C2_dimension: 25, vals: 973885725
+D1_dimension: 532, D2_dimension: 25, vals: 13300
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  23869.4 ms
+fused time: 23870.5
+
+kernel execution time:  113144 ms
+reference asymptotic blowup time: 113145
+
+kernel execution time:  15284.7 ms
+mttkrp time: 15285.7
+
+kernel execution time:  15154.3 ms
+spmm time: 15155.6
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165
+C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050
+D1_dimension: 2480308, D2_dimension: 25, vals: 62007700
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 25, vals: 625
+D1_dimension: 25, D2_dimension: 25, vals: 625
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  0.043711 ms
+fused time: 0.864271
+
+kernel execution time:  0.027391 ms
+mttkrp time: 0.889931
+
+kernel execution time:  0.02264 ms
+spmm time: 1.09649
+
+kernel execution time:  0.04233 ms
+reference asymptotic blowup time: 1.01915
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 9184, C2_dimension: 25, vals: 229600
+D1_dimension: 28818, D2_dimension: 25, vals: 720450
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  813.743 ms
+fused time: 814.267
+
+kernel execution time:  458.835 ms
+mttkrp time: 459.4
+
+kernel execution time:  3.56961 ms
+spmm time: 4.08913
+
+kernel execution time:  13803.8 ms
+reference asymptotic blowup time: 13804.8
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 9184, C2_dimension: 25, vals: 229600
+D1_dimension: 28818, D2_dimension: 25, vals: 720450
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  224.386 ms
+fused time: 224.986
+
+kernel execution time:  101.692 ms
+mttkrp time: 102.264
+
+kernel execution time:  5.95563 ms
+spmm time: 6.44162
+
+kernel execution time:  2647.79 ms
+reference asymptotic blowup time: 2648.57
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 22476, C2_dimension: 25, vals: 561900
+D1_dimension: 23776223, D2_dimension: 25, vals: 594405575
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  208.602 ms
+fused time: 209.122
+
+kernel execution time:  631.37 ms
+mttkrp time: 631.981
+
+kernel execution time:  7.20919 ms
+spmm time: 7.81651
+
+kernel execution time:  6749.05 ms
+reference asymptotic blowup time: 6750.17
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 11374, C2_dimension: 25, vals: 284350
+D1_dimension: 2, D2_dimension: 25, vals: 50
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  95.6907 ms
+fused time: 96.2212
+
+kernel execution time:  59.1475 ms
+mttkrp time: 59.7153
+
+kernel execution time:  63.6734 ms
+spmm time: 64.1704
+
+kernel execution time:  884.275 ms
+reference asymptotic blowup time: 884.934
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 9184, C2_dimension: 25, vals: 229600
+D1_dimension: 28818, D2_dimension: 25, vals: 720450
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  225.843 ms
+fused time: 226.345
+
+kernel execution time:  100.14 ms
+mttkrp time: 100.738
+
+kernel execution time:  6.32395 ms
+spmm time: 6.85452
+
+kernel execution time:  2678.56 ms
+reference asymptotic blowup time: 2679.35
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 28153045, C2_dimension: 25, vals: 703826125
+D1_dimension: 1607191, D2_dimension: 25, vals: 40179775
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  503.61 ms
+fused time: 504.129
+
+kernel execution time:  314.899 ms
+mttkrp time: 315.501
+
+kernel execution time:  125.456 ms
+spmm time: 125.953
+
+kernel execution time:  3415.65 ms
+reference asymptotic blowup time: 3416.62
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165
+C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050
+D1_dimension: 2480308, D2_dimension: 25, vals: 62007700
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 2143368, C2_dimension: 25, vals: 53584200
+D1_dimension: 25495389, D2_dimension: 25, vals: 637384725
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  1501.57 ms
+fused time: 1502.59
+
+kernel execution time:  1748.65 ms
+mttkrp time: 1749.21
+
+kernel execution time:  1135.01 ms
+spmm time: 1135.51
+
+kernel execution time:  16178.4 ms
+reference asymptotic blowup time: 16179.5
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 2143368, C2_dimension: 25, vals: 53584200
+D1_dimension: 25495389, D2_dimension: 25, vals: 637384725
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  16005.7 ms
+fused time: 16006.6
+
+kernel execution time:  29157.8 ms
+mttkrp time: 29158.8
+
+kernel execution time:  1247.23 ms
+spmm time: 1247.75
+
+kernel execution time:  329124 ms
+reference asymptotic blowup time: 329125
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165
+C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050
+D1_dimension: 2480308, D2_dimension: 25, vals: 62007700
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165
+C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050
+D1_dimension: 2480308, D2_dimension: 25, vals: 62007700
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165
+C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050
+D1_dimension: 2480308, D2_dimension: 25, vals: 62007700
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165
+C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050
+D1_dimension: 2480308, D2_dimension: 25, vals: 62007700
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 9184, C2_dimension: 25, vals: 229600
+D1_dimension: 28818, D2_dimension: 25, vals: 720450
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+kernel execution time:  2651.26 ms
+reference asymptotic blowup time: 2652.08
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165
+C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050
+D1_dimension: 2480308, D2_dimension: 25, vals: 62007700
+E1_dimension: 25, E2_dimension: 48, vals: 1200
+
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 25, D2_dimension: 32, vals: 800
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.286814 ms
+reference asymptotic blowup time: 1.00956
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 25, D2_dimension: 32, vals: 800
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.036661 ms
+mttkrp time: 0.77391
+
+kernel execution time:  0.02948 ms
+mttkrp ryan time: 0.932103
+
+kernel execution time:  0.264104 ms
+reference asymptotic blowup time: 1.32301
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 25, D2_dimension: 32, vals: 800
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.04003 ms
+mttkrp time: 0.779201
+
+kernel execution time:  0.022291 ms
+mttkrp ryan time: 0.821601
+
+kernel execution time:  0.268404 ms
+reference asymptotic blowup time: 1.28741
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 25, D2_dimension: 32, vals: 800
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.03006 ms
+default mttkrp time: 0.641369
+
+kernel execution time:  0.023191 ms
+ryan mttkrp workspace time: 0.982223
+
+kernel execution time:  0.084371 ms
+spmm time: 0.944412
+
+kernel execution time:  0.262723 ms
+reference asymptotic blowup time: 0.927732
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 25, D2_dimension: 32, vals: 800
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.046181 ms
+default mttkrp time: 0.459706
+
+kernel execution time:  0.076311 ms
+ryan mttkrp workspace time: 1.1076
+
+kernel execution time:  0.06528 ms
+GeMM time: 0.307835
+
+kernel execution time:  0.230713 ms
+reference asymptotic blowup time: 0.942012
+
+kernel execution time:  0.081741 ms
+fused mttkrp+gemm time: 0.885412
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 28153045, C2_dimension: 32, vals: 900897440
+D1_dimension: 1607191, D2_dimension: 32, vals: 51430112
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  233.898 ms
+default mttkrp time: 234.426
+
+kernel execution time:  293.46 ms
+ryan mttkrp workspace time: 294.21
+
+kernel execution time:  23.4947 ms
+GeMM time: 24.009
+
+kernel execution time:  2753.37 ms
+reference asymptotic blowup time: 2754.12
+
+kernel execution time:  287.939 ms
+fused mttkrp+gemm time: 288.576
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 9184, C2_dimension: 32, vals: 293888
+D1_dimension: 28818, D2_dimension: 32, vals: 922176
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  140.989 ms
+default mttkrp time: 141.517
+
+kernel execution time:  36.4285 ms
+ryan mttkrp workspace time: 37.0544
+
+kernel execution time:  1.06091 ms
+GeMM time: 1.6425
+
+kernel execution time:  3142.38 ms
+reference asymptotic blowup time: 3143.28
+
+kernel execution time:  43.1867 ms
+fused mttkrp+gemm time: 43.8393
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 2143368, C2_dimension: 32, vals: 68587776
+D1_dimension: 25495389, D2_dimension: 32, vals: 815852448
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  2635.67 ms
+default mttkrp time: 2636.7
+
+kernel execution time:  913.661 ms
+ryan mttkrp workspace time: 914.435
+
+kernel execution time:  166.615 ms
+GeMM time: 167.532
+
+kernel execution time:  39080.1 ms
+reference asymptotic blowup time: 39080.8
+
+kernel execution time:  1141.77 ms
+fused mttkrp+gemm time: 1142.88
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 11374, C2_dimension: 32, vals: 363968
+D1_dimension: 2, D2_dimension: 32, vals: 64
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  38.5561 ms
+default mttkrp time: 39.0876
+
+kernel execution time:  18.0733 ms
+ryan mttkrp workspace time: 18.6685
+
+kernel execution time:  9.91856 ms
+GeMM time: 10.4003
+
+kernel execution time:  663.996 ms
+reference asymptotic blowup time: 664.529
+
+kernel execution time:  15.476 ms
+fused mttkrp+gemm time: 16.1515
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 22476, C2_dimension: 32, vals: 719232
+D1_dimension: 23776223, D2_dimension: 32, vals: 760839136
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  893.657 ms
+default mttkrp time: 894.664
+
+kernel execution time:  228.227 ms
+ryan mttkrp workspace time: 228.852
+
+kernel execution time:  1.81839 ms
+GeMM time: 2.27454
+
+kernel execution time:  13301.8 ms
+reference asymptotic blowup time: 13302.7
+
+kernel execution time:  238.142 ms
+fused mttkrp+gemm time: 238.778
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126164
+C1_dimension: 55584242, C2_dimension: 32, vals: 1778695744
+D1_dimension: 2480308, D2_dimension: 32, vals: 79369856
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 28153045, C2_dimension: 32, vals: 900897440
+D1_dimension: 1607191, D2_dimension: 32, vals: 51430112
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  6303 ms
+default mttkrp time: 6303.86
+
+kernel execution time:  4378.98 ms
+ryan mttkrp workspace time: 4380.07
+
+kernel execution time:  449.512 ms
+GeMM time: 450.037
+
+kernel execution time:  116274 ms
+reference asymptotic blowup time: 116275
+
+kernel execution time:  4299.26 ms
+fused mttkrp+gemm time: 4300.33
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 9184, C2_dimension: 32, vals: 293888
+D1_dimension: 28818, D2_dimension: 32, vals: 922176
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  2606.24 ms
+default mttkrp time: 2607.1
+
+kernel execution time:  878.486 ms
+ryan mttkrp workspace time: 879.009
+
+kernel execution time:  17.5967 ms
+GeMM time: 18.0274
+
+kernel execution time:  93762.9 ms
+reference asymptotic blowup time: 93763.7
+
+kernel execution time:  1052.15 ms
+fused mttkrp+gemm time: 1052.76
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 2143368, C2_dimension: 32, vals: 68587776
+D1_dimension: 25495389, D2_dimension: 32, vals: 815852448
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  36869.4 ms
+default mttkrp time: 36870.3
+
+kernel execution time:  17566.6 ms
+ryan mttkrp workspace time: 17567.6
+
+kernel execution time:  4060.98 ms
+GeMM time: 4061.93
+
+kernel execution time:  720483 ms
+reference asymptotic blowup time: 720484
+
+kernel execution time:  17354.7 ms
+fused mttkrp+gemm time: 17355.9
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 11374, C2_dimension: 32, vals: 363968
+D1_dimension: 2, D2_dimension: 32, vals: 64
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  1680.94 ms
+default mttkrp time: 1681.8
+
+kernel execution time:  615.002 ms
+ryan mttkrp workspace time: 615.585
+
+kernel execution time:  231.923 ms
+GeMM time: 232.453
+
+kernel execution time:  28415.3 ms
+reference asymptotic blowup time: 28416.4
+
+kernel execution time:  453.141 ms
+fused mttkrp+gemm time: 453.827
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 22476, C2_dimension: 32, vals: 719232
+D1_dimension: 23776223, D2_dimension: 32, vals: 760839136
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  4430.77 ms
+default mttkrp time: 4431.71
+
+kernel execution time:  1465.2 ms
+ryan mttkrp workspace time: 1465.77
+
+kernel execution time:  32.1871 ms
+GeMM time: 32.6436
+
+kernel execution time:  71199.8 ms
+reference asymptotic blowup time: 71200.9
+
+kernel execution time:  1570.11 ms
+fused mttkrp+gemm time: 1570.76
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 22476, C2_dimension: 32, vals: 719232
+D1_dimension: 23776223, D2_dimension: 32, vals: 760839136
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  882.674 ms
+default mttkrp time: 883.69
+
+kernel execution time:  231.925 ms
+ryan mttkrp workspace time: 232.94
+
+kernel execution time:  1.87878 ms
+GeMM time: 2.38818
+
+kernel execution time:  13018.7 ms
+reference asymptotic blowup time: 13019.7
+
+kernel execution time:  227.495 ms
+fused mttkrp+gemm time: 228.182
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 22476, C2_dimension: 32, vals: 719232
+D1_dimension: 23776223, D2_dimension: 32, vals: 760839136
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  874.742 ms
+default mttkrp time: 875.218
+
+kernel execution time:  231.556 ms
+ryan mttkrp workspace time: 232.223
+
+kernel execution time:  1.7427 ms
+GeMM time: 2.19512
+
+kernel execution time:  13047.8 ms
+reference asymptotic blowup time: 13048.7
+
+kernel execution time:  232.174 ms
+fused mttkrp+gemm time: 232.85
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 11374, C2_dimension: 32, vals: 363968
+D1_dimension: 2, D2_dimension: 32, vals: 64
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  40.9013 ms
+default mttkrp time: 41.4712
+
+kernel execution time:  18.9468 ms
+ryan mttkrp workspace time: 19.5875
+
+kernel execution time:  10.8838 ms
+GeMM time: 11.3865
+
+kernel execution time:  700.825 ms
+reference asymptotic blowup time: 701.445
+
+kernel execution time:  15.8743 ms
+fused mttkrp+gemm time: 16.5313
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 25, D2_dimension: 32, vals: 800
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.02019 ms
+default mttkrp time: 3.8105
+
+kernel execution time:  0.01628 ms
+ryan mttkrp workspace time: 0.602618
+
+kernel execution time:  0.075521 ms
+GeMM time: 0.491146
+
+kernel execution time:  0.254864 ms
+reference asymptotic blowup time: 0.897372
+
+kernel execution time:  0.038201 ms
+fused mttkrp+gemm time: 4.54224
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 25, D2_dimension: 32, vals: 800
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.02015 ms
+default mttkrp time: 3.93207
+
+kernel execution time:  0.015561 ms
+ryan mttkrp workspace time: 0.559818
+
+kernel execution time:  0.074741 ms
+GeMM time: 0.880342
+
+kernel execution time:  0.250803 ms
+reference asymptotic blowup time: 0.892052
+
+kernel execution time:  0.038071 ms
+fused mttkrp+gemm time: 3.0867
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 25, D2_dimension: 32, vals: 800
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.02689 ms
+default mttkrp time: 0.73934
+
+kernel execution time:  0.02205 ms
+ryan mttkrp workspace time: 0.863852
+
+kernel execution time:  0.081811 ms
+GeMM time: 0.527658
+
+kernel execution time:  0.259993 ms
+reference asymptotic blowup time: 0.923212
+
+kernel execution time:  0.042261 ms
+fused mttkrp+gemm time: 0.703349
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 9184, C2_dimension: 32, vals: 293888
+D1_dimension: 28818, D2_dimension: 32, vals: 922176
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  141.637 ms
+default mttkrp time: 142.17
+
+kernel execution time:  41.1194 ms
+ryan mttkrp workspace time: 41.7838
+
+kernel execution time:  1.06942 ms
+GeMM time: 1.50588
+
+kernel execution time:  3218.72 ms
+reference asymptotic blowup time: 3219.51
+
+kernel execution time:  145.235 ms
+fused mttkrp+gemm time: 145.866
+
+mttkrp-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 9184, C2_dimension: 32, vals: 293888
+D1_dimension: 28818, D2_dimension: 32, vals: 922176
+E1_dimension: 32, E2_dimension: 64, vals: 2048
+
+
+kernel execution time:  148.092 ms
+default mttkrp time: 148.691
+
+kernel execution time:  41.3947 ms
+ryan mttkrp workspace time: 42.046
+
+kernel execution time:  1.03445 ms
+GeMM time: 1.45556
+
+kernel execution time:  3211.6 ms
+reference asymptotic blowup time: 3212.43
+
+kernel execution time:  45.5971 ms
+fused mttkrp+gemm time: 46.2057
diff --git a/test/stats/sddmm-spmm-gemm.txt b/test/stats/sddmm-spmm-gemm.txt
new file mode 100644
index 000000000..02665478f
--- /dev/null
+++ b/test/stats/sddmm-spmm-gemm.txt
@@ -0,0 +1,1471 @@
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+G1_dimension: 10974, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  2.51139 ms
+fused time: 3.49403
+
+kernel execution time:  3.80634 ms
+sddmm time: 4.13132
+
+kernel execution time:  0.75853 ms
+sddmm ryan time: 1.07946
+
+kernel execution time:  0.968473 ms
+spmm ryan time: 1.2051
+
+kernel execution time:  1.39879 ms
+gemm time: 1.6602
+
+kernel execution time:  1070.79 ms
+taco reference time: 1071.2
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+G1_dimension: 36417, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  8.43361 ms
+fused time: 9.03941
+
+kernel execution time:  13.3195 ms
+sddmm time: 13.9487
+
+kernel execution time:  4.73639 ms
+sddmm ryan time: 5.32202
+
+kernel execution time:  4.735 ms
+spmm ryan time: 5.22103
+
+kernel execution time:  3.66798 ms
+gemm time: 4.15167
+
+kernel execution time:  10658.4 ms
+taco reference time: 10659.3
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+G1_dimension: 46835, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  7.54896 ms
+fused time: 8.15687
+
+kernel execution time:  15.1277 ms
+sddmm time: 15.796
+
+kernel execution time:  3.51464 ms
+sddmm ryan time: 4.10653
+
+kernel execution time:  4.21975 ms
+spmm ryan time: 4.6923
+
+kernel execution time:  4.74088 ms
+gemm time: 5.2156
+
+kernel execution time:  5949.54 ms
+taco reference time: 5950.52
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+G1_dimension: 62451, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  11.7188 ms
+fused time: 12.3427
+
+kernel execution time:  18.5962 ms
+sddmm time: 19.2831
+
+kernel execution time:  6.5821 ms
+sddmm ryan time: 7.20737
+
+kernel execution time:  6.6327 ms
+spmm ryan time: 7.20703
+
+kernel execution time:  6.06003 ms
+gemm time: 6.61794
+
+kernel execution time:  9765.93 ms
+taco reference time: 9766.85
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 64, vals: 5333376
+D1_dimension: 83334, D2_dimension: 64, vals: 5333376
+E1_dimension: 83334, E2_dimension: 64, vals: 5333376
+G1_dimension: 83334, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  16.3022 ms
+fused time: 16.877
+
+kernel execution time:  26.4065 ms
+sddmm time: 26.9999
+
+kernel execution time:  9.6103 ms
+sddmm ryan time: 10.1859
+
+kernel execution time:  9.5796 ms
+spmm ryan time: 10.139
+
+kernel execution time:  7.75909 ms
+gemm time: 8.27337
+
+kernel execution time:  14674.3 ms
+taco reference time: 14675.2
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+G1_dimension: 121192, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  28.3919 ms
+fused time: 29.022
+
+kernel execution time:  28.7666 ms
+sddmm time: 29.4282
+
+kernel execution time:  10.9353 ms
+sddmm ryan time: 11.5639
+
+kernel execution time:  12.2792 ms
+spmm ryan time: 12.86
+
+kernel execution time:  12.0463 ms
+gemm time: 12.6219
+
+kernel execution time:  6496.16 ms
+taco reference time: 6497.16
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 64, vals: 9015936
+D1_dimension: 140874, D2_dimension: 64, vals: 9015936
+E1_dimension: 140874, E2_dimension: 64, vals: 9015936
+G1_dimension: 140874, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  23.8673 ms
+fused time: 24.4851
+
+kernel execution time:  38.4245 ms
+sddmm time: 39.0808
+
+kernel execution time:  13.3169 ms
+sddmm ryan time: 13.9402
+
+kernel execution time:  13.8214 ms
+spmm ryan time: 14.3969
+
+kernel execution time:  13.3955 ms
+gemm time: 14.0084
+
+kernel execution time:  19010.9 ms
+taco reference time: 19012
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 64, vals: 10943872
+D1_dimension: 170998, D2_dimension: 64, vals: 10943872
+E1_dimension: 170998, E2_dimension: 64, vals: 10943872
+G1_dimension: 170998, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  19.1593 ms
+fused time: 19.7496
+
+kernel execution time:  31.0395 ms
+sddmm time: 31.6882
+
+kernel execution time:  7.35776 ms
+sddmm ryan time: 7.96434
+
+kernel execution time:  9.33589 ms
+spmm ryan time: 9.89731
+
+kernel execution time:  16.4733 ms
+gemm time: 17.0352
+
+kernel execution time:  2397 ms
+taco reference time: 2397.64
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+G1_dimension: 1000005, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  66.7468 ms
+fused time: 67.289
+
+kernel execution time:  69.5837 ms
+sddmm time: 70.1602
+
+kernel execution time:  23.2899 ms
+sddmm ryan time: 23.8277
+
+kernel execution time:  41.9566 ms
+spmm ryan time: 42.5095
+
+kernel execution time:  93.8383 ms
+gemm time: 94.3738
+
+kernel execution time:  7587.7 ms
+taco reference time: 7588.87
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+G1_dimension: 5558326, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  688.492 ms
+fused time: 689.478
+
+kernel execution time:  979.86 ms
+sddmm time: 980.45
+
+kernel execution time:  318.248 ms
+sddmm ryan time: 318.831
+
+kernel execution time:  449.669 ms
+spmm ryan time: 450.215
+
+kernel execution time:  503.695 ms
+gemm time: 504.291
+
+kernel execution time:  326798 ms
+taco reference time: 326799
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+G1_dimension: 5558326, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  9624.7 ms
+fused time: 9625.73
+
+kernel execution time:  1635.76 ms
+sddmm time: 1636.3
+
+kernel execution time:  1636.41 ms
+sddmm ryan time: 1636.96
+
+kernel execution time:  2930.01 ms
+spmm ryan time: 2930.5
+
+kernel execution time:  15204.2 ms
+gemm time: 15205.2
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+G1_dimension: 10974, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  31.0958 ms
+fused time: 31.6403
+
+kernel execution time:  9.52362 ms
+sddmm time: 10.0411
+
+kernel execution time:  9.50283 ms
+sddmm ryan time: 9.98181
+
+kernel execution time:  9.9883 ms
+spmm ryan time: 10.3927
+
+kernel execution time:  30.6724 ms
+gemm time: 31.0956
+
+kernel execution time:  50903.4 ms
+taco reference time: 50904.4
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+G1_dimension: 36417, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  221.251 ms
+fused time: 223.31
+
+kernel execution time:  90.6291 ms
+sddmm time: 91.9017
+
+kernel execution time:  92.6299 ms
+sddmm ryan time: 93.1693
+
+kernel execution time:  70.0109 ms
+spmm ryan time: 70.4884
+
+kernel execution time:  103.984 ms
+gemm time: 105.217
+
+kernel execution time:  441848 ms
+taco reference time: 441849
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+G1_dimension: 46835, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  156.706 ms
+fused time: 158.878
+
+kernel execution time:  53.3541 ms
+sddmm time: 53.8804
+
+kernel execution time:  53.6128 ms
+sddmm ryan time: 54.7942
+
+kernel execution time:  51.5253 ms
+spmm ryan time: 52.5961
+
+kernel execution time:  130.147 ms
+gemm time: 131.306
+
+kernel execution time:  243737 ms
+taco reference time: 243739
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+G1_dimension: 62451, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  238.619 ms
+fused time: 240.152
+
+kernel execution time:  84.8828 ms
+sddmm time: 85.4286
+
+kernel execution time:  80.7058 ms
+sddmm ryan time: 81.2588
+
+kernel execution time:  75.2549 ms
+spmm ryan time: 75.7338
+
+kernel execution time:  174.145 ms
+gemm time: 174.654
+
+kernel execution time:  412699 ms
+taco reference time: 412701
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 64, vals: 5333376
+D1_dimension: 83334, D2_dimension: 64, vals: 5333376
+E1_dimension: 83334, E2_dimension: 64, vals: 5333376
+G1_dimension: 83334, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  350.004 ms
+fused time: 351.319
+
+kernel execution time:  123.574 ms
+sddmm time: 124.101
+
+kernel execution time:  126.113 ms
+sddmm ryan time: 127.971
+
+kernel execution time:  113.146 ms
+spmm ryan time: 113.615
+
+kernel execution time:  234.287 ms
+gemm time: 235.546
+
+kernel execution time:  619783 ms
+taco reference time: 619784
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+G1_dimension: 121192, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  335.548 ms
+fused time: 337.292
+
+kernel execution time:  90.8795 ms
+sddmm time: 91.3981
+
+kernel execution time:  87.7678 ms
+sddmm ryan time: 88.2879
+
+kernel execution time:  111.725 ms
+spmm ryan time: 113.063
+
+kernel execution time:  338.451 ms
+gemm time: 340.2
+
+kernel execution time:  268303 ms
+taco reference time: 268304
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 64, vals: 9015936
+D1_dimension: 140874, D2_dimension: 64, vals: 9015936
+E1_dimension: 140874, E2_dimension: 64, vals: 9015936
+G1_dimension: 140874, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  488.065 ms
+fused time: 489.312
+
+kernel execution time:  161.434 ms
+sddmm time: 163.199
+
+kernel execution time:  164.295 ms
+sddmm ryan time: 165.567
+
+kernel execution time:  154.131 ms
+spmm ryan time: 154.61
+
+kernel execution time:  391.972 ms
+gemm time: 393.242
+
+kernel execution time:  798245 ms
+taco reference time: 798247
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 64, vals: 10943872
+D1_dimension: 170998, D2_dimension: 64, vals: 10943872
+E1_dimension: 170998, E2_dimension: 64, vals: 10943872
+G1_dimension: 170998, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  279.308 ms
+fused time: 280.422
+
+kernel execution time:  41.2598 ms
+sddmm time: 41.7727
+
+kernel execution time:  40.3132 ms
+sddmm ryan time: 40.882
+
+kernel execution time:  72.4795 ms
+spmm ryan time: 73.6321
+
+kernel execution time:  473.298 ms
+gemm time: 474.582
+
+kernel execution time:  98095.7 ms
+taco reference time: 98098.4
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 206500, E2_dimension: 64, vals: 13216000
+G1_dimension: 206500, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  321.827 ms
+fused time: 322.725
+
+kernel execution time:  43.7794 ms
+sddmm time: 44.8964
+
+kernel execution time:  42.531 ms
+sddmm ryan time: 43.7502
+
+kernel execution time:  83.5305 ms
+spmm ryan time: 84.0178
+
+kernel execution time:  567.368 ms
+gemm time: 567.876
+
+kernel execution time:  130204 ms
+taco reference time: 130207
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+G1_dimension: 1000005, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  1355.72 ms
+fused time: 1357.14
+
+kernel execution time:  98.94 ms
+sddmm time: 101.488
+
+kernel execution time:  97.8972 ms
+sddmm ryan time: 98.4423
+
+kernel execution time:  218.188 ms
+spmm ryan time: 219.39
+
+kernel execution time:  2744.38 ms
+gemm time: 2744.89
+
+kernel execution time:  320035 ms
+taco reference time: 320037
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+G1_dimension: 5558326, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  9682.48 ms
+fused time: 9684.45
+
+kernel execution time:  1640.01 ms
+sddmm time: 1641.3
+
+kernel execution time:  1626.66 ms
+sddmm ryan time: 1628.12
+
+kernel execution time:  2908.47 ms
+spmm ryan time: 2908.94
+
+kernel execution time:  15252.4 ms
+gemm time: 15253.4
+
+kernel execution time:  6.11703e+06 ms
+taco reference time: 6.11703e+06
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+G1_dimension: 46835, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  7.90719 ms
+fused time: 12.4475
+
+kernel execution time:  15.0235 ms
+sddmm time: 18.4078
+
+kernel execution time:  3.60187 ms
+sddmm ryan time: 7.64096
+
+kernel execution time:  4.26585 ms
+spmm ryan time: 7.23736
+
+kernel execution time:  5.51232 ms
+gemm time: 8.94274
+
+kernel execution time:  5900.92 ms
+taco reference time: 5901.77
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+G1_dimension: 62451, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  11.9944 ms
+fused time: 15.5065
+
+kernel execution time:  17.5788 ms
+sddmm time: 18.2088
+
+kernel execution time:  6.90362 ms
+sddmm ryan time: 9.18146
+
+kernel execution time:  6.52502 ms
+spmm ryan time: 7.08577
+
+kernel execution time:  5.70869 ms
+gemm time: 6.23327
+
+kernel execution time:  9752.35 ms
+taco reference time: 9753.37
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 64, vals: 5333376
+D1_dimension: 83334, D2_dimension: 64, vals: 5333376
+E1_dimension: 83334, E2_dimension: 64, vals: 5333376
+G1_dimension: 83334, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  16.1703 ms
+fused time: 19.9224
+
+kernel execution time:  26.3346 ms
+sddmm time: 30.1538
+
+kernel execution time:  9.47197 ms
+sddmm ryan time: 12.7137
+
+kernel execution time:  9.14926 ms
+spmm ryan time: 9.78178
+
+kernel execution time:  8.06171 ms
+gemm time: 8.592
+
+kernel execution time:  14612.6 ms
+taco reference time: 14617.7
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+G1_dimension: 121192, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  28.2581 ms
+fused time: 32.7167
+
+kernel execution time:  30.162 ms
+sddmm time: 33.8587
+
+kernel execution time:  11.0142 ms
+sddmm ryan time: 15.2742
+
+kernel execution time:  12.1744 ms
+spmm ryan time: 15.0065
+
+kernel execution time:  11.4579 ms
+gemm time: 14.5527
+
+kernel execution time:  6379.22 ms
+taco reference time: 6380.3
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 64, vals: 9015936
+D1_dimension: 140874, D2_dimension: 64, vals: 9015936
+E1_dimension: 140874, E2_dimension: 64, vals: 9015936
+G1_dimension: 140874, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  24.3937 ms
+fused time: 28.6422
+
+kernel execution time:  37.2457 ms
+sddmm time: 41.311
+
+kernel execution time:  13.8503 ms
+sddmm ryan time: 17.9583
+
+kernel execution time:  14.2713 ms
+spmm ryan time: 17.1402
+
+kernel execution time:  13.6024 ms
+gemm time: 16.6078
+
+kernel execution time:  18993.5 ms
+taco reference time: 18994.5
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 64, vals: 10943872
+D1_dimension: 170998, D2_dimension: 64, vals: 10943872
+E1_dimension: 170998, E2_dimension: 64, vals: 10943872
+G1_dimension: 170998, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  18.4645 ms
+fused time: 22.0711
+
+kernel execution time:  31.6844 ms
+sddmm time: 34.9774
+
+kernel execution time:  7.19931 ms
+sddmm ryan time: 11.584
+
+kernel execution time:  9.40139 ms
+spmm ryan time: 10.002
+
+kernel execution time:  16.3933 ms
+gemm time: 19.0699
+
+kernel execution time:  2325.51 ms
+taco reference time: 2326.19
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 206500, E2_dimension: 64, vals: 13216000
+G1_dimension: 206500, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  25.9398 ms
+fused time: 30.7713
+
+kernel execution time:  43.1619 ms
+sddmm time: 47.1566
+
+kernel execution time:  9.47076 ms
+sddmm ryan time: 12.9736
+
+kernel execution time:  12.1315 ms
+spmm ryan time: 12.7125
+
+kernel execution time:  19.8795 ms
+gemm time: 23.9233
+
+kernel execution time:  3085.34 ms
+taco reference time: 3087.4
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+G1_dimension: 1000005, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  68.9391 ms
+fused time: 73.2143
+
+kernel execution time:  68.0597 ms
+sddmm time: 71.8136
+
+kernel execution time:  23.658 ms
+sddmm ryan time: 27.2015
+
+kernel execution time:  42.2166 ms
+spmm ryan time: 45.3816
+
+kernel execution time:  91.7085 ms
+gemm time: 94.965
+
+kernel execution time:  7504.53 ms
+taco reference time: 7510.21
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+G1_dimension: 5558326, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  685.25 ms
+fused time: 691.004
+
+kernel execution time:  978.107 ms
+sddmm time: 982.105
+
+kernel execution time:  314.889 ms
+sddmm ryan time: 319.437
+
+kernel execution time:  451.321 ms
+spmm ryan time: 454.339
+
+kernel execution time:  511.771 ms
+gemm time: 516.049
+
+kernel execution time:  324954 ms
+taco reference time: 324960
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+G1_dimension: 10974, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  2.03017 ms
+fused time: 6.89988
+
+kernel execution time:  4.23176 ms
+sddmm time: 4.56628
+
+kernel execution time:  1.07066 ms
+sddmm ryan time: 1.60331
+
+kernel execution time:  1.04047 ms
+spmm ryan time: 1.84411
+
+kernel execution time:  1.58419 ms
+gemm time: 3.49011
+
+kernel execution time:  1168.5 ms
+taco reference time: 1172.82
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+G1_dimension: 36417, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  8.02954 ms
+fused time: 12.4005
+
+kernel execution time:  12.7753 ms
+sddmm time: 15.6047
+
+kernel execution time:  4.73627 ms
+sddmm ryan time: 8.24994
+
+kernel execution time:  4.90489 ms
+spmm ryan time: 5.40766
+
+kernel execution time:  2.99487 ms
+gemm time: 3.53289
+
+kernel execution time:  10658.1 ms
+taco reference time: 10661.2
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+G1_dimension: 46835, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  7.15818 ms
+fused time: 11.6143
+
+kernel execution time:  15.0391 ms
+sddmm time: 18.5456
+
+kernel execution time:  3.33442 ms
+sddmm ryan time: 6.94621
+
+kernel execution time:  4.13895 ms
+spmm ryan time: 7.49526
+
+kernel execution time:  3.79939 ms
+gemm time: 4.19085
+
+kernel execution time:  5801.87 ms
+taco reference time: 5803.1
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+G1_dimension: 62451, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  12.0771 ms
+fused time: 16.6939
+
+kernel execution time:  17.5697 ms
+sddmm time: 18.7919
+
+kernel execution time:  6.94731 ms
+sddmm ryan time: 11.0254
+
+kernel execution time:  7.03752 ms
+spmm ryan time: 8.55729
+
+kernel execution time:  5.18056 ms
+gemm time: 8.22984
+
+kernel execution time:  9735.41 ms
+taco reference time: 9737.5
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 64, vals: 5333376
+D1_dimension: 83334, D2_dimension: 64, vals: 5333376
+E1_dimension: 83334, E2_dimension: 64, vals: 5333376
+G1_dimension: 83334, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  16.2173 ms
+fused time: 20.4628
+
+kernel execution time:  26.5883 ms
+sddmm time: 30.2732
+
+kernel execution time:  9.67928 ms
+sddmm ryan time: 13.4002
+
+kernel execution time:  9.46597 ms
+spmm ryan time: 12.3215
+
+kernel execution time:  6.14851 ms
+gemm time: 6.79689
+
+kernel execution time:  14647.4 ms
+taco reference time: 14648.9
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+G1_dimension: 121192, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  28.0895 ms
+fused time: 33.0632
+
+kernel execution time:  29.4447 ms
+sddmm time: 33.2669
+
+kernel execution time:  10.992 ms
+sddmm ryan time: 15.1462
+
+kernel execution time:  12.2197 ms
+spmm ryan time: 14.8823
+
+kernel execution time:  9.1576 ms
+gemm time: 12.476
+
+kernel execution time:  6388.6 ms
+taco reference time: 6389.71
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 64, vals: 9015936
+D1_dimension: 140874, D2_dimension: 64, vals: 9015936
+E1_dimension: 140874, E2_dimension: 64, vals: 9015936
+G1_dimension: 140874, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  24.4023 ms
+fused time: 28.7813
+
+kernel execution time:  37.3163 ms
+sddmm time: 41.2616
+
+kernel execution time:  13.8084 ms
+sddmm ryan time: 17.1208
+
+kernel execution time:  14.1626 ms
+spmm ryan time: 17.3487
+
+kernel execution time:  10.2461 ms
+gemm time: 10.8026
+
+kernel execution time:  19008 ms
+taco reference time: 19013
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 64, vals: 10943872
+D1_dimension: 170998, D2_dimension: 64, vals: 10943872
+E1_dimension: 170998, E2_dimension: 64, vals: 10943872
+G1_dimension: 170998, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  18.5328 ms
+fused time: 21.8578
+
+kernel execution time:  29.8727 ms
+sddmm time: 32.6967
+
+kernel execution time:  7.1244 ms
+sddmm ryan time: 10.2857
+
+kernel execution time:  8.9243 ms
+spmm ryan time: 9.54503
+
+kernel execution time:  12.6159 ms
+gemm time: 13.2038
+
+kernel execution time:  2326 ms
+taco reference time: 2326.66
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 206500, E2_dimension: 64, vals: 13216000
+G1_dimension: 206500, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  25.7525 ms
+fused time: 27.0427
+
+kernel execution time:  40.701 ms
+sddmm time: 44.8629
+
+kernel execution time:  9.61808 ms
+sddmm ryan time: 13.4076
+
+kernel execution time:  12.4322 ms
+spmm ryan time: 15.2811
+
+kernel execution time:  15.1033 ms
+gemm time: 17.9102
+
+kernel execution time:  3091.33 ms
+taco reference time: 3092.53
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+G1_dimension: 1000005, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  68.4469 ms
+fused time: 72.7982
+
+kernel execution time:  52.1276 ms
+sddmm time: 56.0577
+
+kernel execution time:  23.4796 ms
+sddmm ryan time: 27.0851
+
+kernel execution time:  42.2008 ms
+spmm ryan time: 45.2618
+
+kernel execution time:  74.1167 ms
+gemm time: 78.5888
+
+kernel execution time:  7502.71 ms
+taco reference time: 7508.45
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+G1_dimension: 5558326, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  684.483 ms
+fused time: 689.124
+
+kernel execution time:  889.925 ms
+sddmm time: 894.03
+
+kernel execution time:  315.322 ms
+sddmm ryan time: 319.629
+
+kernel execution time:  449.91 ms
+spmm ryan time: 453.686
+
+kernel execution time:  417.449 ms
+gemm time: 421.26
+
+kernel execution time:  326305 ms
+taco reference time: 326311
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 64, vals: 173312
+D1_dimension: 2708, D2_dimension: 64, vals: 173312
+E1_dimension: 2708, E2_dimension: 64, vals: 173312
+G1_dimension: 2708, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  5.08607 ms
+fused time: 5.61989
+
+kernel execution time:  0.557608 ms
+sddmm time: 0.871642
+
+kernel execution time:  0.465526 ms
+sddmm ryan time: 0.7713
+
+kernel execution time:  0.498686 ms
+spmm ryan time: 0.739309
+
+kernel execution time:  0.7957 ms
+gemm time: 1.05919
+
+kernel execution time:  42.447 ms
+taco reference time: 42.885
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 64, vals: 35107264
+D1_dimension: 548551, D2_dimension: 64, vals: 35107264
+E1_dimension: 548551, E2_dimension: 64, vals: 35107264
+G1_dimension: 548551, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  89.9099 ms
+fused time: 90.5117
+
+kernel execution time:  29.9086 ms
+sddmm time: 30.4936
+
+kernel execution time:  29.1529 ms
+sddmm ryan time: 29.7063
+
+kernel execution time:  34.6318 ms
+spmm ryan time: 35.1535
+
+kernel execution time:  66.4663 ms
+gemm time: 67.0316
+
+kernel execution time:  6272.25 ms
+taco reference time: 6273.42
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 64, vals: 173312
+D1_dimension: 2708, D2_dimension: 64, vals: 173312
+E1_dimension: 2708, E2_dimension: 64, vals: 173312
+G1_dimension: 2708, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  3.72391 ms
+fused time: 4.19698
+
+kernel execution time:  0.585647 ms
+sddmm time: 0.893112
+
+kernel execution time:  0.483056 ms
+sddmm ryan time: 0.79108
+
+kernel execution time:  0.567518 ms
+spmm ryan time: 0.808711
+
+kernel execution time:  0.929183 ms
+gemm time: 1.32543
+
+kernel execution time:  35.7066 ms
+taco reference time: 36.3331
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 64, vals: 35107264
+D1_dimension: 548551, D2_dimension: 64, vals: 35107264
+E1_dimension: 548551, E2_dimension: 64, vals: 35107264
+G1_dimension: 548551, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  94.9377 ms
+fused time: 95.7687
+
+kernel execution time:  32.2051 ms
+sddmm time: 32.7881
+
+kernel execution time:  30.3982 ms
+sddmm ryan time: 30.95
+
+kernel execution time:  34.4172 ms
+spmm ryan time: 34.9049
+
+kernel execution time:  67.2709 ms
+gemm time: 67.8035
+
+kernel execution time:  6215.08 ms
+taco reference time: 6216.26
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 64, vals: 173312
+D1_dimension: 2708, D2_dimension: 64, vals: 173312
+E1_dimension: 2708, E2_dimension: 64, vals: 173312
+G1_dimension: 2708, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  6.99173 ms
+fused time: 7.86448
+
+kernel execution time:  0.78061 ms
+sddmm time: 1.28867
+
+kernel execution time:  0.554227 ms
+sddmm ryan time: 0.837111
+
+kernel execution time:  0.909912 ms
+spmm ryan time: 1.12908
+
+kernel execution time:  7.60724 ms
+gemm time: 7.85047
+
+kernel execution time:  652.888 ms
+taco reference time: 653.271
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 64, vals: 35107264
+D1_dimension: 548551, D2_dimension: 64, vals: 35107264
+E1_dimension: 548551, E2_dimension: 64, vals: 35107264
+G1_dimension: 548551, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  1236.33 ms
+fused time: 1236.87
+
+kernel execution time:  249.805 ms
+sddmm time: 250.356
+
+kernel execution time:  247.195 ms
+sddmm ryan time: 247.729
+
+kernel execution time:  285.764 ms
+spmm ryan time: 286.235
+
+kernel execution time:  1529.34 ms
+gemm time: 1529.83
+
+kernel execution time:  190620 ms
+taco reference time: 190621
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 64, vals: 173312
+D1_dimension: 2708, D2_dimension: 64, vals: 173312
+E1_dimension: 2708, E2_dimension: 64, vals: 173312
+G1_dimension: 2708, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  1.86163 ms
+fused time: 2.34746
+
+kernel execution time:  0.542927 ms
+sddmm time: 1.05528
+
+kernel execution time:  0.541998 ms
+sddmm ryan time: 1.07672
+
+kernel execution time:  0.524767 ms
+spmm ryan time: 0.944293
+
+kernel execution time:  0.75947 ms
+gemm time: 1.2162
+
+kernel execution time:  36.3755 ms
+taco reference time: 37.0989
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 64, vals: 173312
+D1_dimension: 2708, D2_dimension: 64, vals: 173312
+E1_dimension: 2708, E2_dimension: 64, vals: 173312
+G1_dimension: 2708, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  1.97375 ms
+fused time: 2.84436
+
+kernel execution time:  0.881212 ms
+sddmm time: 1.38907
+
+kernel execution time:  0.545557 ms
+sddmm ryan time: 1.0807
+
+kernel execution time:  0.548488 ms
+spmm ryan time: 0.978813
+
+kernel execution time:  0.72955 ms
+gemm time: 1.2023
+
+kernel execution time:  34.867 ms
+taco reference time: 35.5819
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 64, vals: 173312
+D1_dimension: 2708, D2_dimension: 64, vals: 173312
+E1_dimension: 2708, E2_dimension: 64, vals: 173312
+G1_dimension: 2708, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  1.69165 ms
+fused time: 2.2114
+
+kernel execution time:  0.908102 ms
+sddmm time: 1.19792
+
+kernel execution time:  0.513137 ms
+sddmm ryan time: 0.807571
+
+kernel execution time:  0.510327 ms
+spmm ryan time: 0.76134
+
+kernel execution time:  0.803101 ms
+gemm time: 1.0684
+
+kernel execution time:  45.9784 ms
+taco reference time: 46.3901
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 64, vals: 173312
+D1_dimension: 2708, D2_dimension: 64, vals: 173312
+E1_dimension: 2708, E2_dimension: 64, vals: 173312
+G1_dimension: 2708, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  1.82354 ms
+fused time: 2.81223
+
+kernel execution time:  0.926052 ms
+sddmm time: 1.48292
+
+kernel execution time:  0.564157 ms
+sddmm ryan time: 1.14611
+
+kernel execution time:  0.512447 ms
+spmm ryan time: 0.925102
+
+kernel execution time:  0.689109 ms
+gemm time: 1.08196
+
+kernel execution time:  34.7847 ms
+taco reference time: 35.4182
+
+sddmm-spmm-gemm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 64, vals: 173312
+D1_dimension: 2708, D2_dimension: 64, vals: 173312
+E1_dimension: 2708, E2_dimension: 64, vals: 173312
+G1_dimension: 2708, G2_dimension: 64, vals: 4096
+
+
+kernel execution time:  6.8174 ms
+fused time: 7.69061
+
+kernel execution time:  0.935843 ms
+sddmm time: 1.46847
+
+kernel execution time:  0.612468 ms
+sddmm ryan time: 0.880662
+
+kernel execution time:  0.831351 ms
+spmm ryan time: 1.05745
+
+kernel execution time:  7.58342 ms
+gemm time: 7.82297
+
+kernel execution time:  566.881 ms
+taco reference time: 567.264
diff --git a/test/stats/sddmm-spmm.txt b/test/stats/sddmm-spmm.txt
new file mode 100644
index 000000000..df8d924b8
--- /dev/null
+++ b/test/stats/sddmm-spmm.txt
@@ -0,0 +1,5995 @@
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 102, B2_dimension: 103, vals: 3149
+C1_dimension: 102, C2_dimension: 64, vals: 6528
+D1_dimension: 103, D2_dimension: 64, vals: 6592
+E1_dimension: 103, E2_dimension: 48, vals: 4944
+
+
+kernel execution time:  6223.98 ms
+fused time: 6225.14
+
+kernel execution time:  3659.4 ms
+sddmm time: 3660.83
+
+kernel execution time:  3145.85 ms
+spmm time: 3146.77
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  17.1703 ms
+fused time: 17.6378
+
+kernel execution time:  8.23135 ms
+sddmm time: 8.77073
+
+kernel execution time:  19.3034 ms
+spmm time: 19.7426
+
+kernel execution time:  514.133 ms
+taco reference time: 514.662
+
+mtx dim1 dim2 nnz fused sddmm spmm taco-original
+bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  163.616 ms
+fused time: 164.099
+
+kernel execution time:  81.2672 ms
+sddmm time: 81.8014
+
+kernel execution time:  294.454 ms
+spmm time: 294.968
+
+kernel execution time:  5149.58 ms
+taco reference time: 5150.58
+
+mtx dim1 dim2 nnz fused sddmm spmm taco-original
+bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662
+pdb1HYS 36417 36417 4344765 163.616 81.2672 294.454 5149.58
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+
+
+kernel execution time:  92.8319 ms
+fused time: 93.3139
+
+kernel execution time:  45.3221 ms
+sddmm time: 45.8599
+
+kernel execution time:  136.693 ms
+spmm time: 137.198
+
+kernel execution time:  2824.95 ms
+taco reference time: 2825.53
+
+mtx dim1 dim2 nnz fused sddmm spmm taco-original
+bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662
+pdb1HYS 36417 36417 4344765 163.616 81.2672 294.454 5149.58
+rma10 46835 46835 2374001 92.8319 45.3221 136.693 2824.95
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  153.867 ms
+fused time: 154.368
+
+kernel execution time:  74.9071 ms
+sddmm time: 75.4719
+
+kernel execution time:  258.678 ms
+spmm time: 259.209
+
+kernel execution time:  4786.95 ms
+taco reference time: 4788.05
+
+mtx dim1 dim2 nnz fused sddmm spmm taco-original
+bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662
+pdb1HYS 36417 36417 4344765 163.616 81.2672 294.454 5149.58
+rma10 46835 46835 2374001 92.8319 45.3221 136.693 2824.95
+cant 62451 62451 4007383 153.867 74.9071 258.678 4786.95
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 64, vals: 5333376
+D1_dimension: 83334, D2_dimension: 64, vals: 5333376
+E1_dimension: 83334, E2_dimension: 64, vals: 5333376
+
+
+kernel execution time:  231.253 ms
+fused time: 231.75
+
+kernel execution time:  112.863 ms
+sddmm time: 113.405
+
+kernel execution time:  417.749 ms
+spmm time: 418.285
+
+kernel execution time:  7133.75 ms
+taco reference time: 7134.88
+
+consph 83334 83334 6010480 231.253 112.863 417.749 7133.75
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  192.743 ms
+fused time: 193.23
+
+kernel execution time:  85.0563 ms
+sddmm time: 85.6227
+
+kernel execution time:  150.367 ms
+spmm time: 150.908
+
+kernel execution time:  3285.24 ms
+taco reference time: 3286.37
+
+cop20k_A 121192 121192 2624331 192.743 85.0563 150.367 3285.24
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 64, vals: 9015936
+D1_dimension: 140874, D2_dimension: 64, vals: 9015936
+E1_dimension: 140874, E2_dimension: 64, vals: 9015936
+
+
+kernel execution time:  307.481 ms
+fused time: 307.98
+
+kernel execution time:  150.621 ms
+sddmm time: 151.15
+
+kernel execution time:  451.195 ms
+spmm time: 451.689
+
+kernel execution time:  9393.95 ms
+taco reference time: 9395.02
+
+shipsec1 140874 140874 7813404 307.481 150.621 451.195 9393.95
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 64, vals: 10943872
+D1_dimension: 170998, D2_dimension: 64, vals: 10943872
+E1_dimension: 170998, E2_dimension: 64, vals: 10943872
+
+
+kernel execution time:  85.4659 ms
+fused time: 85.9614
+
+kernel execution time:  34.7139 ms
+sddmm time: 35.2946
+
+kernel execution time:  71.0646 ms
+spmm time: 71.6139
+
+kernel execution time:  1234.06 ms
+taco reference time: 1234.68
+
+scircuit 170998 170998 958936 85.4659 34.7139 71.0646 1234.06
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 206500, E2_dimension: 64, vals: 13216000
+
+
+kernel execution time:  88.3959 ms
+fused time: 88.8687
+
+kernel execution time:  36.7565 ms
+sddmm time: 37.3021
+
+kernel execution time:  80.2217 ms
+spmm time: 80.7621
+
+kernel execution time:  1588.94 ms
+taco reference time: 1589.58
+
+mac_econ_fwd500 206500 206500 1273389 88.3959 36.7565 80.2217 1588.94
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+
+
+kernel execution time:  244.992 ms
+fused time: 245.482
+
+kernel execution time:  86.8711 ms
+sddmm time: 87.4084
+
+kernel execution time:  245.054 ms
+spmm time: 245.552
+
+kernel execution time:  3952.47 ms
+taco reference time: 3953.57
+
+webbase-1M 1000005 1000005 3105536 244.992 86.8711 245.054 3952.47
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  3275.48 ms
+fused time: 3276.44
+
+kernel execution time:  1522.51 ms
+sddmm time: 1523.05
+
+kernel execution time:  7164.88 ms
+spmm time: 7165.87
+
+kernel execution time:  84078.7 ms
+taco reference time: 84079.8
+
+circuit5M 5558326 5558326 59524291 3275.48 1522.51 7164.88 84078.7
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  62.8847 ms
+fused time: 63.418
+
+kernel execution time:  561.815 ms
+sddmm time: 562.479
+
+kernel execution time:  62.7688 ms
+spmm time: 63.4747
+
+kernel execution time:  727.65 ms
+taco reference time: 728.755
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  1121.74 ms
+fused time: 1122.26
+
+kernel execution time:  524.494 ms
+sddmm time: 525.084
+
+kernel execution time:  602.517 ms
+spmm time: 603.056
+
+kernel execution time:  38095.2 ms
+taco reference time: 38096.3
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  1129.96 ms
+fused time: 1130.47
+
+kernel execution time:  528.571 ms
+sddmm time: 529.152
+
+kernel execution time:  611.108 ms
+spmm time: 611.643
+
+kernel execution time:  38230.1 ms
+taco reference time: 38231.1
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  63.6404 ms
+fused time: 64.1428
+
+kernel execution time:  562.966 ms
+sddmm time: 563.609
+
+kernel execution time:  62.5981 ms
+spmm time: 63.1044
+
+kernel execution time:  728.068 ms
+taco reference time: 729.005
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  62.7795 ms
+fused time: 63.2831
+
+kernel execution time:  564.376 ms
+sddmm time: 565.025
+
+kernel execution time:  62.8883 ms
+spmm time: 63.4116
+
+kernel execution time:  727.567 ms
+taco reference time: 728.511
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  68.4674 ms
+fused time: 68.9896
+
+kernel execution time:  563.596 ms
+sddmm time: 564.267
+
+kernel execution time:  62.5779 ms
+spmm time: 63.0812
+
+kernel execution time:  730.226 ms
+taco reference time: 731.124
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  56.5639 ms
+fused time: 57.0618
+
+kernel execution time:  562.554 ms
+sddmm time: 563.193
+
+kernel execution time:  62.6038 ms
+spmm time: 63.1209
+
+kernel execution time:  730.018 ms
+taco reference time: 730.906
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  66.7636 ms
+fused time: 67.2669
+
+kernel execution time:  564.075 ms
+sddmm time: 564.809
+
+kernel execution time:  62.9335 ms
+spmm time: 63.4347
+
+kernel execution time:  727.588 ms
+taco reference time: 728.484
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  55.1612 ms
+fused time: 55.6765
+
+kernel execution time:  574.602 ms
+sddmm time: 575.262
+
+kernel execution time:  62.2801 ms
+spmm time: 62.7918
+
+kernel execution time:  738.027 ms
+taco reference time: 738.739
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  864.868 ms
+fused time: 865.374
+
+kernel execution time:  544.426 ms
+sddmm time: 545.045
+
+kernel execution time:  377.977 ms
+spmm time: 378.522
+
+kernel execution time:  19947 ms
+taco reference time: 19948.1
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  71.685 ms
+fused time: 72.1905
+
+kernel execution time:  548.984 ms
+sddmm time: 549.581
+
+kernel execution time:  51.9969 ms
+spmm time: 52.562
+
+kernel execution time:  969.838 ms
+taco reference time: 970.48
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  56.1268 ms
+fused time: 56.6263
+
+kernel execution time:  566.523 ms
+sddmm time: 567.123
+
+kernel execution time:  60.4097 ms
+spmm time: 60.9402
+
+kernel execution time:  757.174 ms
+taco reference time: 757.82
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+24 threads
+
+kernel execution time:  119.302 ms
+fused time: 119.817
+
+kernel execution time:  550.24 ms
+sddmm time: 550.791
+
+kernel execution time:  49.3294 ms
+spmm time: 49.8462
+
+kernel execution time:  1710.98 ms
+taco reference time: 1711.58
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+2 threads
+
+kernel execution time:  832.831 ms
+fused time: 833.337
+
+kernel execution time:  543.518 ms
+sddmm time: 544.133
+
+kernel execution time:  372.721 ms
+spmm time: 373.277
+
+kernel execution time:  19871.7 ms
+taco reference time: 19873
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  1177.5 ms
+fused time: 1178
+
+kernel execution time:  547.532 ms
+sddmm time: 548.083
+
+kernel execution time:  618.83 ms
+spmm time: 619.38
+
+kernel execution time:  39590.7 ms
+taco reference time: 39591.8
+
+
+
+---------- 24 threads
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  18.2194 ms
+fused time: 18.6902
+
+kernel execution time:  80.3278 ms
+sddmm time: 80.7347
+
+kernel execution time:  5.17506 ms
+spmm time: 5.64137
+
+kernel execution time:  275.571 ms
+taco reference time: 275.978
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  159.53 ms
+fused time: 160.016
+
+kernel execution time:  814.453 ms
+sddmm time: 814.988
+
+kernel execution time:  41.9148 ms
+spmm time: 42.4142
+
+kernel execution time:  2782.76 ms
+taco reference time: 2783.34
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+
+
+kernel execution time:  80.1703 ms
+fused time: 80.65
+
+kernel execution time:  442.648 ms
+sddmm time: 443.191
+
+kernel execution time:  27.375 ms
+spmm time: 27.8981
+
+kernel execution time:  1518.49 ms
+taco reference time: 1519.1
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  147.378 ms
+fused time: 147.862
+
+kernel execution time:  746.182 ms
+sddmm time: 746.722
+
+kernel execution time:  43.521 ms
+spmm time: 44.0217
+
+kernel execution time:  2560.78 ms
+taco reference time: 2561.36
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 64, vals: 5333376
+D1_dimension: 83334, D2_dimension: 64, vals: 5333376
+E1_dimension: 83334, E2_dimension: 64, vals: 5333376
+
+
+kernel execution time:  220.568 ms
+fused time: 221.066
+
+kernel execution time:  1121.47 ms
+sddmm time: 1122.03
+
+kernel execution time:  61.8518 ms
+spmm time: 62.3779
+
+kernel execution time:  3844.87 ms
+taco reference time: 3845.8
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  118.211 ms
+fused time: 118.715
+
+kernel execution time:  552.77 ms
+sddmm time: 553.326
+
+kernel execution time:  49.2278 ms
+spmm time: 49.7369
+
+kernel execution time:  1713.01 ms
+taco reference time: 1713.63
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 64, vals: 9015936
+D1_dimension: 140874, D2_dimension: 64, vals: 9015936
+E1_dimension: 140874, E2_dimension: 64, vals: 9015936
+
+
+kernel execution time:  300.972 ms
+fused time: 301.471
+
+kernel execution time:  1461.86 ms
+sddmm time: 1462.45
+
+kernel execution time:  89.5313 ms
+spmm time: 90.0418
+
+kernel execution time:  5010.7 ms
+taco reference time: 5011.67
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 64, vals: 10943872
+D1_dimension: 170998, D2_dimension: 64, vals: 10943872
+E1_dimension: 170998, E2_dimension: 64, vals: 10943872
+
+
+kernel execution time:  52.5196 ms
+fused time: 53.0296
+
+kernel execution time:  210.075 ms
+sddmm time: 210.666
+
+kernel execution time:  67.487 ms
+spmm time: 68.0293
+
+kernel execution time:  632.81 ms
+taco reference time: 633.445
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 206500, E2_dimension: 64, vals: 13216000
+
+
+kernel execution time:  60.3333 ms
+fused time: 60.8277
+
+kernel execution time:  261.834 ms
+sddmm time: 262.379
+
+kernel execution time:  82.326 ms
+spmm time: 82.838
+
+kernel execution time:  836.401 ms
+taco reference time: 837.023
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+
+
+kernel execution time:  187.296 ms
+fused time: 187.792
+
+kernel execution time:  616.026 ms
+sddmm time: 616.601
+
+kernel execution time:  382.801 ms
+spmm time: 383.307
+
+kernel execution time:  2082.34 ms
+taco reference time: 2082.95
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  2499.56 ms
+fused time: 2500.39
+
+kernel execution time:  11463.5 ms
+sddmm time: 11464.5
+
+kernel execution time:  2581.49 ms
+spmm time: 2582.04
+
+kernel execution time:  39683.3 ms
+taco reference time: 39684.4
+
+
+
+
+
+--------------------
+---------------------
+
+
+
+
+
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  163.669 ms
+fused time: 164.155
+
+kernel execution time:  79.1673 ms
+sddmm time: 79.7118
+
+kernel execution time:  88.6347 ms
+spmm time: 89.0784
+
+kernel execution time:  6143.97 ms
+taco reference time: 6144.94
+
+
+163.669 79.1673 88.6347 6144.94
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  17.2275 ms
+fused time: 17.6988
+
+kernel execution time:  8.26223 ms
+sddmm time: 8.8233
+
+kernel execution time:  19.3989 ms
+spmm time: 19.8422
+
+kernel execution time:  519.537 ms
+taco reference time: 520.073
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  3.03999 ms
+fused time: 3.51084
+
+kernel execution time:  8.19604 ms
+sddmm time: 8.67702
+
+kernel execution time:  5.63342 ms
+spmm time: 6.05327
+
+kernel execution time:  25.6437 ms
+taco reference time: 26.0382
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  41.03 ms
+fused time: 41.5262
+
+kernel execution time:  82.5401 ms
+sddmm time: 83.1745
+
+kernel execution time:  15.9687 ms
+spmm time: 16.5644
+
+kernel execution time:  244.774 ms
+taco reference time: 245.387
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+
+
+kernel execution time:  27.5081 ms
+fused time: 28.0034
+
+kernel execution time:  45.9865 ms
+sddmm time: 46.5649
+
+kernel execution time:  20.0912 ms
+spmm time: 20.6288
+
+kernel execution time:  138.544 ms
+taco reference time: 139.148
+
+
+----------
+-----------
+
+
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  3.25222 ms
+fused time: 3.71775
+
+kernel execution time:  8.13173 ms
+sddmm time: 8.56798
+
+kernel execution time:  5.42295 ms
+spmm time: 5.85093
+
+kernel execution time:  25.1419 ms
+taco reference time: 25.5332
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  40.046 ms
+fused time: 40.5327
+
+kernel execution time:  82.7374 ms
+sddmm time: 83.308
+
+kernel execution time:  17.148 ms
+spmm time: 17.6723
+
+kernel execution time:  244.434 ms
+taco reference time: 245.084
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+
+
+kernel execution time:  27.3917 ms
+fused time: 27.8878
+
+kernel execution time:  46.1218 ms
+sddmm time: 46.7015
+
+kernel execution time:  19.567 ms
+spmm time: 20.0877
+
+kernel execution time:  136.269 ms
+taco reference time: 136.877
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  42.3074 ms
+fused time: 42.8144
+
+kernel execution time:  75.8411 ms
+sddmm time: 76.427
+
+kernel execution time:  25.5141 ms
+spmm time: 26.0647
+
+kernel execution time:  229.9 ms
+taco reference time: 230.514
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 64, vals: 5333376
+D1_dimension: 83334, D2_dimension: 64, vals: 5333376
+E1_dimension: 83334, E2_dimension: 64, vals: 5333376
+
+
+kernel execution time:  57.3193 ms
+fused time: 57.8292
+
+kernel execution time:  115.953 ms
+sddmm time: 116.536
+
+kernel execution time:  31.4256 ms
+spmm time: 31.9698
+
+kernel execution time:  344.97 ms
+taco reference time: 345.594
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  58.8731 ms
+fused time: 59.371
+
+kernel execution time:  96.3746 ms
+sddmm time: 96.9431
+
+kernel execution time:  52.3502 ms
+spmm time: 52.8781
+
+kernel execution time:  176.858 ms
+taco reference time: 177.482
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 64, vals: 9015936
+D1_dimension: 140874, D2_dimension: 64, vals: 9015936
+E1_dimension: 140874, E2_dimension: 64, vals: 9015936
+
+
+kernel execution time:  97.3646 ms
+fused time: 97.869
+
+kernel execution time:  154.708 ms
+sddmm time: 155.284
+
+kernel execution time:  61.8392 ms
+spmm time: 62.3666
+
+kernel execution time:  455.127 ms
+taco reference time: 455.719
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 64, vals: 10943872
+D1_dimension: 170998, D2_dimension: 64, vals: 10943872
+E1_dimension: 170998, E2_dimension: 64, vals: 10943872
+
+
+kernel execution time:  30.2488 ms
+fused time: 30.744
+
+kernel execution time:  39.9852 ms
+sddmm time: 40.5654
+
+kernel execution time:  67.5062 ms
+spmm time: 68.0413
+
+kernel execution time:  74.4023 ms
+taco reference time: 75.0271
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 206500, E2_dimension: 64, vals: 13216000
+
+
+kernel execution time:  34.9737 ms
+fused time: 35.4724
+
+kernel execution time:  39.6662 ms
+sddmm time: 40.2179
+
+kernel execution time:  82.4413 ms
+spmm time: 82.9627
+
+kernel execution time:  91.1415 ms
+taco reference time: 91.8035
+
+sddmm-spmm execution
+
+-----------------------------------------
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+
+
+kernel execution time:  118.92 ms
+fused time: 119.4
+
+kernel execution time:  90.6065 ms
+sddmm time: 91.1522
+
+kernel execution time:  390.342 ms
+spmm time: 390.863
+
+kernel execution time:  423.16 ms
+taco reference time: 423.757
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  1158.96 ms
+fused time: 1159.93
+
+kernel execution time:  1561.31 ms
+sddmm time: 1561.87
+
+kernel execution time:  2533.87 ms
+spmm time: 2534.43
+
+kernel execution time:  6529.81 ms
+taco reference time: 6530.95
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  3.12799 ms
+fused time: 3.5888
+
+kernel execution time:  8.20063 ms
+sddmm time: 8.64883
+
+kernel execution time:  5.23889 ms
+spmm time: 5.67244
+
+kernel execution time:  25.0758 ms
+taco reference time: 25.4671
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  39.3104 ms
+fused time: 39.7945
+
+kernel execution time:  82.5126 ms
+sddmm time: 83.0785
+
+kernel execution time:  15.6324 ms
+spmm time: 16.1739
+
+kernel execution time:  245.768 ms
+taco reference time: 246.406
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 102, B2_dimension: 103, vals: 3149
+C1_dimension: 102, C2_dimension: 64, vals: 6528
+D1_dimension: 103, D2_dimension: 64, vals: 6592
+E1_dimension: 103, E2_dimension: 64, vals: 6592
+
+
+kernel execution time:  0.160132 ms
+fused time: 0.567098
+
+kernel execution time:  0.065981 ms
+sddmm time: 0.853092
+
+kernel execution time:  0.081641 ms
+spmm time: 0.331655
+
+kernel execution time:  0.336385 ms
+taco reference time: 1.05356
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.0165 ms
+fused time: 0.78845
+
+kernel execution time:  0.011641 ms
+sddmm time: 0.873231
+
+kernel execution time:  0.011011 ms
+spmm time: 0.486977
+
+kernel execution time:  0.059631 ms
+taco reference time: 0.958413
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.01989 ms
+fused time: 0.813381
+
+kernel execution time:  0.01392 ms
+sddmm time: 0.976913
+
+kernel execution time:  0.013151 ms
+spmm time: 0.497287
+
+kernel execution time:  0.058 ms
+taco reference time: 0.974083
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.0192 ms
+fused time: 0.8019
+
+kernel execution time:  0.012991 ms
+sddmm time: 0.990253
+
+kernel execution time:  0.01291 ms
+spmm time: 0.490396
+
+kernel execution time:  0.057891 ms
+taco reference time: 0.929332
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.01797 ms
+fused time: 0.779061
+
+kernel execution time:  0.013 ms
+sddmm time: 0.7717
+
+kernel execution time:  0.01429 ms
+spmm time: 0.487296
+
+kernel execution time:  0.05764 ms
+taco reference time: 0.72862
+
+sddmm-spmm execution
+
+
+
+sddmm with parallel execution
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.02088 ms
+fused time: 0.912153
+
+kernel execution time:  0.01161 ms
+sddmm time: 0.944402
+
+kernel execution time:  0.01292 ms
+spmm time: 0.562267
+
+kernel execution time:  0.067781 ms
+taco reference time: 1.10908
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  166.429 ms
+fused time: 166.938
+
+kernel execution time:  83.0174 ms
+sddmm time: 83.5946
+
+kernel execution time:  303.7 ms
+spmm time: 304.246
+
+kernel execution time:  5227.75 ms
+taco reference time: 5228.77
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  166.755 ms
+fused time: 167.262
+
+kernel execution time:  83.1762 ms
+sddmm time: 83.7333
+
+kernel execution time:  303.525 ms
+spmm time: 304.051
+
+kernel execution time:  5232.78 ms
+taco reference time: 5233.91
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  27.2912 ms
+fused time: 27.7968
+
+kernel execution time:  84.1751 ms
+sddmm time: 84.7569
+
+kernel execution time:  12.6781 ms
+spmm time: 13.1881
+
+kernel execution time:  134.209 ms
+taco reference time: 134.846
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  26.6207 ms
+fused time: 27.1299
+
+kernel execution time:  86.3046 ms
+sddmm time: 86.9394
+
+kernel execution time:  12.7749 ms
+spmm time: 13.2807
+
+kernel execution time:  130.582 ms
+taco reference time: 131.278
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  101.848 ms
+fused time: 102.362
+
+kernel execution time:  83.9029 ms
+sddmm time: 84.4969
+
+kernel execution time:  42.5674 ms
+spmm time: 43.1242
+
+kernel execution time:  708.807 ms
+taco reference time: 709.518
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  107.29 ms
+fused time: 107.797
+
+kernel execution time:  83.8499 ms
+sddmm time: 84.3953
+
+kernel execution time:  43.5065 ms
+spmm time: 44.0135
+
+kernel execution time:  705.909 ms
+taco reference time: 706.511
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  26.2026 ms
+fused time: 26.7322
+
+kernel execution time:  86.809 ms
+sddmm time: 87.4374
+
+kernel execution time:  12.6681 ms
+spmm time: 13.1758
+
+kernel execution time:  130.015 ms
+taco reference time: 130.717
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  4.9002 ms
+fused time: 5.40296
+
+kernel execution time:  9.21483 ms
+sddmm time: 9.69115
+
+kernel execution time:  5.35955 ms
+spmm time: 5.79675
+
+kernel execution time:  14.9148 ms
+taco reference time: 15.4012
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  2.39607 ms
+fused time: 2.86927
+
+kernel execution time:  8.62899 ms
+sddmm time: 8.97544
+
+kernel execution time:  5.41841 ms
+spmm time: 5.83089
+
+kernel execution time:  14.2058 ms
+taco reference time: 14.5956
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  1.85339 ms
+fused time: 2.66762
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  4.94195 ms
+fused time: 6.0647
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  5.09918 ms
+fused time: 6.23075
+
+kernel execution time:  14.2105 ms
+sddmm time: 15.026
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  4.93573 ms
+fused time: 5.42636
+
+kernel execution time:  8.35333 ms
+sddmm time: 8.77215
+
+kernel execution time:  5.35189 ms
+spmm time: 5.7874
+
+kernel execution time:  15.4744 ms
+taco reference time: 15.8619
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  1.72938 ms
+fused time: 2.19226
+
+kernel execution time:  8.38474 ms
+sddmm time: 8.70208
+
+kernel execution time:  5.55896 ms
+spmm time: 5.96847
+
+kernel execution time:  13.8271 ms
+taco reference time: 14.2228
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  1.99224 ms
+fused time: 2.45758
+
+kernel execution time:  8.4613 ms
+sddmm time: 8.79168
+
+kernel execution time:  5.51595 ms
+spmm time: 5.95761
+
+kernel execution time:  13.5919 ms
+taco reference time: 13.973
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  2.17974 ms
+fused time: 2.64915
+
+kernel execution time:  9.49553 ms
+sddmm time: 9.89178
+
+kernel execution time:  5.3851 ms
+spmm time: 5.80552
+
+kernel execution time:  15.1854 ms
+taco reference time: 15.6294
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  1.77985 ms
+fused time: 2.24554
+
+kernel execution time:  9.31643 ms
+sddmm time: 9.66639
+
+kernel execution time:  5.48351 ms
+spmm time: 5.89775
+
+kernel execution time:  15.1635 ms
+taco reference time: 15.6173
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  2.09062 ms
+fused time: 2.75986
+
+kernel execution time:  8.53961 ms
+sddmm time: 8.99868
+
+kernel execution time:  5.43386 ms
+spmm time: 5.86914
+
+kernel execution time:  14.7848 ms
+taco reference time: 15.2128
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  1.99345 ms
+fused time: 2.4639
+
+kernel execution time:  10.0509 ms
+sddmm time: 10.4945
+
+kernel execution time:  5.37643 ms
+spmm time: 5.82607
+
+kernel execution time:  15.0911 ms
+taco reference time: 15.5753
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  2.14705 ms
+fused time: 2.62359
+
+kernel execution time:  9.35781 ms
+sddmm time: 9.71116
+
+kernel execution time:  6.0153 ms
+spmm time: 6.42121
+
+kernel execution time:  14.8814 ms
+taco reference time: 15.3035
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  3.85621 ms
+fused time: 4.31728
+
+kernel execution time:  8.49591 ms
+sddmm time: 8.85325
+
+kernel execution time:  4.55458 ms
+spmm time: 5.00309
+
+kernel execution time:  71.693 ms
+taco reference time: 72.1249
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  4.4083 ms
+fused time: 4.87449
+
+kernel execution time:  9.23609 ms
+sddmm time: 9.68592
+
+kernel execution time:  4.52337 ms
+spmm time: 4.93316
+
+kernel execution time:  75.7983 ms
+taco reference time: 76.2419
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  2.02675 ms
+fused time: 2.47188
+
+kernel execution time:  9.25498 ms
+sddmm time: 9.67129
+
+kernel execution time:  5.23325 ms
+spmm time: 5.68302
+
+kernel execution time:  14.8775 ms
+taco reference time: 15.3813
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  1.94846 ms
+fused time: 2.40322
+
+kernel execution time:  9.52502 ms
+sddmm time: 9.90909
+
+kernel execution time:  5.31443 ms
+spmm time: 5.71988
+
+kernel execution time:  15.7004 ms
+taco reference time: 16.1456
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  1.79798 ms
+fused time: 2.25022
+
+kernel execution time:  9.43793 ms
+sddmm time: 9.82708
+
+kernel execution time:  5.29275 ms
+spmm time: 5.69457
+
+kernel execution time:  14.9269 ms
+taco reference time: 15.3874
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  1.75935 ms
+fused time: 2.20095
+
+kernel execution time:  8.58506 ms
+sddmm time: 8.92534
+
+kernel execution time:  5.5533 ms
+spmm time: 5.93899
+
+kernel execution time:  14.2327 ms
+taco reference time: 14.5943
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  2.04599 ms
+fused time: 2.50059
+
+kernel execution time:  9.39166 ms
+sddmm time: 9.80431
+
+kernel execution time:  5.3514 ms
+spmm time: 5.75487
+
+kernel execution time:  15.0619 ms
+taco reference time: 15.497
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  1.9781 ms
+fused time: 2.41055
+
+kernel execution time:  8.50024 ms
+sddmm time: 8.81933
+
+kernel execution time:  5.28711 ms
+spmm time: 5.68452
+
+kernel execution time:  13.5108 ms
+taco reference time: 13.8766
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  11.5205 ms
+fused time: 12.2496
+
+kernel execution time:  0.00954 ms
+sddmm time: 0.935822
+
+kernel execution time:  0.02342 ms
+spmm time: 0.324625
+
+kernel execution time:  0.050091 ms
+taco reference time: 0.727519
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.235743 ms
+fused time: 0.969273
+
+kernel execution time:  0.01214 ms
+sddmm time: 0.981613
+
+kernel execution time:  0.03193 ms
+spmm time: 0.521637
+
+kernel execution time:  0.059391 ms
+taco reference time: 0.945792
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.235003 ms
+fused time: 0.964663
+
+kernel execution time:  0.013771 ms
+sddmm time: 1.23201
+
+kernel execution time:  0.027521 ms
+spmm time: 0.470876
+
+kernel execution time:  0.043441 ms
+taco reference time: 0.814271
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.242774 ms
+fused time: 0.984063
+
+kernel execution time:  0.01744 ms
+sddmm time: 1.07782
+
+kernel execution time:  0.03915 ms
+spmm time: 0.602928
+
+kernel execution time:  0.073381 ms
+taco reference time: 0.858301
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.199533 ms
+fused time: 0.604928
+
+kernel execution time:  0.00675 ms
+sddmm time: 0.983573
+
+kernel execution time:  0.02448 ms
+spmm time: 0.300224
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.192703 ms
+fused time: 0.575667
+
+kernel execution time:  0.00622 ms
+sddmm time: 0.863292
+
+kernel execution time:  0.0221 ms
+spmm time: 0.270204
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.195482 ms
+fused time: 0.580768
+
+kernel execution time:  0.00652 ms
+sddmm time: 0.957703
+
+kernel execution time:  0.025451 ms
+spmm time: 0.313074
+
+kernel execution time:  0.085611 ms
+taco reference time: 0.970753
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  2.00856 ms
+fused time: 2.45147
+
+kernel execution time:  8.5121 ms
+sddmm time: 8.95565
+
+kernel execution time:  5.46083 ms
+spmm time: 5.93676
+
+kernel execution time:  14.1411 ms
+taco reference time: 14.7397
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  9.91597 ms
+fused time: 10.4166
+
+kernel execution time:  85.127 ms
+sddmm time: 85.7297
+
+kernel execution time:  12.8101 ms
+spmm time: 13.3194
+
+kernel execution time:  129.721 ms
+taco reference time: 130.362
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  9.9746 ms
+fused time: 10.4536
+
+kernel execution time:  85.6921 ms
+sddmm time: 86.3192
+
+kernel execution time:  12.752 ms
+spmm time: 13.2448
+
+kernel execution time:  135.682 ms
+taco reference time: 136.351
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  10.0998 ms
+fused time: 10.5872
+
+kernel execution time:  85.0064 ms
+sddmm time: 85.6385
+
+kernel execution time:  12.6128 ms
+spmm time: 13.1169
+
+kernel execution time:  134.629 ms
+taco reference time: 135.323
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  10.1006 ms
+fused time: 10.5902
+
+kernel execution time:  88.2603 ms
+sddmm time: 88.897
+
+kernel execution time:  12.5197 ms
+spmm time: 13.0137
+
+kernel execution time:  130.3 ms
+taco reference time: 130.977
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  27.6596 ms
+fused time: 28.2096
+
+kernel execution time:  85.6018 ms
+sddmm time: 86.213
+
+kernel execution time:  12.8244 ms
+spmm time: 13.3343
+
+kernel execution time:  131.089 ms
+taco reference time: 131.789
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  26.582 ms
+fused time: 27.0673
+
+kernel execution time:  87.6048 ms
+sddmm time: 88.2462
+
+kernel execution time:  12.5643 ms
+spmm time: 13.0723
+
+kernel execution time:  130.366 ms
+taco reference time: 131.043
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  26.5615 ms
+fused time: 27.0713
+
+kernel execution time:  87.5473 ms
+sddmm time: 88.1848
+
+kernel execution time:  12.6726 ms
+spmm time: 13.152
+
+kernel execution time:  131.024 ms
+taco reference time: 131.701
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  26.3835 ms
+fused time: 26.8768
+
+kernel execution time:  84.7609 ms
+sddmm time: 85.3584
+
+kernel execution time:  12.8437 ms
+spmm time: 13.346
+
+kernel execution time:  132.548 ms
+taco reference time: 133.168
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  26.6808 ms
+fused time: 27.1679
+
+kernel execution time:  87.0948 ms
+sddmm time: 87.7219
+
+kernel execution time:  12.695 ms
+spmm time: 13.1923
+
+kernel execution time:  134.587 ms
+taco reference time: 135.255
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 5, E2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.235254 ms
+fused time: 1.04843
+
+kernel execution time:  0.01102 ms
+sddmm time: 0.989634
+
+kernel execution time:  0.028701 ms
+spmm time: 0.574108
+
+kernel execution time:  0.04363 ms
+taco reference time: 0.840431
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  4.9177 ms
+fused time: 5.37305
+
+kernel execution time:  8.31608 ms
+sddmm time: 8.76144
+
+kernel execution time:  5.43042 ms
+spmm time: 5.82157
+
+kernel execution time:  15.0881 ms
+taco reference time: 15.4618
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  175.005 ms
+fused time: 175.507
+
+kernel execution time:  83.4127 ms
+sddmm time: 83.9734
+
+kernel execution time:  14.3027 ms
+spmm time: 14.8133
+
+kernel execution time:  5196.98 ms
+taco reference time: 5198.39
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+
+
+kernel execution time:  96.7809 ms
+fused time: 97.2629
+
+kernel execution time:  46.666 ms
+sddmm time: 47.229
+
+kernel execution time:  23.9017 ms
+spmm time: 24.4045
+
+kernel execution time:  2871.87 ms
+taco reference time: 2872.47
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+
+
+kernel execution time:  98.4225 ms
+fused time: 98.9062
+
+kernel execution time:  46.8647 ms
+sddmm time: 47.4013
+
+kernel execution time:  22.9253 ms
+spmm time: 23.4505
+
+kernel execution time:  2873.94 ms
+taco reference time: 2874.59
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  174.126 ms
+fused time: 174.616
+
+kernel execution time:  83.7673 ms
+sddmm time: 84.3199
+
+kernel execution time:  13.0437 ms
+spmm time: 13.5625
+
+kernel execution time:  5227.23 ms
+taco reference time: 5228.25
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  27.6542 ms
+fused time: 28.1392
+
+kernel execution time:  85.8985 ms
+sddmm time: 86.5293
+
+kernel execution time:  12.6722 ms
+spmm time: 13.1883
+
+kernel execution time:  130.948 ms
+taco reference time: 131.642
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  102.4 ms
+fused time: 102.884
+
+kernel execution time:  83.5498 ms
+sddmm time: 84.1386
+
+kernel execution time:  42.5049 ms
+spmm time: 43.0426
+
+kernel execution time:  710.168 ms
+taco reference time: 710.765
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  43.9551 ms
+fused time: 44.6972
+
+kernel execution time:  87.6996 ms
+sddmm time: 89.4613
+
+kernel execution time:  18.2632 ms
+spmm time: 18.7804
+
+kernel execution time:  122.262 ms
+taco reference time: 123.152
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  47.9407 ms
+fused time: 48.4339
+
+kernel execution time:  89.2157 ms
+sddmm time: 89.8924
+
+kernel execution time:  18.2009 ms
+spmm time: 18.7261
+
+kernel execution time:  123.559 ms
+taco reference time: 124.405
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  43.2059 ms
+fused time: 43.6957
+
+kernel execution time:  90.4258 ms
+sddmm time: 91.1259
+
+kernel execution time:  18.2655 ms
+spmm time: 18.7701
+
+kernel execution time:  123.565 ms
+taco reference time: 124.302
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  48.4004 ms
+fused time: 48.9337
+
+kernel execution time:  85.0973 ms
+sddmm time: 85.6769
+
+kernel execution time:  18.1666 ms
+spmm time: 18.6607
+
+kernel execution time:  123.347 ms
+taco reference time: 124.257
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  25.3405 ms
+fused time: 25.8282
+
+kernel execution time:  87.1326 ms
+sddmm time: 87.7761
+
+kernel execution time:  12.9441 ms
+spmm time: 13.4425
+
+kernel execution time:  132.388 ms
+taco reference time: 133.056
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  26.5881 ms
+fused time: 27.0669
+
+kernel execution time:  85.9749 ms
+sddmm time: 86.5764
+
+kernel execution time:  12.5752 ms
+spmm time: 13.1009
+
+kernel execution time:  131.368 ms
+taco reference time: 132.072
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  177.141 ms
+fused time: 177.635
+
+kernel execution time:  83.6231 ms
+sddmm time: 84.2074
+
+kernel execution time:  303.927 ms
+spmm time: 304.455
+
+kernel execution time:  5553.72 ms
+taco reference time: 5554.89
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  177.24 ms
+fused time: 177.718
+
+kernel execution time:  83.5235 ms
+sddmm time: 84.0624
+
+kernel execution time:  299.135 ms
+spmm time: 299.642
+
+kernel execution time:  5568.94 ms
+taco reference time: 5570.07
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  177.334 ms
+fused time: 177.831
+
+kernel execution time:  83.7814 ms
+sddmm time: 84.3619
+
+kernel execution time:  302.13 ms
+spmm time: 302.653
+
+kernel execution time:  5535.64 ms
+taco reference time: 5536.87
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  180.923 ms
+fused time: 181.39
+
+kernel execution time:  88.0592 ms
+sddmm time: 88.6258
+
+kernel execution time:  300.533 ms
+spmm time: 301.047
+
+kernel execution time:  5549.25 ms
+taco reference time: 5550.45
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  27.7589 ms
+fused time: 28.2424
+
+kernel execution time:  87.4027 ms
+sddmm time: 88.0292
+
+kernel execution time:  13.0621 ms
+spmm time: 13.5896
+
+kernel execution time:  131.501 ms
+taco reference time: 132.191
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  27.1159 ms
+fused time: 27.6123
+
+kernel execution time:  88.1805 ms
+sddmm time: 88.8475
+
+kernel execution time:  13.2301 ms
+spmm time: 13.7512
+
+kernel execution time:  130.96 ms
+taco reference time: 131.633
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  11.1791 ms
+fused time: 11.6596
+
+kernel execution time:  324.829 ms
+sddmm time: 325.459
+
+kernel execution time:  5.82413 ms
+spmm time: 6.613
+
+kernel execution time:  162.505 ms
+taco reference time: 163.319
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  167.093 ms
+fused time: 167.577
+
+kernel execution time:  264.158 ms
+sddmm time: 264.712
+
+kernel execution time:  68.6915 ms
+spmm time: 69.2406
+
+kernel execution time:  5581.71 ms
+taco reference time: 5582.83
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  170.702 ms
+fused time: 171.176
+
+kernel execution time:  88.5905 ms
+sddmm time: 89.1447
+
+kernel execution time:  68.5964 ms
+spmm time: 69.1031
+
+kernel execution time:  5551.85 ms
+taco reference time: 5552.97
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  10.8645 ms
+fused time: 11.3531
+
+kernel execution time:  9.04029 ms
+sddmm time: 9.79108
+
+kernel execution time:  5.63795 ms
+spmm time: 6.23454
+
+kernel execution time:  131.822 ms
+taco reference time: 132.52
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+
+
+kernel execution time:  9.65163 ms
+fused time: 10.1436
+
+kernel execution time:  9.70327 ms
+sddmm time: 10.2929
+
+kernel execution time:  4.85235 ms
+spmm time: 5.40286
+
+kernel execution time:  74.2349 ms
+taco reference time: 74.8374
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  15.2637 ms
+fused time: 15.7881
+
+kernel execution time:  12.0484 ms
+sddmm time: 12.7139
+
+kernel execution time:  7.9269 ms
+spmm time: 8.5266
+
+kernel execution time:  122.713 ms
+taco reference time: 123.431
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  750.953 ms
+fused time: 751.849
+
+kernel execution time:  410.668 ms
+sddmm time: 411.252
+
+kernel execution time:  490.401 ms
+spmm time: 490.993
+
+kernel execution time:  7382.94 ms
+taco reference time: 7384.02
+
+
+
+--------------------------------
+
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  2.89124 ms
+fused time: 3.33064
+
+kernel execution time:  2.48885 ms
+sddmm time: 2.80581
+
+kernel execution time:  1.25714 ms
+sddmm time: 1.58645
+
+kernel execution time:  1.82611 ms
+spmm time: 2.10693
+
+kernel execution time:  14.7536 ms
+taco reference time: 15.1553
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  10.4526 ms
+fused time: 10.9812
+
+kernel execution time:  9.28251 ms
+sddmm time: 9.93109
+
+kernel execution time:  5.36035 ms
+sddmm time: 5.99358
+
+kernel execution time:  5.29728 ms
+spmm time: 5.86825
+
+kernel execution time:  132.268 ms
+taco reference time: 132.952
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+
+
+kernel execution time:  9.78667 ms
+fused time: 10.2677
+
+kernel execution time:  9.62847 ms
+sddmm time: 10.2355
+
+kernel execution time:  3.92285 ms
+sddmm time: 4.52461
+
+kernel execution time:  4.91246 ms
+spmm time: 5.38467
+
+kernel execution time:  74.8226 ms
+taco reference time: 75.4131
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 64, vals: 5333376
+D1_dimension: 83334, D2_dimension: 64, vals: 5333376
+E1_dimension: 83334, E2_dimension: 64, vals: 5333376
+
+
+kernel execution time:  19.7265 ms
+fused time: 20.2664
+
+kernel execution time:  17.1571 ms
+sddmm time: 17.8366
+
+kernel execution time:  10.5179 ms
+sddmm time: 11.1615
+
+kernel execution time:  10.7719 ms
+spmm time: 11.4141
+
+kernel execution time:  186.633 ms
+taco reference time: 187.406
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  28.3142 ms
+fused time: 28.8151
+
+kernel execution time:  20.3455 ms
+sddmm time: 21.0059
+
+kernel execution time:  12.2316 ms
+sddmm time: 12.8542
+
+kernel execution time:  13.8246 ms
+spmm time: 14.4268
+
+kernel execution time:  100.583 ms
+taco reference time: 101.304
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 64, vals: 10943872
+D1_dimension: 170998, D2_dimension: 64, vals: 10943872
+E1_dimension: 170998, E2_dimension: 64, vals: 10943872
+
+
+kernel execution time:  20.038 ms
+fused time: 20.555
+
+kernel execution time:  11.3385 ms
+sddmm time: 11.9822
+
+kernel execution time:  8.08082 ms
+sddmm time: 8.71341
+
+kernel execution time:  10.9562 ms
+spmm time: 11.5782
+
+kernel execution time:  80.9289 ms
+taco reference time: 81.6333
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 206500, E2_dimension: 64, vals: 13216000
+
+
+kernel execution time:  25.3126 ms
+fused time: 25.8254
+
+kernel execution time:  15.9278 ms
+sddmm time: 16.6406
+
+kernel execution time:  10.5087 ms
+sddmm time: 11.2503
+
+kernel execution time:  14.3281 ms
+spmm time: 14.9822
+
+kernel execution time:  98.03 ms
+taco reference time: 98.7014
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+
+
+kernel execution time:  77.5645 ms
+fused time: 78.0892
+
+kernel execution time:  31.7247 ms
+sddmm time: 32.4147
+
+kernel execution time:  26.0367 ms
+sddmm time: 26.7311
+
+kernel execution time:  47.1564 ms
+spmm time: 47.8767
+
+kernel execution time:  444.658 ms
+taco reference time: 445.356
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  760.552 ms
+fused time: 761.497
+
+kernel execution time:  414.806 ms
+sddmm time: 415.511
+
+kernel execution time:  347.288 ms
+sddmm time: 348.046
+
+kernel execution time:  493.652 ms
+spmm time: 494.215
+
+kernel execution time:  7069.3 ms
+taco reference time: 7070.64
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  14.868 ms
+fused time: 15.3593
+
+kernel execution time:  12.1237 ms
+sddmm time: 12.798
+
+kernel execution time:  7.68559 ms
+sddmm time: 8.34388
+
+kernel execution time:  7.93647 ms
+spmm time: 8.56812
+
+kernel execution time:  122.125 ms
+taco reference time: 122.846
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 64, vals: 9015936
+D1_dimension: 140874, D2_dimension: 64, vals: 9015936
+E1_dimension: 140874, E2_dimension: 64, vals: 9015936
+
+
+kernel execution time:  28.6635 ms
+fused time: 29.1538
+
+kernel execution time:  24.0642 ms
+sddmm time: 24.694
+
+kernel execution time:  15.2 ms
+sddmm time: 15.875
+
+kernel execution time:  16.0406 ms
+spmm time: 16.6827
+
+kernel execution time:  242.63 ms
+taco reference time: 243.336
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 140874, D2_dimension: 128, vals: 18031872
+E1_dimension: 140874, E2_dimension: 128, vals: 18031872
+
+
+kernel execution time:  50.9773 ms
+fused time: 51.4656
+
+kernel execution time:  42.0404 ms
+sddmm time: 42.7352
+
+kernel execution time:  24.4547 ms
+sddmm time: 25.1418
+
+kernel execution time:  28.4623 ms
+spmm time: 29.1722
+
+kernel execution time:  903.853 ms
+taco reference time: 904.701
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 250, vals: 35218500
+D1_dimension: 140874, D2_dimension: 250, vals: 35218500
+E1_dimension: 140874, E2_dimension: 250, vals: 35218500
+
+
+kernel execution time:  97.1385 ms
+fused time: 97.6193
+
+kernel execution time:  87.9795 ms
+sddmm time: 88.6535
+
+kernel execution time:  41.8878 ms
+sddmm time: 42.5463
+
+kernel execution time:  54.1433 ms
+spmm time: 54.7894
+
+kernel execution time:  3669.52 ms
+taco reference time: 3670.78
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 512, vals: 72127488
+D1_dimension: 140874, D2_dimension: 512, vals: 72127488
+E1_dimension: 140874, E2_dimension: 512, vals: 72127488
+
+
+kernel execution time:  200.849 ms
+fused time: 201.329
+
+kernel execution time:  208.737 ms
+sddmm time: 209.393
+
+kernel execution time:  81.0923 ms
+sddmm time: 81.7181
+
+kernel execution time:  106.669 ms
+spmm time: 107.272
+
+kernel execution time:  15631.7 ms
+taco reference time: 15632.4
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 32, vals: 4507968
+D1_dimension: 140874, D2_dimension: 32, vals: 4507968
+E1_dimension: 140874, E2_dimension: 32, vals: 4507968
+
+
+kernel execution time:  16.5631 ms
+fused time: 17.0602
+
+kernel execution time:  15.2542 ms
+sddmm time: 15.8919
+
+kernel execution time:  9.9104 ms
+sddmm time: 10.5671
+
+kernel execution time:  9.61101 ms
+spmm time: 10.2251
+
+kernel execution time:  68.1735 ms
+taco reference time: 68.8921
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 256, vals: 36063744
+D1_dimension: 140874, D2_dimension: 256, vals: 36063744
+E1_dimension: 140874, E2_dimension: 256, vals: 36063744
+
+
+kernel execution time:  98.882 ms
+fused time: 99.3547
+
+kernel execution time:  90.4755 ms
+sddmm time: 91.136
+
+kernel execution time:  42.7487 ms
+sddmm time: 43.4726
+
+kernel execution time:  55.0127 ms
+spmm time: 55.731
+
+kernel execution time:  3836.15 ms
+taco reference time: 3837.42
+
+
+
+
+
+--------- single threads
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+E1_dimension: 10974, E2_dimension: 64, vals: 702336
+
+
+kernel execution time:  22.3045 ms
+fused time: 22.7793
+
+kernel execution time:  8.91826 ms
+sddmm time: 9.46409
+
+kernel execution time:  9.62695 ms
+sddmm time: 10.1105
+
+kernel execution time:  10.8309 ms
+spmm time: 11.2862
+
+kernel execution time:  554.747 ms
+taco reference time: 555.315
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 36417, E2_dimension: 64, vals: 2330688
+
+
+kernel execution time:  166.569 ms
+fused time: 167.058
+
+kernel execution time:  83.9979 ms
+sddmm time: 84.5309
+
+kernel execution time:  88.9971 ms
+sddmm time: 89.5559
+
+kernel execution time:  68.5334 ms
+spmm time: 69.0587
+
+kernel execution time:  5562.04 ms
+taco reference time: 5563.12
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 46835, D2_dimension: 64, vals: 2997440
+E1_dimension: 46835, E2_dimension: 64, vals: 2997440
+
+
+kernel execution time:  94.7764 ms
+fused time: 95.2526
+
+kernel execution time:  47.3174 ms
+sddmm time: 47.8674
+
+kernel execution time:  49.7766 ms
+sddmm time: 50.3372
+
+kernel execution time:  51.3685 ms
+spmm time: 51.8719
+
+kernel execution time:  3073.44 ms
+taco reference time: 3074.55
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  158.175 ms
+fused time: 158.637
+
+kernel execution time:  78.3163 ms
+sddmm time: 78.8675
+
+kernel execution time:  82.3237 ms
+sddmm time: 82.8606
+
+kernel execution time:  76.2056 ms
+spmm time: 76.7067
+
+kernel execution time:  5178.46 ms
+taco reference time: 5179.53
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 64, vals: 5333376
+D1_dimension: 83334, D2_dimension: 64, vals: 5333376
+E1_dimension: 83334, E2_dimension: 64, vals: 5333376
+
+
+kernel execution time:  241.194 ms
+fused time: 241.676
+
+kernel execution time:  117.775 ms
+sddmm time: 118.325
+
+kernel execution time:  124.006 ms
+sddmm time: 124.563
+
+kernel execution time:  117.052 ms
+spmm time: 117.594
+
+kernel execution time:  7844.57 ms
+taco reference time: 7845.69
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+E1_dimension: 121192, E2_dimension: 64, vals: 7756288
+
+
+kernel execution time:  201.49 ms
+fused time: 201.973
+
+kernel execution time:  90.6759 ms
+sddmm time: 91.2506
+
+kernel execution time:  93.0462 ms
+sddmm time: 93.6053
+
+kernel execution time:  119.005 ms
+spmm time: 119.547
+
+kernel execution time:  3567.55 ms
+taco reference time: 3568.67
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 64, vals: 9015936
+D1_dimension: 140874, D2_dimension: 64, vals: 9015936
+E1_dimension: 140874, E2_dimension: 64, vals: 9015936
+
+
+kernel execution time:  315.238 ms
+fused time: 315.723
+
+kernel execution time:  156.048 ms
+sddmm time: 156.588
+
+kernel execution time:  164.148 ms
+sddmm time: 164.747
+
+kernel execution time:  162.502 ms
+spmm time: 163.021
+
+kernel execution time:  10131.2 ms
+taco reference time: 10132.3
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 64, vals: 10943872
+D1_dimension: 170998, D2_dimension: 64, vals: 10943872
+E1_dimension: 170998, E2_dimension: 64, vals: 10943872
+
+
+kernel execution time:  87.9511 ms
+fused time: 88.4267
+
+kernel execution time:  37.6228 ms
+sddmm time: 38.1792
+
+kernel execution time:  37.8418 ms
+sddmm time: 38.3903
+
+kernel execution time:  84.4997 ms
+spmm time: 85.037
+
+kernel execution time:  1330.01 ms
+taco reference time: 1330.63
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 206500, D2_dimension: 64, vals: 13216000
+E1_dimension: 206500, E2_dimension: 64, vals: 13216000
+
+
+kernel execution time:  92.8914 ms
+fused time: 93.3697
+
+kernel execution time:  39.7714 ms
+sddmm time: 40.3051
+
+kernel execution time:  40.1835 ms
+sddmm time: 40.7458
+
+kernel execution time:  98.0818 ms
+spmm time: 98.5997
+
+kernel execution time:  1721.01 ms
+taco reference time: 1721.64
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+
+
+kernel execution time:  259.845 ms
+fused time: 260.329
+
+kernel execution time:  95.8311 ms
+sddmm time: 96.3809
+
+kernel execution time:  97.6925 ms
+sddmm time: 98.2397
+
+kernel execution time:  292.415 ms
+spmm time: 292.952
+
+kernel execution time:  4292.03 ms
+taco reference time: 4293.1
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  3326.66 ms
+fused time: 3327.64
+
+kernel execution time:  1617.82 ms
+sddmm time: 1618.36
+
+kernel execution time:  1672.73 ms
+sddmm time: 1673.27
+
+kernel execution time:  3199.32 ms
+spmm time: 3200.35
+
+kernel execution time:  88682 ms
+taco reference time: 88683.1
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  722.484 ms
+fused time: 723.506
+
+kernel execution time:  613.844 ms
+sddmm time: 614.401
+
+kernel execution time:  331.43 ms
+sddmm time: 331.978
+
+kernel execution time:  463.752 ms
+spmm time: 464.328
+
+kernel execution time:  8864.13 ms
+taco reference time: 8865.18
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 16, vals: 2253984
+D1_dimension: 140874, D2_dimension: 16, vals: 2253984
+E1_dimension: 140874, E2_dimension: 16, vals: 2253984
+
+
+kernel execution time:  10.0607 ms
+fused time: 10.5457
+
+kernel execution time:  8.70278 ms
+sddmm time: 9.26539
+
+kernel execution time:  6.88021 ms
+sddmm time: 7.49853
+
+kernel execution time:  5.91127 ms
+spmm time: 6.50028
+
+kernel execution time:  23.776 ms
+taco reference time: 24.3947
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+
+
+kernel execution time:  179.752 ms
+fused time: 180.214
+
+kernel execution time:  170.678 ms
+sddmm time: 171.224
+
+kernel execution time:  67.5166 ms
+sddmm time: 68.0688
+
+kernel execution time:  168.557 ms
+spmm time: 169.083
+
+kernel execution time:  2452.7 ms
+taco reference time: 2453.34
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+
+
+kernel execution time:  111.508 ms
+fused time: 111.983
+
+kernel execution time:  171.316 ms
+sddmm time: 171.863
+
+kernel execution time:  40.3219 ms
+sddmm time: 40.8676
+
+kernel execution time:  91.8855 ms
+spmm time: 92.3888
+
+kernel execution time:  1349.98 ms
+taco reference time: 1350.57
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+
+
+kernel execution time:  84.4185 ms
+fused time: 84.8803
+
+kernel execution time:  131.898 ms
+sddmm time: 132.465
+
+kernel execution time:  27.6062 ms
+sddmm time: 28.2117
+
+kernel execution time:  59.0816 ms
+spmm time: 59.6189
+
+kernel execution time:  731.805 ms
+taco reference time: 732.441
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+
+
+kernel execution time:  76.4489 ms
+fused time: 76.9087
+
+kernel execution time:  65.9875 ms
+sddmm time: 66.5522
+
+kernel execution time:  25.2905 ms
+sddmm time: 25.8759
+
+kernel execution time:  50.1563 ms
+spmm time: 50.6842
+
+kernel execution time:  397.479 ms
+taco reference time: 398.109
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 1000005, D2_dimension: 64, vals: 64000320
+E1_dimension: 1000005, E2_dimension: 64, vals: 64000320
+
+
+kernel execution time:  74.0227 ms
+fused time: 74.5259
+
+kernel execution time:  40.2983 ms
+sddmm time: 40.889
+
+kernel execution time:  25.1349 ms
+sddmm time: 25.7522
+
+kernel execution time:  46.3853 ms
+spmm time: 46.9556
+
+kernel execution time:  418.693 ms
+taco reference time: 419.345
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  1982.06 ms
+fused time: 1982.93
+
+kernel execution time:  1668.23 ms
+sddmm time: 1668.77
+
+kernel execution time:  962.046 ms
+sddmm time: 962.591
+
+kernel execution time:  1821.97 ms
+spmm time: 1822.46
+
+kernel execution time:  47772.2 ms
+taco reference time: 47773.4
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  1143.12 ms
+fused time: 1144.05
+
+kernel execution time:  1254.57 ms
+sddmm time: 1255.18
+
+kernel execution time:  539.54 ms
+sddmm time: 540.136
+
+kernel execution time:  1005.14 ms
+spmm time: 1005.69
+
+kernel execution time:  25805.1 ms
+taco reference time: 25806.1
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  782.496 ms
+fused time: 783.574
+
+kernel execution time:  872.793 ms
+sddmm time: 873.351
+
+kernel execution time:  353.256 ms
+sddmm time: 353.8
+
+kernel execution time:  606.511 ms
+spmm time: 607.041
+
+kernel execution time:  15198.9 ms
+taco reference time: 15199.9
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  729.345 ms
+fused time: 730.242
+
+kernel execution time:  608.324 ms
+sddmm time: 608.908
+
+kernel execution time:  334.109 ms
+sddmm time: 334.653
+
+kernel execution time:  471.211 ms
+spmm time: 471.77
+
+kernel execution time:  8630.19 ms
+taco reference time: 8631.29
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864
+E1_dimension: 5558326, E2_dimension: 64, vals: 355732864
+
+
+kernel execution time:  736.326 ms
+fused time: 737.203
+
+kernel execution time:  482.639 ms
+sddmm time: 483.19
+
+kernel execution time:  333.58 ms
+sddmm time: 334.131
+
+kernel execution time:  478.49 ms
+spmm time: 479.051
+
+kernel execution time:  7244.99 ms
+taco reference time: 7246.13
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  13.4143 ms
+fused time: 13.9143
+
+kernel execution time:  11.2836 ms
+sddmm time: 12.0149
+
+kernel execution time:  7.35609 ms
+sddmm time: 8.06588
+
+kernel execution time:  7.36916 ms
+spmm time: 7.93476
+
+kernel execution time:  120.287 ms
+taco reference time: 120.948
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  156.322 ms
+fused time: 156.802
+
+kernel execution time:  77.0794 ms
+sddmm time: 77.6574
+
+kernel execution time:  81.2772 ms
+sddmm time: 81.8141
+
+kernel execution time:  74.4419 ms
+spmm time: 74.9538
+
+kernel execution time:  5091.25 ms
+taco reference time: 5092.34
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  160.868 ms
+fused time: 161.347
+
+kernel execution time:  78.1223 ms
+sddmm time: 78.7031
+
+kernel execution time:  82.4929 ms
+sddmm time: 83.0729
+
+kernel execution time:  77.24 ms
+spmm time: 77.7896
+
+kernel execution time:  5087.42 ms
+taco reference time: 5088.53
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  157.627 ms
+fused time: 158.106
+
+kernel execution time:  76.9497 ms
+sddmm time: 77.5265
+
+kernel execution time:  81.9491 ms
+sddmm time: 82.4945
+
+kernel execution time:  81.9841 ms
+spmm time: 82.5149
+
+kernel execution time:  5084.06 ms
+taco reference time: 5085.15
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  156.608 ms
+fused time: 157.085
+
+kernel execution time:  76.6969 ms
+sddmm time: 77.2366
+
+kernel execution time:  80.7238 ms
+sddmm time: 81.2624
+
+kernel execution time:  74.4498 ms
+spmm time: 74.9694
+
+kernel execution time:  5076.16 ms
+taco reference time: 5077.28
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  156.489 ms
+fused time: 156.996
+
+kernel execution time:  77.2215 ms
+sddmm time: 77.7763
+
+kernel execution time:  81.2983 ms
+sddmm time: 81.8357
+
+kernel execution time:  75.4752 ms
+spmm time: 76.0191
+
+kernel execution time:  5087.37 ms
+taco reference time: 5088.51
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 62451, D2_dimension: 64, vals: 3996864
+E1_dimension: 62451, E2_dimension: 64, vals: 3996864
+
+
+kernel execution time:  156.515 ms
+fused time: 156.991
+
+kernel execution time:  76.9797 ms
+sddmm time: 77.5298
+
+kernel execution time:  81.4654 ms
+sddmm time: 82.0017
+
+kernel execution time:  76.1847 ms
+spmm time: 76.693
+
+kernel execution time:  5078.68 ms
+taco reference time: 5079.85
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 64, vals: 320
+D1_dimension: 5, D2_dimension: 64, vals: 320
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 36417, D2_dimension: 64, vals: 2330688
+E1_dimension: 64, E2_dimension: 64, vals: 4096
+
+
+kernel execution time:  115.102 ms
+fused time: 115.803
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 128, vals: 10666752
+D1_dimension: 83334, D2_dimension: 128, vals: 10666752
+E1_dimension: 83334, E2_dimension: 128, vals: 10666752
+
+
+kernel execution time:  30.977 ms
+fused time: 35.4912
+
+separate execution
+
+kernel execution time:  26.0898 ms
+sddmm time: 26.6915
+
+kernel execution time:  15.4341 ms
+sddmm time: 16.0058
+
+kernel execution time:  17.7466 ms
+spmm time: 18.2995
+
+reference execution 
+
+kernel execution time:  694.171 ms
+taco reference time: 694.888
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 128, vals: 15512576
+D1_dimension: 121192, D2_dimension: 128, vals: 15512576
+E1_dimension: 121192, E2_dimension: 128, vals: 15512576
+
+
+kernel execution time:  52.5109 ms
+fused time: 56.6803
+
+separate execution
+
+kernel execution time:  41.9638 ms
+sddmm time: 42.5925
+
+kernel execution time:  21.3537 ms
+sddmm time: 21.9855
+
+kernel execution time:  25.1185 ms
+spmm time: 25.7047
+
+reference execution 
+
+kernel execution time:  323.01 ms
+taco reference time: 323.699
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 140874, D2_dimension: 128, vals: 18031872
+E1_dimension: 140874, E2_dimension: 128, vals: 18031872
+
+
+kernel execution time:  45.3128 ms
+fused time: 48.4929
+
+separate execution
+
+kernel execution time:  39.7986 ms
+sddmm time: 40.3901
+
+kernel execution time:  20.8296 ms
+sddmm time: 21.432
+
+kernel execution time:  25.0308 ms
+spmm time: 25.5726
+
+reference execution 
+
+kernel execution time:  867.794 ms
+taco reference time: 868.418
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 128, vals: 21887744
+D1_dimension: 170998, D2_dimension: 128, vals: 21887744
+E1_dimension: 170998, E2_dimension: 128, vals: 21887744
+
+
+kernel execution time:  34.2915 ms
+fused time: 38.221
+
+separate execution
+
+kernel execution time:  18.8777 ms
+sddmm time: 19.4859
+
+kernel execution time:  12.8794 ms
+sddmm time: 16.5695
+
+kernel execution time:  19.7876 ms
+spmm time: 23.5933
+
+reference execution 
+
+kernel execution time:  114.374 ms
+taco reference time: 115.03
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 128, vals: 10666752
+D1_dimension: 83334, D2_dimension: 128, vals: 10666752
+E1_dimension: 83334, E2_dimension: 128, vals: 10666752
+
+
+kernel execution time:  77.2194 ms
+fused time: 78.1408
+
+separate execution
+
+kernel execution time:  28.0545 ms
+sddmm time: 28.625
+
+kernel execution time:  15.7941 ms
+sddmm time: 16.3986
+
+kernel execution time:  18.1167 ms
+spmm time: 18.7055
+
+reference execution 
+
+kernel execution time:  652.088 ms
+taco reference time: 652.794
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 128, vals: 15512576
+D1_dimension: 121192, D2_dimension: 128, vals: 15512576
+E1_dimension: 121192, E2_dimension: 128, vals: 15512576
+
+
+kernel execution time:  100.999 ms
+fused time: 104.98
+
+separate execution
+
+kernel execution time:  42.4345 ms
+sddmm time: 43.0804
+
+kernel execution time:  21.5005 ms
+sddmm time: 22.1326
+
+kernel execution time:  25.1479 ms
+spmm time: 25.7284
+
+reference execution 
+
+kernel execution time:  303.541 ms
+taco reference time: 304.249
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 140874, D2_dimension: 128, vals: 18031872
+E1_dimension: 140874, E2_dimension: 128, vals: 18031872
+
+
+kernel execution time:  121.702 ms
+fused time: 122.44
+
+separate execution
+
+kernel execution time:  41.1645 ms
+sddmm time: 41.7679
+
+kernel execution time:  21.4454 ms
+sddmm time: 22.062
+
+kernel execution time:  25.7274 ms
+spmm time: 26.3069
+
+reference execution 
+
+kernel execution time:  838.679 ms
+taco reference time: 839.358
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 128, vals: 21887744
+D1_dimension: 170998, D2_dimension: 128, vals: 21887744
+E1_dimension: 170998, E2_dimension: 128, vals: 21887744
+
+
+kernel execution time:  49.6789 ms
+fused time: 53.8345
+
+separate execution
+
+kernel execution time:  19.3289 ms
+sddmm time: 19.9476
+
+kernel execution time:  12.9298 ms
+sddmm time: 16.5522
+
+kernel execution time:  19.7859 ms
+spmm time: 23.3756
+
+reference execution 
+
+kernel execution time:  114.935 ms
+taco reference time: 115.594
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 128, vals: 10666752
+D1_dimension: 83334, D2_dimension: 128, vals: 10666752
+E1_dimension: 83334, E2_dimension: 128, vals: 10666752
+
+
+kernel execution time:  29.3495 ms
+fused time: 32.2304
+
+separate execution
+
+kernel execution time:  23.942 ms
+sddmm time: 24.54
+
+kernel execution time:  14.4886 ms
+sddmm time: 16.5358
+
+kernel execution time:  16.8516 ms
+spmm time: 20.2626
+
+reference execution 
+
+kernel execution time:  709.96 ms
+taco reference time: 710.774
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 128, vals: 15512576
+D1_dimension: 121192, D2_dimension: 128, vals: 15512576
+E1_dimension: 121192, E2_dimension: 128, vals: 15512576
+
+
+kernel execution time:  58.2762 ms
+fused time: 62.5278
+
+separate execution
+
+kernel execution time:  42.1594 ms
+sddmm time: 42.7262
+
+kernel execution time:  22.1442 ms
+sddmm time: 23.0064
+
+kernel execution time:  25.7924 ms
+spmm time: 26.3623
+
+reference execution 
+
+kernel execution time:  329.572 ms
+taco reference time: 330.27
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 140874, D2_dimension: 128, vals: 18031872
+E1_dimension: 140874, E2_dimension: 128, vals: 18031872
+
+
+kernel execution time:  46.007 ms
+fused time: 50.2274
+
+separate execution
+
+kernel execution time:  41.4699 ms
+sddmm time: 42.0415
+
+kernel execution time:  21.559 ms
+sddmm time: 22.136
+
+kernel execution time:  25.525 ms
+spmm time: 26.0801
+
+reference execution 
+
+kernel execution time:  869.823 ms
+taco reference time: 873.823
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 128, vals: 21887744
+D1_dimension: 170998, D2_dimension: 128, vals: 21887744
+E1_dimension: 170998, E2_dimension: 128, vals: 21887744
+
+
+kernel execution time:  33.3907 ms
+fused time: 37.2851
+
+separate execution
+
+kernel execution time:  19.369 ms
+sddmm time: 19.9378
+
+kernel execution time:  12.956 ms
+sddmm time: 15.1889
+
+kernel execution time:  19.8054 ms
+spmm time: 23.5126
+
+reference execution 
+
+kernel execution time:  115.104 ms
+taco reference time: 115.684
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 206500, D2_dimension: 128, vals: 26432000
+E1_dimension: 206500, E2_dimension: 128, vals: 26432000
+
+
+kernel execution time:  45.2869 ms
+fused time: 49.074
+
+separate execution
+
+kernel execution time:  20.8037 ms
+sddmm time: 21.3769
+
+kernel execution time:  18.6117 ms
+sddmm time: 19.1765
+
+kernel execution time:  27.6368 ms
+spmm time: 28.2194
+
+reference execution 
+
+kernel execution time:  157.83 ms
+taco reference time: 158.458
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 128, vals: 128000640
+D1_dimension: 1000005, D2_dimension: 128, vals: 128000640
+E1_dimension: 1000005, E2_dimension: 128, vals: 128000640
+
+
+kernel execution time:  133.416 ms
+fused time: 137.603
+
+separate execution
+
+kernel execution time:  50.8463 ms
+sddmm time: 51.4255
+
+kernel execution time:  41.2442 ms
+sddmm time: 41.8788
+
+kernel execution time:  83.4032 ms
+spmm time: 84.052
+
+reference execution 
+
+kernel execution time:  569.216 ms
+taco reference time: 570.035
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 128, vals: 711465728
+D1_dimension: 5558326, D2_dimension: 128, vals: 711465728
+E1_dimension: 5558326, E2_dimension: 128, vals: 711465728
+
+
+kernel execution time:  1282.76 ms
+fused time: 1287.59
+
+separate execution
+
+kernel execution time:  606.985 ms
+sddmm time: 607.616
+
+kernel execution time:  561.224 ms
+sddmm time: 561.958
+
+kernel execution time:  874.527 ms
+spmm time: 875.232
+
+reference execution 
+
+kernel execution time:  21707 ms
+taco reference time: 21710.6
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 128, vals: 640
+D1_dimension: 5, D2_dimension: 128, vals: 640
+E1_dimension: 5, E2_dimension: 128, vals: 640
+
+
+kernel execution time:  3.43602 ms
+fused time: 27.8707
+
+separate execution
+
+kernel execution time:  4107.02 ms
+sddmm time: 4122.77
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 128, vals: 640
+D1_dimension: 5, D2_dimension: 128, vals: 640
+E1_dimension: 5, E2_dimension: 128, vals: 640
+
+
+kernel execution time:  0.115981 ms
+fused time: 0.499507
+
+separate execution
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 128, vals: 640
+D1_dimension: 5, D2_dimension: 128, vals: 640
+E1_dimension: 5, E2_dimension: 128, vals: 640
+
+
+kernel execution time:  0.133052 ms
+fused time: 3.69599
+
+separate execution
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2700, B2_dimension: 2700, vals: 5400
+C1_dimension: 2700, C2_dimension: 128, vals: 345600
+D1_dimension: 2700, D2_dimension: 128, vals: 345600
+E1_dimension: 2700, E2_dimension: 128, vals: 345600
+
+
+kernel execution time:  0.606469 ms
+fused time: 4.32552
+
+separate execution
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2700, B2_dimension: 2700, vals: 5400
+C1_dimension: 2700, C2_dimension: 128, vals: 345600
+D1_dimension: 2700, D2_dimension: 128, vals: 345600
+E1_dimension: 2700, E2_dimension: 128, vals: 345600
+
+
+kernel execution time:  0.650529 ms
+fused time: 1.40893
+
+separate execution
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5400
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.620999 ms
+fused time: 1.38301
+
+separate execution
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5400
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.652959 ms
+fused time: 3.94184
+
+separate execution
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.597158 ms
+fused time: 4.27836
+
+separate execution
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.659809 ms
+fused time: 4.6484
+
+separate execution
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.591018 ms
+fused time: 2.44084
+
+separate execution
+
+kernel execution time:  0.607388 ms
+sddmm time: 0.891202
+
+kernel execution time:  0.857981 ms
+sddmm time: 1.16087
+
+kernel execution time:  0.922992 ms
+spmm time: 1.60378
+
+reference execution 
+
+kernel execution time:  4.47191 ms
+taco reference time: 5.26226
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.658879 ms
+fused time: 4.15402
+
+separate execution
+
+kernel execution time:  0.70888 ms
+sddmm time: 1.21343
+
+kernel execution time:  0.531398 ms
+sddmm time: 1.30729
+
+kernel execution time:  0.965464 ms
+spmm time: 2.35378
+
+reference execution 
+
+kernel execution time:  3.48771 ms
+taco reference time: 7.55141
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.616739 ms
+fused time: 4.4146
+
+separate execution
+
+kernel execution time:  0.556318 ms
+sddmm time: 3.03196
+
+kernel execution time:  0.945623 ms
+sddmm time: 1.89019
+
+kernel execution time:  0.777471 ms
+spmm time: 3.57728
+
+reference execution 
+
+kernel execution time:  3.22827 ms
+taco reference time: 7.39799
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.65531 ms
+fused time: 4.08374
+
+separate execution
+
+kernel execution time:  0.666219 ms
+sddmm time: 1.20641
+
+kernel execution time:  0.941573 ms
+sddmm time: 1.73185
+
+kernel execution time:  1.01493 ms
+spmm time: 1.75608
+
+reference execution 
+
+kernel execution time:  5.25507 ms
+taco reference time: 6.04624
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.670959 ms
+fused time: 1.50328
+
+separate execution
+
+kernel execution time:  0.600268 ms
+sddmm time: 1.32833
+
+kernel execution time:  0.476237 ms
+sddmm time: 0.792151
+
+kernel execution time:  0.781091 ms
+spmm time: 1.10271
+
+reference execution 
+
+kernel execution time:  3.07623 ms
+taco reference time: 3.53829
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.760541 ms
+fused time: 1.49073
+
+separate execution
+
+kernel execution time:  0.639829 ms
+sddmm time: 1.21327
+
+kernel execution time:  0.576218 ms
+sddmm time: 1.14083
+
+kernel execution time:  0.829512 ms
+spmm time: 1.33624
+
+reference execution 
+
+kernel execution time:  4.14591 ms
+taco reference time: 4.82508
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.638949 ms
+fused time: 1.02277
+
+separate execution
+
+kernel execution time:  0.945034 ms
+sddmm time: 1.20456
+
+kernel execution time:  0.6772 ms
+sddmm time: 0.943263
+
+kernel execution time:  0.888033 ms
+spmm time: 1.133
+
+reference execution 
+
+kernel execution time:  3.82989 ms
+taco reference time: 4.18452
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.7361 ms
+fused time: 1.45315
+
+separate execution
+
+kernel execution time:  0.7335 ms
+sddmm time: 1.25184
+
+kernel execution time:  0.642509 ms
+sddmm time: 1.16064
+
+kernel execution time:  1.02361 ms
+spmm time: 1.48614
+
+reference execution 
+
+kernel execution time:  4.12035 ms
+taco reference time: 4.75857
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 334863, B2_dimension: 334863, vals: 777323
+C1_dimension: 334863, C2_dimension: 128, vals: 42862464
+D1_dimension: 334863, D2_dimension: 128, vals: 42862464
+E1_dimension: 334863, E2_dimension: 128, vals: 42862464
+
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 548551, B2_dimension: 548551, vals: 925872
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528
+E1_dimension: 548551, E2_dimension: 128, vals: 70214528
+
+
+kernel execution time:  66.4595 ms
+fused time: 66.9196
+
+separate execution
+
+kernel execution time:  22.9317 ms
+sddmm time: 23.4738
+
+kernel execution time:  22.4453 ms
+sddmm time: 23.0045
+
+kernel execution time:  44.2796 ms
+spmm time: 44.8052
+
+reference execution 
+
+kernel execution time:  187.6 ms
+taco reference time: 188.247
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528
+E1_dimension: 548551, E2_dimension: 128, vals: 70214528
+
+
+kernel execution time:  103.551 ms
+fused time: 104.018
+
+separate execution
+
+kernel execution time:  39.9535 ms
+sddmm time: 40.5639
+
+kernel execution time:  39.2683 ms
+sddmm time: 39.8581
+
+kernel execution time:  65.8336 ms
+spmm time: 66.417
+
+reference execution 
+
+kernel execution time:  306.901 ms
+taco reference time: 307.61
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528
+E1_dimension: 548551, E2_dimension: 128, vals: 70214528
+
+
+kernel execution time:  106.782 ms
+fused time: 107.261
+
+separate execution
+
+kernel execution time:  40.7961 ms
+sddmm time: 41.3604
+
+kernel execution time:  39.8676 ms
+sddmm time: 40.4959
+
+kernel execution time:  66.2656 ms
+spmm time: 66.8105
+
+reference execution 
+
+kernel execution time:  367.416 ms
+taco reference time: 368.086
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528
+E1_dimension: 548551, E2_dimension: 128, vals: 70214528
+
+
+kernel execution time:  108.809 ms
+fused time: 109.274
+
+separate execution
+
+kernel execution time:  42.2311 ms
+sddmm time: 42.826
+
+kernel execution time:  41.711 ms
+sddmm time: 42.3721
+
+kernel execution time:  65.9512 ms
+spmm time: 66.5647
+
+reference execution 
+
+kernel execution time:  360.581 ms
+taco reference time: 361.225
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528
+E1_dimension: 548551, E2_dimension: 128, vals: 70214528
+
+
+kernel execution time:  922.149 ms
+fused time: 922.605
+
+separate execution
+
+kernel execution time:  392.18 ms
+sddmm time: 392.716
+
+kernel execution time:  393.251 ms
+sddmm time: 393.777
+
+kernel execution time:  520.496 ms
+spmm time: 521.007
+
+reference execution 
+
+kernel execution time:  9912.29 ms
+taco reference time: 9913.37
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  2.15935 ms
+fused time: 2.88765
+
+separate execution
+
+kernel execution time:  1.09729 ms
+sddmm time: 1.64867
+
+kernel execution time:  0.987463 ms
+sddmm time: 1.50853
+
+kernel execution time:  2.22996 ms
+spmm time: 2.71273
+
+reference execution 
+
+kernel execution time:  29.4617 ms
+taco reference time: 29.8511
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.667108 ms
+fused time: 1.05163
+
+separate execution
+
+kernel execution time:  0.680159 ms
+sddmm time: 0.994963
+
+kernel execution time:  0.611478 ms
+sddmm time: 1.1057
+
+kernel execution time:  0.988313 ms
+spmm time: 1.4939
+
+reference execution 
+
+kernel execution time:  3.64386 ms
+taco reference time: 4.33446
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  0.691709 ms
+fused time: 1.07767
+
+separate execution
+
+kernel execution time:  0.516997 ms
+sddmm time: 0.77957
+
+kernel execution time:  0.458366 ms
+sddmm time: 0.73026
+
+kernel execution time:  0.777811 ms
+spmm time: 1.01678
+
+reference execution 
+
+kernel execution time:  3.47463 ms
+taco reference time: 3.82426
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528
+E1_dimension: 548551, E2_dimension: 128, vals: 70214528
+
+
+kernel execution time:  104.681 ms
+fused time: 105.128
+
+separate execution
+
+kernel execution time:  39.5478 ms
+sddmm time: 40.1164
+
+kernel execution time:  40.2068 ms
+sddmm time: 40.7802
+
+kernel execution time:  67.2769 ms
+spmm time: 67.8666
+
+reference execution 
+
+kernel execution time:  378.806 ms
+taco reference time: 379.526
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  2.0421 ms
+fused time: 2.77318
+
+separate execution
+
+kernel execution time:  0.890922 ms
+sddmm time: 1.4406
+
+kernel execution time:  0.673509 ms
+sddmm time: 0.955103
+
+kernel execution time:  1.93153 ms
+spmm time: 2.18341
+
+reference execution 
+
+kernel execution time:  33.2851 ms
+taco reference time: 33.6343
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528
+E1_dimension: 548551, E2_dimension: 128, vals: 70214528
+
+
+kernel execution time:  913.728 ms
+fused time: 914.178
+
+separate execution
+
+kernel execution time:  389.744 ms
+sddmm time: 390.317
+
+kernel execution time:  389.105 ms
+sddmm time: 389.68
+
+kernel execution time:  520.43 ms
+spmm time: 520.979
+
+reference execution 
+
+kernel execution time:  9970.19 ms
+taco reference time: 9971.18
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  1.81249 ms
+fused time: 2.53831
+
+separate execution
+
+kernel execution time:  1.41327 ms
+sddmm time: 1.9866
+
+kernel execution time:  0.687839 ms
+sddmm time: 0.957583
+
+kernel execution time:  1.99132 ms
+spmm time: 2.2301
+
+reference execution 
+
+kernel execution time:  33.8389 ms
+taco reference time: 34.1855
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  2.08639 ms
+fused time: 2.81403
+
+separate execution
+
+kernel execution time:  0.75901 ms
+sddmm time: 1.27309
+
+kernel execution time:  0.72208 ms
+sddmm time: 1.00494
+
+kernel execution time:  1.95748 ms
+spmm time: 2.20503
+
+reference execution 
+
+kernel execution time:  33.4827 ms
+taco reference time: 33.8347
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  2.09414 ms
+fused time: 2.82691
+
+separate execution
+
+kernel execution time:  1.03623 ms
+sddmm time: 1.58316
+
+kernel execution time:  0.653819 ms
+sddmm time: 0.926463
+
+kernel execution time:  1.88145 ms
+spmm time: 2.12517
+
+reference execution 
+
+kernel execution time:  33.3395 ms
+taco reference time: 33.6915
+
+sddmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 2708, D2_dimension: 128, vals: 346624
+E1_dimension: 2708, E2_dimension: 128, vals: 346624
+
+
+kernel execution time:  1.70968 ms
+fused time: 2.43176
+
+separate execution
+
+kernel execution time:  0.76455 ms
+sddmm time: 1.31209
+
+kernel execution time:  0.664099 ms
+sddmm time: 0.932353
+
+kernel execution time:  1.92536 ms
+spmm time: 2.17072
+
+reference execution 
+
+kernel execution time:  32.5601 ms
+taco reference time: 32.9017
diff --git a/test/stats/spmm-spmm.txt b/test/stats/spmm-spmm.txt
new file mode 100644
index 000000000..329aacd65
--- /dev/null
+++ b/test/stats/spmm-spmm.txt
@@ -0,0 +1,3604 @@
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 121192, vals: 2624331
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 121192, vals: 2624331
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 121192, vals: 2624331
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 64, D2_dimension: 64, vals: 4096
+
+
+kernel execution time:  303.084 ms
+fused time: 303.842
+
+kernel execution time:  8140.55 ms
+taco reference time: 8141.59
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 64, D2_dimension: 64, vals: 4096
+
+
+kernel execution time:  269.44 ms
+fused time: 270.181
+
+kernel execution time:  1612.62 ms
+taco reference time: 1613.21
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 121192, vals: 2624331
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 121192, vals: 2624331
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 121192, vals: 2624331
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 121192, vals: 2624331
+D1_dimension: 121192, D2_dimension: 64, vals: 7756288
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, D2_dimension: 64, vals: 320
+
+
+kernel execution time:  0.125431 ms
+fused time: 0.815671
+
+kernel execution time:  0.03254 ms
+taco reference time: 0.828291
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 10974, vals: 428650
+D1_dimension: 10974, D2_dimension: 64, vals: 702336
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 10974, vals: 428650
+D1_dimension: 10974, D2_dimension: 8, vals: 87792
+
+
+kernel execution time:  783.639 ms
+fused time: 784.413
+
+kernel execution time:  25.6025 ms
+taco reference time: 25.9422
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 10974, vals: 428650
+D1_dimension: 10974, D2_dimension: 8, vals: 87792
+
+
+kernel execution time:  3538.49 ms
+fused time: 3539.6
+
+kernel execution time:  544.057 ms
+taco reference time: 544.496
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 10974, vals: 428650
+D1_dimension: 10974, D2_dimension: 8, vals: 87792
+
+
+kernel execution time:  3451.46 ms
+fused time: 3452.59
+
+kernel execution time:  540.889 ms
+taco reference time: 541.34
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 8, vals: 87792
+D1_dimension: 8, D2_dimension: 8, vals: 64
+
+
+kernel execution time:  23.9997 ms
+fused time: 24.715
+
+kernel execution time:  116.717 ms
+taco reference time: 117.038
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 8, vals: 87792
+D1_dimension: 8, D2_dimension: 8, vals: 64
+
+
+kernel execution time:  2.19466 ms
+fused time: 2.91615
+
+kernel execution time:  9.4728 ms
+taco reference time: 10.0292
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 8, vals: 969536
+D1_dimension: 8, D2_dimension: 8, vals: 64
+
+
+kernel execution time:  30.5327 ms
+fused time: 31.2749
+
+kernel execution time:  35.9838 ms
+taco reference time: 36.52
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 64, D2_dimension: 64, vals: 4096
+
+
+kernel execution time:  1803.51 ms
+fused time: 1804.27
+
+kernel execution time:  1976.12 ms
+taco reference time: 1976.69
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  484.907 ms
+fused time: 485.835
+
+kernel execution time:  1567.31 ms
+taco reference time: 1567.89
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  2301.83 ms
+fused time: 2302.58
+
+kernel execution time:  3904.01 ms
+taco reference time: 3905
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  11.7415 ms
+fused time: 12.4648
+
+kernel execution time:  155.192 ms
+taco reference time: 155.893
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 64, vals: 702336
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  6.56465 ms
+fused time: 7.31046
+
+kernel execution time:  1.17042 ms
+sddmm time: 1.68226
+
+kernel execution time:  5.08948 ms
+spmm time: 5.36855
+
+kernel execution time:  124.176 ms
+taco reference time: 124.551
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 64, vals: 2330688
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  25.3076 ms
+fused time: 25.7407
+
+kernel execution time:  14.1922 ms
+sddmm time: 14.7097
+
+kernel execution time:  16.8223 ms
+spmm time: 17.3081
+
+kernel execution time:  1299.07 ms
+taco reference time: 1299.47
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 64, vals: 2997440
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  27.1044 ms
+fused time: 27.5788
+
+kernel execution time:  9.05436 ms
+sddmm time: 9.61561
+
+kernel execution time:  21.401 ms
+spmm time: 21.9403
+
+kernel execution time:  695.617 ms
+taco reference time: 696.166
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 64, vals: 3996864
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  33.1726 ms
+fused time: 33.5921
+
+kernel execution time:  14.8585 ms
+sddmm time: 15.3574
+
+kernel execution time:  28.8622 ms
+spmm time: 29.3477
+
+kernel execution time:  1179.24 ms
+taco reference time: 1179.66
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 64, vals: 5333376
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  50.933 ms
+fused time: 51.3664
+
+kernel execution time:  22.1051 ms
+sddmm time: 22.6231
+
+kernel execution time:  37.9487 ms
+spmm time: 38.4594
+
+kernel execution time:  1793.69 ms
+taco reference time: 1794.18
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 64, vals: 7756288
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  77.6403 ms
+fused time: 78.0713
+
+kernel execution time:  19.9996 ms
+sddmm time: 20.5235
+
+kernel execution time:  55.1072 ms
+spmm time: 55.6382
+
+kernel execution time:  757.71 ms
+taco reference time: 758.251
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 64, vals: 9015936
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  74.448 ms
+fused time: 74.8977
+
+kernel execution time:  28.5447 ms
+sddmm time: 29.0628
+
+kernel execution time:  64.5939 ms
+spmm time: 65.3752
+
+kernel execution time:  2277.84 ms
+taco reference time: 2278.26
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 64, vals: 10943872
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  103.993 ms
+fused time: 104.417
+
+kernel execution time:  13.9953 ms
+sddmm time: 14.4722
+
+kernel execution time:  77.1505 ms
+spmm time: 77.6507
+
+kernel execution time:  277.888 ms
+taco reference time: 278.424
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 64, vals: 13216000
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  122.094 ms
+fused time: 122.526
+
+kernel execution time:  16.3934 ms
+sddmm time: 16.9174
+
+kernel execution time:  93.4293 ms
+spmm time: 93.9709
+
+kernel execution time:  368.185 ms
+taco reference time: 368.744
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  594.481 ms
+fused time: 594.903
+
+kernel execution time:  68.7062 ms
+sddmm time: 69.19
+
+kernel execution time:  456.966 ms
+spmm time: 457.476
+
+kernel execution time:  939.672 ms
+taco reference time: 940.234
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+kernel execution time:  3572.47 ms
+fused time: 3573.32
+
+kernel execution time:  1088.24 ms
+sddmm time: 1088.74
+
+kernel execution time:  2533.08 ms
+spmm time: 2533.64
+
+kernel execution time:  19935.1 ms
+taco reference time: 19936.1
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  28.4416 ms
+fused time: 28.8482
+
+kernel execution time:  58.9151 ms
+sddmm time: 59.3822
+
+kernel execution time:  85.1524 ms
+spmm time: 85.6136
+
+kernel execution time:  3443.24 ms
+taco reference time: 3444.27
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  28.4398 ms
+fused time: 28.9133
+
+kernel execution time:  59.5781 ms
+SpMM time: 60.0552
+
+kernel execution time:  85.038 ms
+GeMM time: 85.49
+
+kernel execution time:  83.589 ms
+Optimized GeMM time: 83.939
+
+kernel execution time:  3425.66 ms
+taco reference time: 3426.56
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  28.1949 ms
+fused time: 28.6047
+
+kernel execution time:  58.8056 ms
+SpMM time: 59.2739
+
+kernel execution time:  85.098 ms
+GeMM time: 85.5677
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  34.4562 ms
+fused time: 35.1247
+
+kernel execution time:  57.8421 ms
+SpMM time: 58.3206
+
+kernel execution time:  84.8243 ms
+GeMM time: 85.2948
+
+kernel execution time:  84.2094 ms
+Optimized GeMM template time: 84.5715
+
+kernel execution time:  3423.26 ms
+taco reference time: 3424.18
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  34.1982 ms
+fused time: 34.9007
+
+kernel execution time:  58.2208 ms
+SpMM time: 58.708
+
+kernel execution time:  85.2639 ms
+GeMM time: 85.7329
+
+kernel execution time:  84.6708 ms
+Optimized GeMM template time: 85.0447
+
+kernel execution time:  3448.38 ms
+taco reference time: 3449.25
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  3.98391 ms
+fused time: 4.78728
+
+kernel execution time:  3.85974 ms
+SpMM time: 4.41484
+
+kernel execution time:  5.20996 ms
+GeMM time: 5.78292
+
+kernel execution time:  85.5005 ms
+Optimized GeMM template time: 85.8224
+
+kernel execution time:  68.5977 ms
+taco reference time: 69.0953
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  35.477 ms
+fused time: 36.1715
+
+kernel execution time:  57.2092 ms
+SpMM time: 57.6862
+
+kernel execution time:  84.9251 ms
+GeMM time: 85.3862
+
+kernel execution time:  84.8529 ms
+Optimized GeMM template time: 85.2333
+
+kernel execution time:  3425.71 ms
+taco reference time: 3426.59
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  35.2755 ms
+fused time: 35.9965
+
+kernel execution time:  57.3952 ms
+SpMM time: 57.8851
+
+kernel execution time:  85.2686 ms
+GeMM time: 85.7356
+
+kernel execution time:  84.5744 ms
+Optimized GeMM template time: 84.9512
+
+kernel execution time:  3429.7 ms
+taco reference time: 3430.52
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  3.98364 ms
+fused time: 4.61817
+
+kernel execution time:  3.85737 ms
+SpMM time: 4.28322
+
+kernel execution time:  5.15902 ms
+GeMM time: 5.6055
+
+kernel execution time:  87.1601 ms
+Optimized GeMM template time: 87.4622
+
+kernel execution time:  69.0316 ms
+taco reference time: 69.4576
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  4.62195 ms
+fused time: 5.02884
+
+kernel execution time:  4.03094 ms
+SpMM time: 4.41592
+
+kernel execution time:  5.10184 ms
+GeMM time: 5.44766
+
+kernel execution time:  83.6233 ms
+Optimized GeMM template time: 83.895
+
+kernel execution time:  5.3188 ms
+Optimized GeMM template time: 5.65673
+
+kernel execution time:  69.2656 ms
+taco reference time: 69.6404
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  4.03732 ms
+fused time: 4.69314
+
+kernel execution time:  3.72378 ms
+SpMM time: 4.02627
+
+kernel execution time:  2.04995 ms
+GeMM time: 2.33804
+
+kernel execution time:  2.25997 ms
+Optimized GeMM template time: 2.50901
+
+kernel execution time:  5.18509 ms
+Optimized GeMM template time: 5.46269
+
+kernel execution time:  68.4415 ms
+taco reference time: 68.78
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  3.95981 ms
+fused time: 4.3754
+
+kernel execution time:  3.78475 ms
+SpMM time: 4.19686
+
+kernel execution time:  2.00709 ms
+GeMM time: 2.38028
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  4.05057 ms
+fused time: 4.40773
+
+kernel execution time:  3.75306 ms
+SpMM time: 4.08598
+
+kernel execution time:  2.05899 ms
+GeMM time: 2.36596
+
+kernel execution time:  2.12928 ms
+Optimized GeMM template time: 2.36493
+
+kernel execution time:  5.14712 ms
+Optimized GeMM template time: 5.41248
+
+kernel execution time:  68.075 ms
+taco reference time: 68.3835
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  3.88934 ms
+fused time: 4.25328
+
+kernel execution time:  3.82407 ms
+SpMM time: 4.19446
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  4.28741 ms
+fused time: 4.98944
+
+kernel execution time:  3.79765 ms
+SpMM time: 4.16417
+
+kernel execution time:  1.4265 ms
+SpMM template time: 1.74127
+
+kernel execution time:  2.10898 ms
+GeMM time: 2.39285
+
+kernel execution time:  2.34628 ms
+Optimized GeMM template time: 2.61728
+
+kernel execution time:  5.31869 ms
+Optimized GeMM template time: 5.60267
+
+kernel execution time:  69.5098 ms
+taco reference time: 69.8708
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  4.01588 ms
+fused time: 4.65051
+
+kernel execution time:  3.86258 ms
+SpMM time: 4.2125
+
+kernel execution time:  1.43425 ms
+SpMM template time: 1.72825
+
+kernel execution time:  2.09177 ms
+GeMM time: 2.35741
+
+kernel execution time:  2.03779 ms
+GeMM time: 2.26668
+
+kernel execution time:  2.18152 ms
+Optimized GeMM template time: 2.45788
+
+kernel execution time:  0.974804 ms
+Optimized GeMM template time: 1.25462
+
+kernel execution time:  67.9024 ms
+taco reference time: 68.2452
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  4.0224 ms
+fused time: 4.44033
+
+kernel execution time:  3.84077 ms
+SpMM time: 4.2196
+
+kernel execution time:  1.57684 ms
+SpMM template time: 1.93604
+
+kernel execution time:  2.00289 ms
+GeMM time: 2.38135
+
+kernel execution time:  1.93219 ms
+ref 2 GeMM time: 2.16952
+
+kernel execution time:  1.9562 ms
+ref3 GeMM template time: 2.22014
+
+kernel execution time:  1.02843 ms
+SpMM template time: 1.3134
+
+kernel execution time:  68.6937 ms
+taco reference time: 69.0531
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  4.70723 ms
+fused time: 5.10663
+
+kernel execution time:  3.86475 ms
+SpMM time: 4.22896
+
+kernel execution time:  1.5696 ms
+SpMM template time: 1.91027
+
+kernel execution time:  2.06463 ms
+GeMM time: 2.35063
+
+kernel execution time:  1.93837 ms
+ref 2 GeMM time: 2.18475
+
+kernel execution time:  1.93808 ms
+ref3 GeMM template time: 2.21134
+
+kernel execution time:  1.00393 ms
+SpMM template time: 1.28759
+
+kernel execution time:  65.6539 ms
+taco reference time: 66.0123
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  4.41073 ms
+fused time: 4.81175
+
+kernel execution time:  3.96438 ms
+SpMM time: 4.33792
+
+kernel execution time:  1.48077 ms
+SpMM template time: 1.84634
+
+kernel execution time:  2.06276 ms
+GeMM time: 2.52122
+
+kernel execution time:  2.4643 ms
+ref 2 GeMM template time: 3.77443
+
+kernel execution time:  2.21292 ms
+ref3 GeMM template time: 2.48374
+
+kernel execution time:  1.02386 ms
+SpMM template time ref4: 5.63941
+
+kernel execution time:  73.0137 ms
+taco reference time: 73.4188
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  2.81946 ms
+fused time: 3.44515
+
+kernel execution time:  3.93379 ms
+SpMM time: 4.19505
+
+kernel execution time:  1.46537 ms
+SpMM template time: 1.77106
+
+kernel execution time:  2.48839 ms
+GeMM time: 2.75159
+
+kernel execution time:  2.57119 ms
+ref 2 GeMM template time: 2.83288
+
+kernel execution time:  2.19579 ms
+ref3 GeMM template time: 2.44668
+
+kernel execution time:  1.08977 ms
+SpMM template time ref4: 1.3527
+
+kernel execution time:  72.5212 ms
+taco reference time: 72.8405
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  2.34088 ms
+fused time: 2.99398
+
+kernel execution time:  3.80606 ms
+SpMM time: 4.36154
+
+kernel execution time:  1.58906 ms
+SpMM template time: 1.95568
+
+kernel execution time:  2.25455 ms
+GeMM time: 2.5356
+
+kernel execution time:  2.3975 ms
+ref 2 GeMM template time: 2.66963
+
+kernel execution time:  2.10202 ms
+ref3 GeMM template time: 2.40392
+
+kernel execution time:  1.02333 ms
+SpMM template time ref4: 1.30975
+
+kernel execution time:  72.6994 ms
+taco reference time: 73.0145
+
+
+
+
+
+
+---------------------------------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------------------------------
+
+
+with 64 threads
+
+
+
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  2.36795 ms
+fused time: 2.78304
+
+kernel execution time:  3.8721 ms
+SpMM time: 4.20057
+
+kernel execution time:  1.52637 ms
+SpMM template time: 1.85784
+
+kernel execution time:  2.03318 ms
+GeMM time: 2.31935
+
+kernel execution time:  2.39998 ms
+ref 2 GeMM template time: 2.68836
+
+kernel execution time:  1.94819 ms
+ref3 GeMM template time: 2.2353
+
+kernel execution time:  1.06049 ms
+SpMM template time ref4: 1.35755
+
+kernel execution time:  68.6851 ms
+taco reference time: 69.0188
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 128, vals: 4661376
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  8.41302 ms
+fused time: 8.85733
+
+kernel execution time:  17.639 ms
+SpMM time: 18.2378
+
+kernel execution time:  7.98654 ms
+SpMM template time: 8.57087
+
+kernel execution time:  6.34574 ms
+GeMM time: 6.8938
+
+kernel execution time:  6.10335 ms
+ref 2 GeMM template time: 6.39173
+
+kernel execution time:  5.82956 ms
+ref3 GeMM template time: 6.11877
+
+kernel execution time:  4.70653 ms
+SpMM template time ref4: 5.04278
+
+kernel execution time:  671.833 ms
+taco reference time: 672.353
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 128, vals: 5994880
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  7.27388 ms
+fused time: 7.73945
+
+kernel execution time:  17.7256 ms
+SpMM time: 18.3199
+
+kernel execution time:  7.35832 ms
+SpMM template time: 7.9109
+
+kernel execution time:  8.33036 ms
+GeMM time: 8.86966
+
+kernel execution time:  7.86963 ms
+ref 2 GeMM template time: 8.15124
+
+kernel execution time:  7.7866 ms
+ref3 GeMM template time: 8.07407
+
+kernel execution time:  4.49305 ms
+SpMM template time ref4: 4.80781
+
+kernel execution time:  398.926 ms
+taco reference time: 399.478
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 128, vals: 7993728
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  11.3443 ms
+fused time: 11.8147
+
+kernel execution time:  22.2928 ms
+SpMM time: 22.924
+
+kernel execution time:  12.4461 ms
+SpMM template time: 13.0043
+
+kernel execution time:  10.9317 ms
+GeMM time: 11.5006
+
+kernel execution time:  10.7585 ms
+ref 2 GeMM template time: 11.0658
+
+kernel execution time:  11.0196 ms
+ref3 GeMM template time: 11.3149
+
+kernel execution time:  6.90358 ms
+SpMM template time ref4: 7.24984
+
+kernel execution time:  657.038 ms
+taco reference time: 657.641
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 128, vals: 10666752
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  15.2657 ms
+fused time: 15.7013
+
+kernel execution time:  31.6235 ms
+SpMM time: 32.1905
+
+kernel execution time:  16.8006 ms
+SpMM template time: 17.332
+
+kernel execution time:  14.3795 ms
+GeMM time: 14.9199
+
+kernel execution time:  14.4997 ms
+ref 2 GeMM template time: 14.8349
+
+kernel execution time:  14.0983 ms
+ref3 GeMM template time: 14.393
+
+kernel execution time:  9.33791 ms
+SpMM template time ref4: 9.73698
+
+kernel execution time:  903.295 ms
+taco reference time: 903.924
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 128, vals: 15512576
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  27.1267 ms
+fused time: 27.6407
+
+kernel execution time:  52.874 ms
+SpMM time: 53.49
+
+kernel execution time:  25.9708 ms
+SpMM template time: 26.5475
+
+kernel execution time:  20.1295 ms
+GeMM time: 20.7267
+
+kernel execution time:  21.2549 ms
+ref 2 GeMM template time: 21.7256
+
+kernel execution time:  20.7262 ms
+ref3 GeMM template time: 21.1848
+
+kernel execution time:  12.5379 ms
+SpMM template time ref4: 13.0829
+
+kernel execution time:  405.376 ms
+taco reference time: 406.043
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  22.7136 ms
+fused time: 23.1625
+
+kernel execution time:  49.1418 ms
+SpMM time: 49.7343
+
+kernel execution time:  25.0936 ms
+SpMM template time: 25.604
+
+kernel execution time:  23.6444 ms
+GeMM time: 24.1812
+
+kernel execution time:  24.348 ms
+ref 2 GeMM template time: 24.6837
+
+kernel execution time:  23.9836 ms
+ref3 GeMM template time: 24.2972
+
+kernel execution time:  14.4884 ms
+SpMM template time ref4: 14.8698
+
+kernel execution time:  1154.44 ms
+taco reference time: 1155.04
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 128, vals: 21887744
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  18.015 ms
+fused time: 18.4775
+
+kernel execution time:  56.1907 ms
+SpMM time: 56.8126
+
+kernel execution time:  20.0375 ms
+SpMM template time: 20.5913
+
+kernel execution time:  28.1716 ms
+GeMM time: 28.7647
+
+kernel execution time:  30.484 ms
+ref 2 GeMM template time: 30.9681
+
+kernel execution time:  30.0422 ms
+ref3 GeMM template time: 30.5496
+
+kernel execution time:  10.8925 ms
+SpMM template time ref4: 11.4401
+
+kernel execution time:  162.277 ms
+taco reference time: 162.908
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  23.8637 ms
+fused time: 24.4029
+
+kernel execution time:  69.8832 ms
+SpMM time: 70.504
+
+kernel execution time:  26.8086 ms
+SpMM template time: 27.6336
+
+kernel execution time:  34.2049 ms
+GeMM time: 34.8056
+
+kernel execution time:  34.6783 ms
+ref 2 GeMM template time: 35.183
+
+kernel execution time:  33.8854 ms
+ref3 GeMM template time: 34.3954
+
+kernel execution time:  13.9069 ms
+SpMM template time ref4: 14.4251
+
+kernel execution time:  189.271 ms
+taco reference time: 189.95
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 128, vals: 128000640
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  66.2912 ms
+fused time: 66.8207
+
+kernel execution time:  335.04 ms
+SpMM time: 335.699
+
+kernel execution time:  83.9137 ms
+SpMM template time: 84.5618
+
+kernel execution time:  157.411 ms
+GeMM time: 158.061
+
+kernel execution time:  169.35 ms
+ref 2 GeMM template time: 169.938
+
+kernel execution time:  168.201 ms
+ref3 GeMM template time: 168.762
+
+kernel execution time:  44.531 ms
+SpMM template time ref4: 45.176
+
+kernel execution time:  458.322 ms
+taco reference time: 458.992
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 128, vals: 711465728
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  629.911 ms
+fused time: 630.89
+
+kernel execution time:  2385.92 ms
+SpMM time: 2386.45
+
+kernel execution time:  904.117 ms
+SpMM template time: 904.66
+
+kernel execution time:  867.356 ms
+GeMM time: 867.943
+
+kernel execution time:  946.344 ms
+ref 2 GeMM template time: 946.912
+
+kernel execution time:  951.944 ms
+ref3 GeMM template time: 952.496
+
+kernel execution time:  464.289 ms
+SpMM template time ref4: 464.847
+
+kernel execution time:  19646 ms
+taco reference time: 19647.2
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 128, vals: 128000640
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  65.749 ms
+fused time: 66.2393
+
+kernel execution time:  334.436 ms
+SpMM time: 335.114
+
+kernel execution time:  85.6378 ms
+SpMM template time: 86.2216
+
+kernel execution time:  156.716 ms
+GeMM time: 157.281
+
+kernel execution time:  169.383 ms
+ref 2 GeMM template time: 169.948
+
+kernel execution time:  168.128 ms
+ref3 GeMM template time: 168.722
+
+kernel execution time:  44.3902 ms
+SpMM template time ref4: 44.9859
+
+kernel execution time:  462.089 ms
+taco reference time: 462.747
+
+kernel execution time:  472.176 ms
+taco reference new time: 472.868
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  22.9203 ms
+fused time: 23.382
+
+kernel execution time:  69.0678 ms
+SpMM time: 69.6771
+
+kernel execution time:  25.7576 ms
+SpMM template time: 26.2883
+
+kernel execution time:  33.838 ms
+GeMM time: 34.3893
+
+kernel execution time:  36.2223 ms
+ref 2 GeMM template time: 36.7099
+
+kernel execution time:  35.9919 ms
+ref3 GeMM template time: 36.5181
+
+kernel execution time:  13.5094 ms
+SpMM template time ref4: 14.0411
+
+kernel execution time:  209.225 ms
+taco reference time: 209.806
+
+kernel execution time:  195.258 ms
+taco reference new time: 195.862
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  23.9941 ms
+fused time: 24.5306
+
+kernel execution time:  70.3118 ms
+SpMM time: 70.9711
+
+kernel execution time:  26.7754 ms
+SpMM template time: 27.3965
+
+kernel execution time:  34.3488 ms
+GeMM time: 34.9449
+
+kernel execution time:  34.9754 ms
+ref 2 GeMM template time: 35.5492
+
+kernel execution time:  34.4524 ms
+ref3 GeMM template time: 35.0358
+
+kernel execution time:  13.8295 ms
+SpMM template time ref4: 14.4023
+
+kernel execution time:  195.316 ms
+taco reference time: 195.985
+
+kernel execution time:  194.321 ms
+taco reference new time: 194.959
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  499.31 ms
+fused time: 500.253
+
+kernel execution time:  1127.92 ms
+SpMM time: 1128.46
+
+kernel execution time:  314.563 ms
+SpMM template time: 315.094
+
+kernel execution time:  1071.42 ms
+GeMM time: 1071.96
+
+kernel execution time:  772.255 ms
+ref 2 GeMM template time: 772.765
+
+kernel execution time:  768.478 ms
+ref3 GeMM template time: 768.998
+
+kernel execution time:  162.934 ms
+SpMM template time ref4: 163.456
+
+kernel execution time:  51182.8 ms
+taco reference time: 51183.7
+
+kernel execution time:  62360.6 ms
+taco reference new time: 62361.5
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 128, vals: 21887744
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  343.987 ms
+fused time: 344.403
+
+kernel execution time:  127.278 ms
+SpMM time: 127.803
+
+kernel execution time:  139.755 ms
+SpMM template time: 140.297
+
+kernel execution time:  1308.19 ms
+GeMM time: 1308.77
+
+kernel execution time:  930.985 ms
+ref 2 GeMM template time: 931.498
+
+kernel execution time:  924.636 ms
+ref3 GeMM template time: 925.164
+
+kernel execution time:  83.9238 ms
+SpMM template time ref4: 84.4508
+
+kernel execution time:  6298.13 ms
+taco reference time: 6299.21
+
+kernel execution time:  7357.04 ms
+taco reference new time: 7358.09
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  404.825 ms
+fused time: 405.271
+
+kernel execution time:  142.933 ms
+SpMM time: 143.48
+
+kernel execution time:  155.193 ms
+SpMM template time: 155.761
+
+kernel execution time:  1572.88 ms
+GeMM time: 1573.41
+
+kernel execution time:  1132.63 ms
+ref 2 GeMM template time: 1133.13
+
+kernel execution time:  1126.54 ms
+ref3 GeMM template time: 1127.06
+
+kernel execution time:  96.7404 ms
+SpMM template time ref4: 97.2437
+
+kernel execution time:  8321.2 ms
+taco reference time: 8322.27
+
+kernel execution time:  9774.76 ms
+taco reference new time: 9775.82
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 128, vals: 128000640
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  1796.56 ms
+fused time: 1797.34
+
+kernel execution time:  429.623 ms
+SpMM time: 430.127
+
+kernel execution time:  406.352 ms
+SpMM template time: 406.855
+
+kernel execution time:  7603.48 ms
+GeMM time: 7604.4
+
+kernel execution time:  5458.44 ms
+ref 2 GeMM template time: 5459.36
+
+kernel execution time:  5413.18 ms
+ref3 GeMM template time: 5414.05
+
+kernel execution time:  266.783 ms
+SpMM template time ref4: 267.276
+
+kernel execution time:  20481.5 ms
+taco reference time: 20482.6
+
+kernel execution time:  23942.3 ms
+taco reference new time: 23943.8
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 128, vals: 711465728
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  11983.1 ms
+fused time: 11984.1
+
+kernel execution time:  14647.3 ms
+SpMM time: 14648.4
+
+kernel execution time:  5779.35 ms
+SpMM template time: 5780.3
+
+kernel execution time:  42156 ms
+GeMM time: 42156.9
+
+kernel execution time:  30315.6 ms
+ref 2 GeMM template time: 30316.6
+
+kernel execution time:  30070.9 ms
+ref3 GeMM template time: 30071.9
+
+kernel execution time:  3196.34 ms
+SpMM template time ref4: 3197.36
+
+kernel execution time:  387963 ms
+taco reference time: 387964
+
+kernel execution time:  481094 ms
+taco reference new time: 481095
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  32.8596 ms
+fused time: 33.2745
+
+kernel execution time:  57.4073 ms
+SpMM time: 57.9242
+
+kernel execution time:  18.9092 ms
+SpMM template time: 19.4238
+
+kernel execution time:  84.8547 ms
+GeMM time: 85.3549
+
+kernel execution time:  60.5468 ms
+ref 2 GeMM template time: 60.9429
+
+kernel execution time:  60.3303 ms
+ref3 GeMM template time: 60.7269
+
+kernel execution time:  9.95693 ms
+SpMM template time ref4: 10.3864
+
+kernel execution time:  2808.32 ms
+taco reference time: 2808.79
+
+kernel execution time:  3456.32 ms
+taco reference new time: 3457.29
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 128, vals: 4661376
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  203.078 ms
+fused time: 203.513
+
+kernel execution time:  594.431 ms
+SpMM time: 594.968
+
+kernel execution time:  135.247 ms
+SpMM template time: 135.774
+
+kernel execution time:  277.557 ms
+GeMM time: 278.077
+
+kernel execution time:  201.246 ms
+ref 2 GeMM template time: 201.741
+
+kernel execution time:  200.173 ms
+ref3 GeMM template time: 200.697
+
+kernel execution time:  67.3815 ms
+SpMM template time ref4: 67.9079
+
+kernel execution time:  28413.2 ms
+taco reference time: 28414.2
+
+kernel execution time:  34685.2 ms
+taco reference new time: 34687
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 128, vals: 5994880
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  156.103 ms
+fused time: 156.534
+
+kernel execution time:  313.946 ms
+SpMM time: 314.545
+
+kernel execution time:  95.9908 ms
+SpMM template time: 96.5235
+
+kernel execution time:  355.516 ms
+GeMM time: 356.043
+
+kernel execution time:  257.486 ms
+ref 2 GeMM template time: 258
+
+kernel execution time:  255.966 ms
+ref3 GeMM template time: 256.498
+
+kernel execution time:  50.7943 ms
+SpMM template time ref4: 51.3121
+
+kernel execution time:  15474.9 ms
+taco reference time: 15476
+
+kernel execution time:  19054.1 ms
+taco reference new time: 19055.3
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 128, vals: 7993728
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  233.01 ms
+fused time: 233.435
+
+kernel execution time:  583.856 ms
+SpMM time: 584.39
+
+kernel execution time:  148.111 ms
+SpMM template time: 148.649
+
+kernel execution time:  474.209 ms
+GeMM time: 474.735
+
+kernel execution time:  343.934 ms
+ref 2 GeMM template time: 344.44
+
+kernel execution time:  342.778 ms
+ref3 GeMM template time: 343.3
+
+kernel execution time:  74.5241 ms
+SpMM template time ref4: 75.0386
+
+kernel execution time:  26129.8 ms
+taco reference time: 26130.9
+
+kernel execution time:  32058.9 ms
+taco reference new time: 32059.8
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 128, vals: 10666752
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  332.296 ms
+fused time: 332.73
+
+kernel execution time:  871.053 ms
+SpMM time: 871.586
+
+kernel execution time:  217.386 ms
+SpMM template time: 217.911
+
+kernel execution time:  636.82 ms
+GeMM time: 637.357
+
+kernel execution time:  461.8 ms
+ref 2 GeMM template time: 462.325
+
+kernel execution time:  458.184 ms
+ref3 GeMM template time: 458.738
+
+kernel execution time:  114.816 ms
+SpMM template time ref4: 115.341
+
+kernel execution time:  39240.9 ms
+taco reference time: 39242
+
+kernel execution time:  48108.4 ms
+taco reference new time: 48109.4
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 128, vals: 15512576
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  351.775 ms
+fused time: 352.201
+
+kernel execution time:  317.447 ms
+SpMM time: 317.983
+
+kernel execution time:  217.205 ms
+SpMM template time: 217.733
+
+kernel execution time:  921.754 ms
+GeMM time: 922.288
+
+kernel execution time:  667.69 ms
+ref 2 GeMM template time: 668.21
+
+kernel execution time:  655.357 ms
+ref3 GeMM template time: 655.888
+
+kernel execution time:  118.018 ms
+SpMM template time ref4: 118.546
+
+kernel execution time:  17243.9 ms
+taco reference time: 17245
+
+kernel execution time:  21353.4 ms
+taco reference new time: 21354.7
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 128, vals: 15512576
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  27502 ms
+fused time: 27581.4
+
+kernel execution time:  19193.1 ms
+SpMM time: 19304.1
+
+kernel execution time:  8528.83 ms
+SpMM template time: 8571.46
+
+kernel execution time:  33685.2 ms
+GeMM time: 33768.7
+
+kernel execution time:  32503 ms
+ref 2 GeMM template time: 32589.2
+
+kernel execution time:  32859.6 ms
+ref3 GeMM template time: 32952.9
+
+kernel execution time:  4862.19 ms
+SpMM template time ref4: 4917.41
+
+kernel execution time:  891084 ms
+taco reference time: 891170
+
+
+
+
+
+
+----------------------------------------------------------------------------------------------------
+----------------------------------------------------------------------------------------------------
+----------------------------------------------------------------------------------------------------
+
+
+
+
+spmm-spmm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  2.69255 ms
+fused time: 5.71229
+
+kernel execution time:  3.93158 ms
+SpMM time: 4.42244
+
+1st pattern computation
+
+kernel execution time:  1.69479 ms
+SpMM template time: 2.18137
+
+kernel execution time:  2.53215 ms
+GeMM time: 2.92698
+
+kernel execution time:  82.7455 ms
+ref 2 GeMM template time: 83.6829
+
+2nd pattern computation
+
+kernel execution time:  2.52512 ms
+ref3 GeMM template time: 2.90403
+
+kernel execution time:  1.07835 ms
+SpMM template time ref4: 1.34312
+
+reference pattern computation
+
+kernel execution time:  66.8405 ms
+taco reference time: 67.1485
+
+kernel execution time:  71.5847 ms
+taco reference new time: 71.9261
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 2
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  2.77205 ms
+fused time: 6.22498
+
+kernel execution time:  3.70735 ms
+SpMM time: 4.15143
+
+1st pattern computation
+
+kernel execution time:  1.68777 ms
+SpMM template time: 2.37238
+
+kernel execution time:  2.64104 ms
+GeMM time: 5.76589
+
+kernel execution time:  81.9899 ms
+ref 2 GeMM template time: 82.2704
+
+2nd pattern computation
+
+kernel execution time:  2.45488 ms
+ref3 GeMM template time: 2.8586
+
+kernel execution time:  1.12289 ms
+SpMM template time ref4: 1.39155
+
+reference pattern computation
+
+kernel execution time:  76.3877 ms
+taco reference time: 78.7939
+
+kernel execution time:  72.755 ms
+taco reference new time: 73.1269
+filenum: 3
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 128, vals: 4661376
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  7.80932 ms
+fused time: 11.2518
+
+kernel execution time:  16.5944 ms
+SpMM time: 17.886
+
+1st pattern computation
+
+kernel execution time:  7.11089 ms
+SpMM template time: 7.68253
+
+kernel execution time:  6.4731 ms
+GeMM time: 9.33681
+
+kernel execution time:  275.759 ms
+ref 2 GeMM template time: 276.631
+
+2nd pattern computation
+
+kernel execution time:  6.3356 ms
+ref3 GeMM template time: 6.81471
+
+kernel execution time:  4.47152 ms
+SpMM template time ref4: 4.76175
+
+reference pattern computation
+
+kernel execution time:  658.29 ms
+taco reference time: 658.76
+
+kernel execution time:  687.782 ms
+taco reference new time: 688.49
+filenum: 4
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 128, vals: 5994880
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  6.78576 ms
+fused time: 8.17823
+
+kernel execution time:  18.7121 ms
+SpMM time: 20.1397
+
+1st pattern computation
+
+kernel execution time:  6.53343 ms
+SpMM template time: 7.11366
+
+kernel execution time:  8.13131 ms
+GeMM time: 10.4823
+
+kernel execution time:  341.676 ms
+ref 2 GeMM template time: 341.986
+
+2nd pattern computation
+
+kernel execution time:  7.69804 ms
+ref3 GeMM template time: 8.15483
+
+kernel execution time:  4.61245 ms
+SpMM template time ref4: 4.90988
+
+reference pattern computation
+
+kernel execution time:  343.367 ms
+taco reference time: 343.755
+
+kernel execution time:  374.197 ms
+taco reference new time: 374.704
+filenum: 5
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 128, vals: 7993728
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  11.6176 ms
+fused time: 15.1115
+
+kernel execution time:  22.6994 ms
+SpMM time: 23.3508
+
+1st pattern computation
+
+kernel execution time:  11.9033 ms
+SpMM template time: 12.4284
+
+kernel execution time:  10.4635 ms
+GeMM time: 10.9336
+
+kernel execution time:  452.62 ms
+ref 2 GeMM template time: 452.931
+
+2nd pattern computation
+
+kernel execution time:  9.29193 ms
+ref3 GeMM template time: 9.74228
+
+kernel execution time:  7.21434 ms
+SpMM template time ref4: 7.5664
+
+reference pattern computation
+
+kernel execution time:  570.857 ms
+taco reference time: 571.396
+
+kernel execution time:  623.78 ms
+taco reference new time: 624.325
+filenum: 6
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 128, vals: 10666752
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  15.2241 ms
+fused time: 17.4586
+
+kernel execution time:  31.7064 ms
+SpMM time: 32.3582
+
+1st pattern computation
+
+kernel execution time:  16.5454 ms
+SpMM template time: 17.0802
+
+kernel execution time:  13.8741 ms
+GeMM time: 14.3707
+
+kernel execution time:  604.662 ms
+ref 2 GeMM template time: 605.002
+
+2nd pattern computation
+
+kernel execution time:  11.9433 ms
+ref3 GeMM template time: 12.403
+
+kernel execution time:  9.77169 ms
+SpMM template time ref4: 10.1324
+
+reference pattern computation
+
+kernel execution time:  841.646 ms
+taco reference time: 842.221
+
+kernel execution time:  932.828 ms
+taco reference new time: 933.378
+filenum: 7
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 128, vals: 15512576
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  25.1981 ms
+fused time: 28.3453
+
+kernel execution time:  51.7019 ms
+SpMM time: 52.3269
+
+1st pattern computation
+
+kernel execution time:  24.2567 ms
+SpMM template time: 24.8204
+
+kernel execution time:  19.9687 ms
+GeMM time: 20.5536
+
+kernel execution time:  874.389 ms
+ref 2 GeMM template time: 874.8
+
+2nd pattern computation
+
+kernel execution time:  17.1428 ms
+ref3 GeMM template time: 17.605
+
+kernel execution time:  12.4989 ms
+SpMM template time ref4: 12.9327
+
+reference pattern computation
+
+kernel execution time:  374.424 ms
+taco reference time: 375.053
+
+kernel execution time:  412.224 ms
+taco reference new time: 412.828
+filenum: 8
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  22.3642 ms
+fused time: 22.9541
+
+kernel execution time:  48.8361 ms
+SpMM time: 49.478
+
+1st pattern computation
+
+kernel execution time:  24.4919 ms
+SpMM template time: 25.0744
+
+kernel execution time:  23.1278 ms
+GeMM time: 23.714
+
+kernel execution time:  1021.89 ms
+ref 2 GeMM template time: 1022.32
+
+2nd pattern computation
+
+kernel execution time:  19.872 ms
+ref3 GeMM template time: 20.3315
+
+kernel execution time:  14.608 ms
+SpMM template time ref4: 15.077
+
+reference pattern computation
+
+kernel execution time:  1080.68 ms
+taco reference time: 1081.32
+
+kernel execution time:  1211.77 ms
+taco reference new time: 1212.36
+filenum: 9
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 128, vals: 21887744
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  16.318 ms
+fused time: 18.887
+
+kernel execution time:  56.5258 ms
+SpMM time: 57.1171
+
+1st pattern computation
+
+kernel execution time:  18.2007 ms
+SpMM template time: 18.7215
+
+kernel execution time:  28.1041 ms
+GeMM time: 28.6173
+
+kernel execution time:  1232.84 ms
+ref 2 GeMM template time: 1233.26
+
+2nd pattern computation
+
+kernel execution time:  23.6402 ms
+ref3 GeMM template time: 24.1216
+
+kernel execution time:  10.6221 ms
+SpMM template time ref4: 11.1278
+
+reference pattern computation
+
+kernel execution time:  136.61 ms
+taco reference time: 137.191
+
+kernel execution time:  143.222 ms
+taco reference new time: 143.823
+filenum: 10
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  22.1951 ms
+fused time: 25.4707
+
+kernel execution time:  69.5817 ms
+SpMM time: 70.2133
+
+1st pattern computation
+
+kernel execution time:  25.2229 ms
+SpMM template time: 25.818
+
+kernel execution time:  34.0166 ms
+GeMM time: 34.5719
+
+kernel execution time:  1506.8 ms
+ref 2 GeMM template time: 1507.32
+
+2nd pattern computation
+
+kernel execution time:  27.9513 ms
+ref3 GeMM template time: 28.4381
+
+kernel execution time:  13.4585 ms
+SpMM template time ref4: 14.0168
+
+reference pattern computation
+
+kernel execution time:  182.244 ms
+taco reference time: 182.878
+
+kernel execution time:  191.621 ms
+taco reference new time: 192.28
+filenum: 12
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 128, vals: 128000640
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  62.6358 ms
+fused time: 66.0562
+
+kernel execution time:  331.995 ms
+SpMM time: 332.669
+
+1st pattern computation
+
+kernel execution time:  81.0262 ms
+SpMM template time: 81.6316
+
+kernel execution time:  155.308 ms
+GeMM time: 155.913
+
+kernel execution time:  7174.32 ms
+ref 2 GeMM template time: 7175.38
+
+2nd pattern computation
+
+kernel execution time:  131.848 ms
+ref3 GeMM template time: 132.36
+
+kernel execution time:  43.681 ms
+SpMM template time ref4: 44.293
+
+reference pattern computation
+
+kernel execution time:  444.857 ms
+taco reference time: 445.492
+
+kernel execution time:  467.509 ms
+taco reference new time: 468.15
+filenum: 15
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 128, vals: 711465728
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  621.338 ms
+fused time: 625.05
+
+kernel execution time:  2276.7 ms
+SpMM time: 2277.28
+
+1st pattern computation
+
+kernel execution time:  881.7 ms
+SpMM template time: 882.296
+
+kernel execution time:  859.785 ms
+GeMM time: 860.272
+
+kernel execution time:  39771.6 ms
+ref 2 GeMM template time: 39772.6
+
+2nd pattern computation
+
+kernel execution time:  748.251 ms
+ref3 GeMM template time: 748.758
+
+kernel execution time:  452.61 ms
+SpMM template time ref4: 453.163
+
+reference pattern computation
+
+kernel execution time:  19528.6 ms
+taco reference time: 19529.7
+
+kernel execution time:  26715.2 ms
+taco reference new time: 26716.6
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 2
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 10974, B2_dimension: 10974, vals: 428650
+C1_dimension: 10974, C2_dimension: 128, vals: 1404672
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  2.64213 ms
+fused time: 6.13507
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  3.84056 ms
+SpMM time: 4.24008
+
+kernel execution time:  1.61274 ms
+SpMM template time: 2.04575
+
+kernel execution time:  2.33971 ms
+GeMM time: 2.69705
+
+kernel execution time:  85.2544 ms
+ref 2 GeMM template time: 86.1514
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  2.2757 ms
+ref3 GeMM template time: 2.64863
+
+kernel execution time:  1.04819 ms
+SpMM template time ref4: 1.27491
+
+-------- reference pattern computation
+
+kernel execution time:  69.4126 ms
+taco reference time: 71.9418
+
+kernel execution time:  71.8522 ms
+taco reference new time: 72.137
+filenum: 3
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 128, vals: 4661376
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  7.47716 ms
+fused time: 11.1061
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  16.7215 ms
+SpMM time: 17.3352
+
+kernel execution time:  7.10234 ms
+SpMM template time: 7.68864
+
+kernel execution time:  6.44691 ms
+GeMM time: 9.89357
+
+kernel execution time:  275.868 ms
+ref 2 GeMM template time: 276.795
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  6.21948 ms
+ref3 GeMM template time: 6.86379
+
+kernel execution time:  4.55999 ms
+SpMM template time ref4: 4.85255
+
+-------- reference pattern computation
+
+kernel execution time:  643.662 ms
+taco reference time: 644.221
+
+kernel execution time:  682.88 ms
+taco reference new time: 683.468
+filenum: 4
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001
+C1_dimension: 46835, C2_dimension: 128, vals: 5994880
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  7.25024 ms
+fused time: 11.0411
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  18.4386 ms
+SpMM time: 18.956
+
+kernel execution time:  6.48062 ms
+SpMM template time: 7.03658
+
+kernel execution time:  7.9428 ms
+GeMM time: 9.42206
+
+kernel execution time:  343.414 ms
+ref 2 GeMM template time: 343.746
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  6.9495 ms
+ref3 GeMM template time: 7.40299
+
+kernel execution time:  4.95305 ms
+SpMM template time ref4: 5.26981
+
+-------- reference pattern computation
+
+kernel execution time:  338.889 ms
+taco reference time: 339.74
+
+kernel execution time:  373.621 ms
+taco reference new time: 374.075
+filenum: 5
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383
+C1_dimension: 62451, C2_dimension: 128, vals: 7993728
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  11.3714 ms
+fused time: 15.0722
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  22.4213 ms
+SpMM time: 22.9773
+
+kernel execution time:  11.8747 ms
+SpMM template time: 12.4314
+
+kernel execution time:  10.2572 ms
+GeMM time: 12.818
+
+kernel execution time:  451.818 ms
+ref 2 GeMM template time: 452.131
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  9.4658 ms
+ref3 GeMM template time: 9.90856
+
+kernel execution time:  6.97316 ms
+SpMM template time ref4: 7.30846
+
+-------- reference pattern computation
+
+kernel execution time:  543.932 ms
+taco reference time: 544.422
+
+kernel execution time:  623.419 ms
+taco reference new time: 623.935
+filenum: 6
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 83334, B2_dimension: 83334, vals: 6010480
+C1_dimension: 83334, C2_dimension: 128, vals: 10666752
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  15.18 ms
+fused time: 18.5471
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  31.3038 ms
+SpMM time: 31.9251
+
+kernel execution time:  16.4816 ms
+SpMM template time: 17.0655
+
+kernel execution time:  13.7454 ms
+GeMM time: 14.2668
+
+kernel execution time:  601.657 ms
+ref 2 GeMM template time: 602.024
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  14.354 ms
+ref3 GeMM template time: 14.8072
+
+kernel execution time:  9.41569 ms
+SpMM template time ref4: 9.77992
+
+-------- reference pattern computation
+
+kernel execution time:  805.535 ms
+taco reference time: 806.106
+
+kernel execution time:  928.447 ms
+taco reference new time: 928.999
+filenum: 7
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 121192, B2_dimension: 121192, vals: 2624331
+C1_dimension: 121192, C2_dimension: 128, vals: 15512576
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  25.2666 ms
+fused time: 27.8771
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  51.9104 ms
+SpMM time: 52.5127
+
+kernel execution time:  23.9709 ms
+SpMM template time: 24.5371
+
+kernel execution time:  19.8979 ms
+GeMM time: 20.5052
+
+kernel execution time:  878.762 ms
+ref 2 GeMM template time: 879.166
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  16.9454 ms
+ref3 GeMM template time: 17.4072
+
+kernel execution time:  12.6943 ms
+SpMM template time ref4: 13.1204
+
+-------- reference pattern computation
+
+kernel execution time:  356.591 ms
+taco reference time: 357.146
+
+kernel execution time:  408.529 ms
+taco reference new time: 409.172
+filenum: 8
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 140874, B2_dimension: 140874, vals: 7813404
+C1_dimension: 140874, C2_dimension: 128, vals: 18031872
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  22.2469 ms
+fused time: 22.8567
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  49.6959 ms
+SpMM time: 50.3273
+
+kernel execution time:  24.2333 ms
+SpMM template time: 24.8116
+
+kernel execution time:  23.0719 ms
+GeMM time: 23.6169
+
+kernel execution time:  1017.55 ms
+ref 2 GeMM template time: 1018
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  19.3601 ms
+ref3 GeMM template time: 19.8249
+
+kernel execution time:  14.2804 ms
+SpMM template time ref4: 14.7665
+
+-------- reference pattern computation
+
+kernel execution time:  1048.84 ms
+taco reference time: 1049.44
+
+kernel execution time:  1209.88 ms
+taco reference new time: 1210.47
+filenum: 9
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 170998, B2_dimension: 170998, vals: 958936
+C1_dimension: 170998, C2_dimension: 128, vals: 21887744
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  15.8746 ms
+fused time: 19.813
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  55.9723 ms
+SpMM time: 56.6152
+
+kernel execution time:  17.9806 ms
+SpMM template time: 18.623
+
+kernel execution time:  27.7406 ms
+GeMM time: 28.4557
+
+kernel execution time:  1236.24 ms
+ref 2 GeMM template time: 1236.69
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  23.8143 ms
+ref3 GeMM template time: 24.2887
+
+kernel execution time:  10.5388 ms
+SpMM template time ref4: 11.0342
+
+-------- reference pattern computation
+
+kernel execution time:  131.162 ms
+taco reference time: 131.729
+
+kernel execution time:  142.639 ms
+taco reference new time: 143.262
+filenum: 10
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 206500, B2_dimension: 206500, vals: 1273389
+C1_dimension: 206500, C2_dimension: 128, vals: 26432000
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  22.0414 ms
+fused time: 24.5186
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  69.6038 ms
+SpMM time: 70.136
+
+kernel execution time:  24.6489 ms
+SpMM template time: 25.1488
+
+kernel execution time:  33.413 ms
+GeMM time: 33.9108
+
+kernel execution time:  1497.05 ms
+ref 2 GeMM template time: 1497.51
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  29.3442 ms
+ref3 GeMM template time: 29.8157
+
+kernel execution time:  12.9244 ms
+SpMM template time ref4: 13.3503
+
+-------- reference pattern computation
+
+kernel execution time:  174.347 ms
+taco reference time: 174.811
+
+kernel execution time:  190.408 ms
+taco reference new time: 190.973
+filenum: 12
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536
+C1_dimension: 1000005, C2_dimension: 128, vals: 128000640
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  61.219 ms
+fused time: 65.9604
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  329.098 ms
+SpMM time: 329.782
+
+kernel execution time:  80.1902 ms
+SpMM template time: 80.758
+
+kernel execution time:  154.474 ms
+GeMM time: 155.08
+
+kernel execution time:  7192.75 ms
+ref 2 GeMM template time: 7193.76
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  132.057 ms
+ref3 GeMM template time: 132.561
+
+kernel execution time:  43.0394 ms
+SpMM template time ref4: 43.6558
+
+-------- reference pattern computation
+
+kernel execution time:  430.157 ms
+taco reference time: 430.825
+
+kernel execution time:  463.848 ms
+taco reference new time: 464.498
+filenum: 15
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291
+C1_dimension: 5558326, C2_dimension: 128, vals: 711465728
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  602.9 ms
+fused time: 606.764
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  2126.86 ms
+SpMM time: 2127.49
+
+kernel execution time:  871.892 ms
+SpMM template time: 872.491
+
+kernel execution time:  845.837 ms
+GeMM time: 846.363
+
+kernel execution time:  39844.5 ms
+ref 2 GeMM template time: 39845.6
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  740.208 ms
+ref3 GeMM template time: 740.701
+
+kernel execution time:  447.66 ms
+SpMM template time ref4: 448.268
+
+-------- reference pattern computation
+
+kernel execution time:  18669.7 ms
+taco reference time: 18671
+
+kernel execution time:  26729.8 ms
+taco reference new time: 26731.1
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 1
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 128, vals: 640
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  0.044111 ms
+fused time: 0.69912
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  0.019191 ms
+SpMM time: 1.30214
+
+kernel execution time:  0.499717 ms
+SpMM template time: 1.01315
+
+kernel execution time:  0.096371 ms
+GeMM time: 0.631739
+
+kernel execution time:  0.070191 ms
+ref 2 GeMM template time: 0.560537
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  0.070901 ms
+ref3 GeMM template time: 0.579358
+
+kernel execution time:  0.02984 ms
+SpMM template time ref4: 0.851161
+
+-------- reference pattern computation
+
+kernel execution time:  0.194393 ms
+taco reference time: 0.628889
+
+kernel execution time:  0.242974 ms
+taco reference new time: 0.667439
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 1
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 128, vals: 640
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  0.043801 ms
+fused time: 0.685989
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  0.01878 ms
+SpMM time: 0.861191
+
+kernel execution time:  0.503617 ms
+SpMM template time: 1.00581
+
+kernel execution time:  0.095292 ms
+GeMM time: 0.583898
+
+kernel execution time:  0.070121 ms
+ref 2 GeMM template time: 0.520137
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  0.070641 ms
+ref3 GeMM template time: 0.537688
+
+kernel execution time:  0.035491 ms
+SpMM template time ref4: 0.514717
+
+-------- reference pattern computation
+
+kernel execution time:  0.194192 ms
+taco reference time: 0.618658
+
+kernel execution time:  0.239543 ms
+taco reference new time: 0.655149
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 1
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 128, vals: 640
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  0.04383 ms
+fused time: 0.680319
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  0.019891 ms
+SpMM time: 0.72453
+
+kernel execution time:  0.515237 ms
+SpMM template time: 0.995294
+
+kernel execution time:  0.095731 ms
+GeMM time: 0.628018
+
+kernel execution time:  0.071101 ms
+ref 2 GeMM template time: 0.539967
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  0.071171 ms
+ref3 GeMM template time: 0.592848
+
+kernel execution time:  0.029131 ms
+SpMM template time ref4: 0.582288
+
+-------- reference pattern computation
+
+kernel execution time:  0.254484 ms
+taco reference time: 0.768111
+
+kernel execution time:  0.273853 ms
+taco reference new time: 0.781751
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 1
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 128, vals: 640
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  0.043111 ms
+fused time: 0.676409
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  0.01898 ms
+SpMM time: 0.836491
+
+kernel execution time:  0.489586 ms
+SpMM template time: 0.969303
+
+kernel execution time:  0.094641 ms
+GeMM time: 0.561697
+
+kernel execution time:  0.070251 ms
+ref 2 GeMM template time: 0.545778
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  0.07045 ms
+ref3 GeMM template time: 0.550897
+
+kernel execution time:  0.0282 ms
+SpMM template time ref4: 0.463227
+
+-------- reference pattern computation
+
+kernel execution time:  0.245783 ms
+taco reference time: 0.761711
+
+kernel execution time:  0.304314 ms
+taco reference new time: 0.834081
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 1
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 128, vals: 640
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  0.03874 ms
+fused time: 0.669969
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  0.019931 ms
+SpMM time: 0.857531
+
+kernel execution time:  0.507936 ms
+SpMM template time: 1.00321
+
+kernel execution time:  0.093961 ms
+GeMM time: 0.727229
+
+kernel execution time:  0.070371 ms
+ref 2 GeMM template time: 0.867451
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  0.069541 ms
+ref3 GeMM template time: 0.546687
+
+kernel execution time:  0.02565 ms
+SpMM template time ref4: 0.541707
+
+-------- reference pattern computation
+
+kernel execution time:  0.195092 ms
+taco reference time: 0.615338
+
+kernel execution time:  0.239653 ms
+taco reference new time: 0.657449
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 3
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 128, vals: 4661376
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+kernel execution time:  202.946 ms
+fused time: 203.369
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  604.532 ms
+SpMM time: 605.081
+
+kernel execution time:  137.88 ms
+SpMM template time: 138.397
+
+kernel execution time:  281.01 ms
+GeMM time: 281.522
+
+kernel execution time:  267.152 ms
+ref 2 GeMM template time: 267.64
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  202.612 ms
+ref3 GeMM template time: 203.13
+
+kernel execution time:  72.1263 ms
+SpMM template time ref4: 72.634
+
+-------- reference pattern computation
+
+kernel execution time:  26464.3 ms
+taco reference time: 26465.4
+
+kernel execution time:  34639.1 ms
+taco reference new time: 34640.2
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 3
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 128, vals: 4661376
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  601.783 ms
+SpMM time: 602.149
+
+kernel execution time:  135.443 ms
+SpMM template time: 135.968
+
+kernel execution time:  277.027 ms
+GeMM time: 277.575
+
+kernel execution time:  262.418 ms
+ref 2 GeMM template time: 262.884
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  200.17 ms
+ref3 GeMM template time: 200.726
+
+kernel execution time:  71.523 ms
+SpMM template time ref4: 72.0077
+
+-------- reference pattern computation
+
+kernel execution time:  26468.2 ms
+taco reference time: 26469.2
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 3
+---------------------------------
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 36417, B2_dimension: 36417, vals: 4344765
+C1_dimension: 36417, C2_dimension: 128, vals: 4661376
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  600.837 ms
+SpMM time: 601.215
+
+kernel execution time:  137.481 ms
+SpMM template time: 138.009
+
+kernel execution time:  280.631 ms
+GeMM time: 281.208
+
+kernel execution time:  266.073 ms
+ref 2 GeMM template time: 266.549
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  200.674 ms
+ref3 GeMM template time: 201.238
+
+kernel execution time:  72.8548 ms
+SpMM template time ref4: 73.3562
+
+-------- reference pattern computation
+
+kernel execution time:  26717.7 ms
+taco reference time: 26718.7
+
+kernel execution time:  34613.6 ms
+taco reference new time: 34614.6
+
+kernel execution time:  202.425 ms
+fused time: 203.027
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 3
+---------------------------------
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 0
+---------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  0.924512 ms
+SpMM time: 1.22967
+
+kernel execution time:  1.23287 ms
+SpMM template time: 1.51353
+
+kernel execution time:  20.7805 ms
+GeMM time: 21.0769
+
+kernel execution time:  19.6116 ms
+ref 2 GeMM template time: 19.8379
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  14.7563 ms
+ref3 GeMM template time: 15.0245
+
+kernel execution time:  0.823641 ms
+SpMM template time ref4: 1.05233
+
+-------- reference pattern computation
+
+kernel execution time:  34.1041 ms
+taco reference time: 34.4607
+
+kernel execution time:  41.9195 ms
+taco reference new time: 42.2061
+
+kernel execution time:  4.76242 ms
+fused time: 5.04101
+filenum: 1
+---------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  394.8 ms
+SpMM time: 395.503
+
+kernel execution time:  473.148 ms
+SpMM template time: 473.684
+
+kernel execution time:  4117.68 ms
+GeMM time: 4118.6
+
+kernel execution time:  3957.31 ms
+ref 2 GeMM template time: 3958.16
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  3017.13 ms
+ref3 GeMM template time: 3017.67
+
+kernel execution time:  314.652 ms
+SpMM template time ref4: 315.164
+
+-------- reference pattern computation
+
+kernel execution time:  11644.6 ms
+taco reference time: 11645.6
+
+kernel execution time:  14402.6 ms
+taco reference new time: 14403.6
+
+kernel execution time:  1261.33 ms
+fused time: 1261.88
+
+spmm-spmm execution
+
+-----------------------------------------
+filenum: 0
+---------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 2708, B2_dimension: 2708, vals: 5429
+C1_dimension: 2708, C2_dimension: 128, vals: 346624
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  0.209133 ms
+SpMM time: 0.517016
+
+kernel execution time:  0.579748 ms
+SpMM template time: 0.864251
+
+kernel execution time:  1.0574 ms
+GeMM time: 1.37727
+
+kernel execution time:  19.621 ms
+ref 2 GeMM template time: 19.8504
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  1.44618 ms
+ref3 GeMM template time: 1.72243
+
+kernel execution time:  0.384425 ms
+SpMM template time ref4: 0.610708
+
+-------- reference pattern computation
+
+kernel execution time:  3.59893 ms
+taco reference time: 3.95508
+
+kernel execution time:  4.81855 ms
+taco reference new time: 5.10349
+
+kernel execution time:  1.47107 ms
+fused time: 1.90463
+filenum: 1
+---------------------------------
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx
+ref(i,l)=B(i,j)*C(i,k)*D(j,k);
+B1_dimension: 548551, B2_dimension: 548551, vals: 1851744
+C1_dimension: 548551, C2_dimension: 128, vals: 70214528
+D1_dimension: 128, D2_dimension: 64, vals: 8192
+
+
+--------- 1st pattern computation TTM, GEMM
+
+kernel execution time:  50.1795 ms
+SpMM time: 50.5567
+
+kernel execution time:  64.2504 ms
+SpMM template time: 64.8179
+
+kernel execution time:  96.8464 ms
+GeMM time: 97.4123
+
+kernel execution time:  3949.87 ms
+ref 2 GeMM template time: 3950.93
+
+--------- 2nd pattern computation GEMM, SpMM
+
+kernel execution time:  123.802 ms
+ref3 GeMM template time: 124.342
+
+kernel execution time:  39.2723 ms
+SpMM template time ref4: 39.8322
+
+-------- reference pattern computation
+
+kernel execution time:  457.271 ms
+taco reference time: 457.979
+
+kernel execution time:  427.194 ms
+taco reference new time: 427.789
+
+kernel execution time:  93.1417 ms
+fused time: 93.7188
diff --git a/test/stats/spmv-spmv.txt b/test/stats/spmv-spmv.txt
new file mode 100644
index 000000000..90b7482e7
--- /dev/null
+++ b/test/stats/spmv-spmv.txt
@@ -0,0 +1,81 @@
+
+spmv-spmv execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+spmv-spmv execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+spmv-spmv execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+spmv-spmv execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+spmv-spmv execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+spmv-spmv execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+spmv-spmv execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+spmv-spmv execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
+
+spmv-spmv execution
+
+-----------------------------------------
+A(i) = B(i,j) * C(j,k) * v(k);
+B1_dimension: 5, B2_dimension: 5, vals: 19
+C1_dimension: 5, C2_dimension: 5, vals: 19
+D1_dimension: 5, vals: 5
+
diff --git a/test/stats/ttm-ttm.txt b/test/stats/ttm-ttm.txt
new file mode 100644
index 000000000..7080af67b
--- /dev/null
+++ b/test/stats/ttm-ttm.txt
@@ -0,0 +1,2924 @@
+ttm-ttm execution
+
+-----------------------------------------
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+B1_dimension: 532924, B2_dimension: 17262471, B3_dimension: 532924, vals: 140126181
+C1_dimension: 2480308, C2_dimension: 16, vals: 39684928
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+kernel execution time:  6299.03 ms
+fused time: 6300.12
+
+kernel execution time:  21080.2 ms
+reference time: 21081.3
+
+kernel execution time:  2757.48 ms
+reference time: 2757.94
+
+kernel execution time:  5064.84 ms
+reference time: 5065.87
+
+ttm-ttm execution
+
+-----------------------------------------
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 16, vals: 25715056
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+kernel execution time:  3709.97 ms
+fused time: 3711.05
+
+kernel execution time:  16159.4 ms
+reference time: 16160.5
+
+kernel execution time:  1773.12 ms
+reference time: 1773.58
+
+kernel execution time:  3030.89 ms
+reference time: 3031.42
+
+ttm-ttm execution 
+
+------------------------------------------
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 28818, C2_dimension: 16, vals: 461088
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+kernel execution time:  487.016 ms
+fused time: 487.513
+
+kernel execution time:  11041.9 ms
+reference time: 11043
+
+kernel execution time:  1009.63 ms
+reference time: 1010.12
+
+kernel execution time:  37.1546 ms
+reference time: 37.757
+
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 25495389, C2_dimension: 16, vals: 407926224
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+kernel execution time:  11984.9 ms
+fused time: 11985.9
+
+kernel execution time:  34959 ms
+reference time: 34960.1
+
+kernel execution time:  8476.95 ms
+reference time: 8477.9
+
+kernel execution time:  1869.85 ms
+reference time: 1870.39
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 16, vals: 32
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+kernel execution time:  2730.05 ms
+fused time: 2731.15
+
+kernel execution time:  4167.74 ms
+reference time: 4168.86
+
+kernel execution time:  550.937 ms
+reference time: 551.395
+
+kernel execution time:  2788.55 ms
+reference time: 2789.07
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/1998DARPA.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 16, vals: 380419568
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550
+C1_dimension: 166, C2_dimension: 16, vals: 2656
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+kernel execution time:  10491.6 ms
+fused time: 10492.7
+
+kernel execution time:  15968 ms
+reference time: 15969.1
+
+kernel execution time:  1886.09 ms
+reference time: 1886.55
+
+kernel execution time:  10763.7 ms
+reference time: 10765
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 16, vals: 380419568
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+kernel execution time:  847.087 ms
+fused time: 847.588
+
+kernel execution time:  7136.54 ms
+reference time: 7137.57
+
+kernel execution time:  1340.45 ms
+reference time: 1340.91
+
+kernel execution time:  8.28247 ms
+reference time: 8.80899
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 38954435, B2_dimension: 38955429, B3_dimension: 38954435, vals: 139920770
+C1_dimension: 532, C2_dimension: 16, vals: 8512
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550
+C1_dimension: 166, C2_dimension: 16, vals: 2656
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+kernel execution time:  10540.6 ms
+fused time: 10541.6
+
+kernel execution time:  16072 ms
+reference time: 16073
+
+kernel execution time:  1900.39 ms
+reference time: 1900.89
+
+kernel execution time:  10819.5 ms
+reference time: 10820.5
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 38954435, B2_dimension: 38955429, B3_dimension: 38954435, vals: 139920770
+C1_dimension: 532, C2_dimension: 16, vals: 8512
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 16, vals: 25715056
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+kernel execution time:  3689.85 ms
+fused time: 3690.99
+
+kernel execution time:  16162.6 ms
+reference time: 16163.7
+
+kernel execution time:  2035.42 ms
+TTM1: 2035.96
+
+kernel execution time:  3004.2 ms
+TTM2: 3004.74
+
+kernel execution time:  147.233 ms
+dense: 147.648
+
+kernel execution time:  2240.45 ms
+TTM after dense: 2240.96
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 16, vals: 25715056
+D1_dimension: 16, D2_dimension: 1024, vals: 16384
+
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 28818, C2_dimension: 16, vals: 461088
+D1_dimension: 16, D2_dimension: 32, vals: 512
+
+
+kernel execution time:  542.361 ms
+fused time: 542.813
+
+kernel execution time:  22547.6 ms
+reference time: 22548.6
+
+kernel execution time:  1008.25 ms
+TTM1: 1008.82
+
+kernel execution time:  70.7434 ms
+TTM2: 71.2926
+
+kernel execution time:  5.2174 ms
+dense: 5.58699
+
+kernel execution time:  2086.85 ms
+TTM after dense: 2087.25
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 28818, C2_dimension: 16, vals: 461088
+D1_dimension: 16, D2_dimension: 16, vals: 256
+
+
+kernel execution time:  531.924 ms
+fused time: 532.696
+
+kernel execution time:  11314 ms
+reference time: 11315.1
+
+kernel execution time:  1009.54 ms
+TTM1: 1010.08
+
+kernel execution time:  37.5466 ms
+TTM2: 38.0867
+
+kernel execution time:  2.77519 ms
+dense: 3.13589
+
+kernel execution time:  1014.37 ms
+TTM after dense: 1014.74
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 28818, C2_dimension: 16, vals: 461088
+D1_dimension: 16, D2_dimension: 64, vals: 1024
+
+
+kernel execution time:  604.787 ms
+fused time: 605.25
+
+kernel execution time:  45011.1 ms
+reference time: 45012.2
+
+kernel execution time:  1008.41 ms
+TTM1: 1008.97
+
+kernel execution time:  137.791 ms
+TTM2: 138.316
+
+kernel execution time:  10.0591 ms
+dense: 10.4452
+
+kernel execution time:  5120.5 ms
+TTM after dense: 5121.57
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.129572 ms
+fused time: 0.560598
+
+kernel execution time:  0.151942 ms
+reference time: 0.999013
+
+kernel execution time:  0.01803 ms
+TTM1: 0.310364
+
+kernel execution time:  0.119052 ms
+TTM2: 0.897713
+
+kernel execution time:  0.093421 ms
+dense: 0.284444
+
+kernel execution time:  0.032111 ms
+TTM after dense: 0.662509
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.136562 ms
+fused time: 0.555088
+
+kernel execution time:  0.155282 ms
+reference time: 1.02811
+
+kernel execution time:  0.01913 ms
+TTM1: 0.293014
+
+kernel execution time:  0.148032 ms
+TTM2: 1.08159
+
+kernel execution time:  0.093351 ms
+dense: 0.282434
+
+kernel execution time:  0.03336 ms
+TTM after dense: 0.309775
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.133302 ms
+fused time: 0.590248
+
+kernel execution time:  0.154633 ms
+reference time: 0.976683
+
+kernel execution time:  0.032061 ms
+TTM1: 0.554668
+
+kernel execution time:  0.231943 ms
+TTM2: 0.790901
+
+kernel execution time:  0.093152 ms
+dense: 0.456727
+
+kernel execution time:  0.168413 ms
+TTM after dense: 0.866702
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.211383 ms
+fused time: 0.979204
+
+kernel execution time:  0.300854 ms
+reference time: 0.976764
+
+kernel execution time:  0.03182 ms
+TTM1: 0.986423
+
+kernel execution time:  0.223513 ms
+TTM2: 1.25582
+
+kernel execution time:  0.140142 ms
+dense: 0.491247
+
+kernel execution time:  0.057651 ms
+TTM after dense: 0.632639
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.226813 ms
+fused time: 0.981434
+
+kernel execution time:  0.299435 ms
+reference time: 0.980784
+
+kernel execution time:  0.03171 ms
+TTM1: 1.17345
+
+kernel execution time:  0.236723 ms
+TTM2: 1.08452
+
+kernel execution time:  0.099581 ms
+dense: 0.448246
+
+kernel execution time:  0.055691 ms
+TTM after dense: 0.595948
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.183452 ms
+fused time: 0.934223
+
+kernel execution time:  0.258304 ms
+reference time: 1.14423
+
+kernel execution time:  0.028031 ms
+TTM1: 0.530247
+
+kernel execution time:  0.192393 ms
+TTM2: 0.865752
+
+kernel execution time:  0.104401 ms
+dense: 0.458676
+
+kernel execution time:  0.058181 ms
+TTM after dense: 0.641949
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.212263 ms
+fused time: 1.00447
+
+kernel execution time:  0.293174 ms
+reference time: 1.00466
+
+kernel execution time:  0.03429 ms
+TTM1: 1.06194
+
+kernel execution time:  0.227643 ms
+TTM2: 0.77555
+
+kernel execution time:  0.093021 ms
+dense: 0.615169
+
+kernel execution time:  0.111302 ms
+TTM after dense: 1.19147
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.126042 ms
+fused time: 0.542138
+
+kernel execution time:  0.170263 ms
+reference time: 0.974603
+
+kernel execution time:  0.01972 ms
+TTM1: 0.286434
+
+kernel execution time:  0.125282 ms
+TTM2: 0.402736
+
+kernel execution time:  0.103582 ms
+dense: 0.7661
+
+kernel execution time:  0.04149 ms
+TTM after dense: 0.320775
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.193463 ms
+fused time: 0.831391
+
+kernel execution time:  0.347254 ms
+reference time: 1.12168
+
+kernel execution time:  0.03811 ms
+TTM1: 1.19729
+
+kernel execution time:  0.334915 ms
+TTM2: 1.14708
+
+kernel execution time:  0.109681 ms
+dense: 0.526707
+
+kernel execution time:  0.140412 ms
+TTM after dense: 0.76001
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.147722 ms
+fused time: 0.7865
+
+kernel execution time:  0.237434 ms
+reference time: 1.01788
+
+kernel execution time:  0.020341 ms
+TTM1: 0.330005
+
+kernel execution time:  0.201823 ms
+TTM2: 1.01705
+
+kernel execution time:  0.069931 ms
+dense: 0.261943
+
+kernel execution time:  0.032231 ms
+TTM after dense: 0.314845
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.210293 ms
+fused time: 0.999243
+
+kernel execution time:  0.577188 ms
+reference time: 1.23453
+
+kernel execution time:  0.032071 ms
+TTM1: 0.965223
+
+kernel execution time:  0.227183 ms
+TTM2: 1.25077
+
+kernel execution time:  0.091622 ms
+dense: 0.449416
+
+kernel execution time:  0.04494 ms
+TTM after dense: 0.73161
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.109392 ms
+fused time: 0.481746
+
+kernel execution time:  0.242474 ms
+reference time: 0.72963
+
+kernel execution time:  0.01624 ms
+TTM1: 0.257934
+
+kernel execution time:  0.089982 ms
+TTM2: 0.341365
+
+kernel execution time:  0.106392 ms
+dense: 0.74066
+
+kernel execution time:  0.027241 ms
+TTM after dense: 0.277864
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.938612 ms
+fused time: 1.66032
+
+kernel execution time:  0.598878 ms
+reference time: 1.2444
+
+kernel execution time:  0.027881 ms
+TTM1: 0.664309
+
+kernel execution time:  0.172162 ms
+TTM2: 1.0861
+
+kernel execution time:  0.087052 ms
+dense: 0.420256
+
+kernel execution time:  0.044921 ms
+TTM after dense: 0.669959
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.723749 ms
+fused time: 1.52668
+
+kernel execution time:  1.33287 ms
+reference time: 2.02148
+
+kernel execution time:  0.03285 ms
+TTM1: 1.06994
+
+kernel execution time:  0.227263 ms
+TTM2: 1.00641
+
+kernel execution time:  0.121451 ms
+dense: 0.410656
+
+kernel execution time:  0.046891 ms
+TTM after dense: 0.612258
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.654879 ms
+fused time: 1.0716
+
+kernel execution time:  1.24327 ms
+reference time: 1.59976
+
+kernel execution time:  0.691129 ms
+TTM1: 1.0059
+
+kernel execution time:  0.859771 ms
+TTM2: 1.1516
+
+kernel execution time:  0.136762 ms
+dense: 0.334665
+
+kernel execution time:  0.524517 ms
+TTM after dense: 0.806231
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 32, vals: 51430112
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  891.501 ms
+fused time: 892.508
+
+kernel execution time:  6378.22 ms
+reference time: 6379.42
+
+kernel execution time:  265.033 ms
+TTM1: 265.676
+
+kernel execution time:  514.397 ms
+TTM2: 515.1
+
+kernel execution time:  70.5991 ms
+dense: 71.0624
+
+kernel execution time:  541.878 ms
+TTM after dense: 542.548
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  753.49 ms
+fused time: 754.615
+
+kernel execution time:  1394.55 ms
+reference time: 1395.28
+
+kernel execution time:  197.246 ms
+TTM1: 197.894
+
+kernel execution time:  503.301 ms
+TTM2: 503.886
+
+kernel execution time:  0.0622 ms
+dense: 1.00584
+
+kernel execution time:  380.931 ms
+TTM after dense: 381.331
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 32, vals: 51430112
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  894.532 ms
+fused time: 895.512
+
+kernel execution time:  6345.62 ms
+reference time: 6346.77
+
+kernel execution time:  266.55 ms
+TTM1: 267.22
+
+kernel execution time:  515.257 ms
+TTM2: 515.893
+
+kernel execution time:  70.7658 ms
+dense: 71.2374
+
+kernel execution time:  542.175 ms
+TTM after dense: 542.864
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 28818, C2_dimension: 32, vals: 922176
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  49.8694 ms
+fused time: 50.6512
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 25495389, C2_dimension: 32, vals: 815852448
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  1309.77 ms
+fused time: 1310.84
+
+kernel execution time:  8179.4 ms
+reference time: 8180.68
+
+kernel execution time:  805.812 ms
+TTM1: 806.562
+
+kernel execution time:  314.204 ms
+TTM2: 314.751
+
+kernel execution time:  1134.47 ms
+dense: 1134.93
+
+kernel execution time:  1621.3 ms
+TTM after dense: 1621.92
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  749.757 ms
+fused time: 750.843
+
+kernel execution time:  1391.56 ms
+reference time: 1392.35
+
+kernel execution time:  196.711 ms
+TTM1: 197.347
+
+kernel execution time:  502.61 ms
+TTM2: 503.193
+
+kernel execution time:  0.063271 ms
+dense: 0.948892
+
+kernel execution time:  381.132 ms
+TTM after dense: 381.508
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 32, vals: 760839136
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  230.973 ms
+fused time: 231.921
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.72187 ms
+fused time: 1.46707
+
+kernel execution time:  0.842291 ms
+reference time: 1.52295
+
+kernel execution time:  0.490417 ms
+TTM1: 1.08223
+
+kernel execution time:  0.653919 ms
+TTM2: 1.17803
+
+kernel execution time:  0.115332 ms
+dense: 0.889372
+
+kernel execution time:  0.446076 ms
+TTM after dense: 1.05921
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 16, vals: 400
+D1_dimension: 16, D2_dimension: 32, vals: 512
+
+
+kernel execution time:  1.29819 ms
+fused time: 2.11481
+
+kernel execution time:  0.560877 ms
+reference time: 1.26788
+
+kernel execution time:  0.506967 ms
+TTM1: 1.14189
+
+kernel execution time:  0.547697 ms
+TTM2: 1.24278
+
+kernel execution time:  0.075421 ms
+dense: 0.508546
+
+kernel execution time:  0.464356 ms
+TTM after dense: 1.09434
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 16, vals: 380419568
+D1_dimension: 16, D2_dimension: 32, vals: 512
+
+
+kernel execution time:  126.199 ms
+fused time: 126.724
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 16, vals: 380419568
+D1_dimension: 16, D2_dimension: 32, vals: 512
+
+
+kernel execution time:  132.543 ms
+fused time: 133.165
+
+kernel execution time:  2405.44 ms
+reference time: 2406.19
+
+kernel execution time:  331.61 ms
+TTM1: 332.199
+
+kernel execution time:  2.26417 ms
+TTM2: 3.02615
+
+kernel execution time:  400.791 ms
+dense: 401.064
+
+kernel execution time:  620.74 ms
+TTM after dense: 621.389
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 16, vals: 25715056
+D1_dimension: 16, D2_dimension: 32, vals: 512
+
+
+kernel execution time:  455.645 ms
+fused time: 456.696
+
+kernel execution time:  718.699 ms
+reference time: 719.384
+
+kernel execution time:  142.557 ms
+TTM1: 143.105
+
+kernel execution time:  256.179 ms
+TTM2: 256.785
+
+kernel execution time:  29.5586 ms
+dense: 30.0451
+
+kernel execution time:  269.529 ms
+TTM after dense: 270.186
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 32, vals: 51430112
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  890.318 ms
+fused time: 891.345
+
+kernel execution time:  2038.26 ms
+reference time: 2038.96
+
+kernel execution time:  265.076 ms
+TTM1: 265.783
+
+kernel execution time:  544.765 ms
+TTM2: 545.423
+
+kernel execution time:  70.9058 ms
+dense: 71.4509
+
+kernel execution time:  541.442 ms
+TTM after dense: 542.115
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 64, vals: 128
+D1_dimension: 64, D2_dimension: 64, vals: 4096
+
+
+kernel execution time:  902.466 ms
+fused time: 903.626
+
+kernel execution time:  1051.52 ms
+reference time: 1052.27
+
+kernel execution time:  385.619 ms
+TTM1: 386.243
+
+kernel execution time:  937.648 ms
+TTM2: 938.212
+
+kernel execution time:  0.067901 ms
+dense: 1.00372
+
+kernel execution time:  380.193 ms
+TTM after dense: 380.613
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 64, vals: 128
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 64, vals: 128
+D1_dimension: 64, D2_dimension: 64, vals: 4096
+
+
+kernel execution time:  898.295 ms
+fused time: 899.297
+
+kernel execution time:  1037.66 ms
+reference time: 1038.39
+
+kernel execution time:  385.768 ms
+TTM1: 386.452
+
+kernel execution time:  939.137 ms
+TTM2: 939.74
+
+kernel execution time:  0.073171 ms
+dense: 1.20129
+
+kernel execution time:  383.479 ms
+TTM after dense: 384.01
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 64, vals: 102860224
+D1_dimension: 64, D2_dimension: 64, vals: 4096
+
+
+kernel execution time:  1034.06 ms
+fused time: 1035.05
+
+kernel execution time:  4275.39 ms
+reference time: 4276.62
+
+kernel execution time:  516.765 ms
+TTM1: 517.518
+
+kernel execution time:  1048.69 ms
+TTM2: 1049.32
+
+kernel execution time:  119.233 ms
+dense: 119.711
+
+kernel execution time:  546.744 ms
+TTM after dense: 547.412
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 32, vals: 51430112
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  894.088 ms
+fused time: 895.234
+
+kernel execution time:  2025.29 ms
+reference time: 2025.92
+
+kernel execution time:  264.446 ms
+TTM1: 265.069
+
+kernel execution time:  541.153 ms
+TTM2: 541.71
+
+kernel execution time:  70.7936 ms
+dense: 71.2153
+
+kernel execution time:  542.474 ms
+TTM after dense: 543.104
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 16, vals: 25715056
+D1_dimension: 16, D2_dimension: 64, vals: 1024
+
+
+kernel execution time:  871.496 ms
+fused time: 872.523
+
+kernel execution time:  1340.14 ms
+reference time: 1340.84
+
+kernel execution time:  143.439 ms
+TTM1: 143.995
+
+kernel execution time:  459.09 ms
+TTM2: 459.668
+
+kernel execution time:  51.7433 ms
+dense: 52.1957
+
+kernel execution time:  545.092 ms
+TTM after dense: 545.899
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 32, vals: 51430112
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  893.815 ms
+fused time: 894.866
+
+kernel execution time:  2016.15 ms
+reference time: 2016.8
+
+kernel execution time:  266.599 ms
+TTM1: 267.18
+
+kernel execution time:  544.015 ms
+TTM2: 544.597
+
+kernel execution time:  70.7604 ms
+dense: 71.1854
+
+kernel execution time:  543.212 ms
+TTM after dense: 543.879
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 28818, C2_dimension: 32, vals: 922176
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  47.6087 ms
+fused time: 48.0666
+
+kernel execution time:  2381.79 ms
+reference time: 2382.51
+
+kernel execution time:  85.3431 ms
+TTM1: 86.158
+
+kernel execution time:  8.56212 ms
+TTM2: 9.19594
+
+kernel execution time:  1.27998 ms
+dense: 1.66095
+
+kernel execution time:  185.324 ms
+TTM after dense: 185.729
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 25495389, C2_dimension: 32, vals: 815852448
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  1312.78 ms
+fused time: 1313.78
+
+kernel execution time:  3548.92 ms
+reference time: 3550.02
+
+kernel execution time:  794.193 ms
+TTM1: 794.835
+
+kernel execution time:  371.233 ms
+TTM2: 371.853
+
+kernel execution time:  1136.25 ms
+dense: 1136.73
+
+kernel execution time:  1608.81 ms
+TTM after dense: 1609.49
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  749.836 ms
+fused time: 750.93
+
+kernel execution time:  566.457 ms
+reference time: 567.141
+
+kernel execution time:  197.095 ms
+TTM1: 197.696
+
+kernel execution time:  503.839 ms
+TTM2: 504.407
+
+kernel execution time:  0.05955 ms
+dense: 0.911152
+
+kernel execution time:  382.185 ms
+TTM after dense: 382.591
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 32, vals: 760839136
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  226.079 ms
+fused time: 227.028
+
+kernel execution time:  8763.95 ms
+reference time: 8765.15
+
+kernel execution time:  605.807 ms
+TTM1: 606.7
+
+kernel execution time:  5.27951 ms
+TTM2: 5.94312
+
+kernel execution time:  1075.36 ms
+dense: 1075.63
+
+kernel execution time:  1244.1 ms
+TTM after dense: 1244.76
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550
+C1_dimension: 166, C2_dimension: 32, vals: 5312
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126164
+C1_dimension: 2480308, C2_dimension: 32, vals: 79369856
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 32, vals: 51430112
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  14896.3 ms
+fused time: 14897.5
+
+kernel execution time:  94041.2 ms
+reference time: 94042.2
+
+kernel execution time:  3578.66 ms
+TTM1: 3579.61
+
+kernel execution time:  18883.5 ms
+TTM2: 18884.5
+
+kernel execution time:  2197.87 ms
+dense: 2198.28
+
+kernel execution time:  7686.45 ms
+TTM after dense: 7687.46
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 28818, C2_dimension: 32, vals: 922176
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  1072.87 ms
+fused time: 1073.82
+
+kernel execution time:  71021.8 ms
+reference time: 71022.9
+
+kernel execution time:  1996.05 ms
+TTM1: 1996.58
+
+kernel execution time:  231.665 ms
+TTM2: 232.177
+
+kernel execution time:  40.2369 ms
+dense: 40.6304
+
+kernel execution time:  4971.71 ms
+TTM after dense: 4972.6
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 25495389, C2_dimension: 32, vals: 815852448
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  29074.9 ms
+fused time: 29076
+
+kernel execution time:  148072 ms
+reference time: 148073
+
+kernel execution time:  13571.2 ms
+TTM1: 13572.2
+
+kernel execution time:  11698.5 ms
+TTM2: 11699.5
+
+kernel execution time:  34736.9 ms
+dense: 34737.7
+
+kernel execution time:  22283.6 ms
+TTM after dense: 22284.5
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  12513.9 ms
+fused time: 12515
+
+kernel execution time:  23535.3 ms
+reference time: 23536.3
+
+kernel execution time:  1334.33 ms
+TTM1: 1334.87
+
+kernel execution time:  17560.3 ms
+TTM2: 17561.3
+
+kernel execution time:  0.019291 ms
+dense: 0.885501
+
+kernel execution time:  3394.59 ms
+TTM after dense: 3395.34
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 32, vals: 760839136
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  1517.3 ms
+fused time: 1518.25
+
+kernel execution time:  45929.9 ms
+reference time: 45930.9
+
+kernel execution time:  2929.29 ms
+TTM1: 2929.82
+
+kernel execution time:  53.4282 ms
+TTM2: 53.9625
+
+kernel execution time:  32592.7 ms
+dense: 32593.5
+
+kernel execution time:  6277.64 ms
+TTM after dense: 6278.68
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.852321 ms
+fused time: 1.60101
+
+kernel execution time:  0.662379 ms
+reference time: 1.32203
+
+kernel execution time:  0.511427 ms
+TTM1: 1.03372
+
+kernel execution time:  0.667709 ms
+TTM2: 1.20996
+
+kernel execution time:  0.118331 ms
+dense: 0.542977
+
+kernel execution time:  0.483187 ms
+TTM after dense: 0.900252
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125
+C1_dimension: 25, C2_dimension: 32, vals: 800
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  0.671739 ms
+fused time: 4.90845
+
+kernel execution time:  0.711039 ms
+reference time: 5.04208
+
+kernel execution time:  0.486907 ms
+reference new time: 4.37081
+
+kernel execution time:  0.482627 ms
+TTM1: 3.67761
+
+kernel execution time:  0.589078 ms
+TTM2: 4.27397
+
+kernel execution time:  0.095461 ms
+dense: 0.492616
+
+kernel execution time:  0.530937 ms
+TTM after dense: 1.0284
+
+ttm-ttm execution
+
+-----------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 32, vals: 51430112
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  881.367 ms
+fused time: 886.111
+
+reference impl time 
+
+kernel execution time:  2050.43 ms
+reference time: 2051.08
+
+kernel execution time:  2002.9 ms
+reference new time: 2003.54
+
+kernel execution time:  260.701 ms
+TTM1: 261.277
+
+kernel execution time:  539.892 ms
+TTM2: 540.489
+
+kernel execution time:  69.5675 ms
+dense: 70.0315
+
+kernel execution time:  531.744 ms
+TTM after dense: 532.375
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 28818, C2_dimension: 32, vals: 922176
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  46.1273 ms
+fused time: 50.9231
+
+reference impl time 
+
+kernel execution time:  2363.18 ms
+reference time: 2364.02
+
+kernel execution time:  2340.56 ms
+reference new time: 2341.2
+
+kernel execution time:  82.5312 ms
+TTM1: 83.1034
+
+kernel execution time:  8.62143 ms
+TTM2: 9.16734
+
+kernel execution time:  1.20538 ms
+dense: 1.48454
+
+kernel execution time:  181.488 ms
+TTM after dense: 181.827
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 32, vals: 51430112
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  874.724 ms
+fused time: 878.246
+
+reference impl time 
+
+kernel execution time:  2042.51 ms
+reference time: 2043.27
+
+kernel execution time:  46819.7 ms
+reference new time: 46820.8
+
+schedule 1
+
+kernel execution time:  260.841 ms
+TTM1: 261.378
+
+kernel execution time:  539.264 ms
+TTM2: 539.834
+
+schedule 2
+
+kernel execution time:  69.2965 ms
+dense: 69.7197
+
+kernel execution time:  532.774 ms
+TTM after dense: 535.64
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 28818, C2_dimension: 32, vals: 922176
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  51.3316 ms
+fused time: 55.9685
+
+reference impl time 
+
+kernel execution time:  2363.6 ms
+reference time: 2364.38
+
+kernel execution time:  31523.9 ms
+reference new time: 31525
+
+schedule 1
+
+kernel execution time:  84.4692 ms
+TTM1: 84.9774
+
+kernel execution time:  7.9451 ms
+TTM2: 8.49167
+
+schedule 2
+
+kernel execution time:  1.17918 ms
+dense: 1.49638
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310
+C1_dimension: 1607191, C2_dimension: 32, vals: 51430112
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  877.727 ms
+fused time: 881.892
+
+reference impl time 
+
+kernel execution time:  1998.47 ms
+reference time: 1999.14
+
+kernel execution time:  1818.14 ms
+reference new time: 1818.77
+
+schedule 1
+
+kernel execution time:  261.202 ms
+TTM1: 261.759
+
+kernel execution time:  539.615 ms
+TTM2: 540.183
+
+schedule 2
+
+kernel execution time:  69.7746 ms
+dense: 70.1943
+
+kernel execution time:  532.374 ms
+TTM after dense: 533.008
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419
+C1_dimension: 28818, C2_dimension: 32, vals: 922176
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  42.811 ms
+fused time: 47.6618
+
+reference impl time 
+
+kernel execution time:  2267.84 ms
+reference time: 2268.63
+
+kernel execution time:  1379.49 ms
+reference new time: 1380.15
+
+schedule 1
+
+kernel execution time:  81.6849 ms
+TTM1: 82.4365
+
+kernel execution time:  9.74645 ms
+TTM2: 10.2848
+
+schedule 2
+
+kernel execution time:  1.47367 ms
+dense: 1.78443
+
+kernel execution time:  208.263 ms
+TTM after dense: 210.169
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 25495389, C2_dimension: 32, vals: 815852448
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  1299.91 ms
+fused time: 1303.65
+
+reference impl time 
+
+kernel execution time:  3494.78 ms
+reference time: 3497.66
+
+kernel execution time:  2383.79 ms
+reference new time: 2384.52
+
+schedule 1
+
+kernel execution time:  774.869 ms
+TTM1: 775.571
+
+kernel execution time:  1488.64 ms
+TTM2: 1489.78
+
+schedule 2
+
+kernel execution time:  1121.66 ms
+dense: 1122.11
+
+kernel execution time:  1581.94 ms
+TTM after dense: 1582.61
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  746.344 ms
+fused time: 749.212
+
+reference impl time 
+
+kernel execution time:  548.763 ms
+reference time: 549.493
+
+kernel execution time:  737.768 ms
+reference new time: 738.436
+
+schedule 1
+
+kernel execution time:  195.639 ms
+TTM1: 196.286
+
+kernel execution time:  493.569 ms
+TTM2: 494.15
+
+schedule 2
+
+kernel execution time:  0.052551 ms
+dense: 0.648739
+
+kernel execution time:  374.407 ms
+TTM after dense: 376.248
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 32, vals: 760839136
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  221.905 ms
+fused time: 222.964
+
+reference impl time 
+
+kernel execution time:  8826.57 ms
+reference time: 8827.82
+
+kernel execution time:  1435.28 ms
+reference new time: 1437.65
+
+schedule 1
+
+kernel execution time:  574.934 ms
+TTM1: 576.159
+
+kernel execution time:  4.42254 ms
+TTM2: 5.12181
+
+schedule 2
+
+kernel execution time:  1041.05 ms
+dense: 1041.36
+
+kernel execution time:  1247.06 ms
+TTM after dense: 1247.76
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 25495389, C2_dimension: 32, vals: 815852448
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  1312.2 ms
+fused time: 1315.79
+
+reference impl time 
+
+kernel execution time:  3512.84 ms
+reference time: 3514.54
+
+kernel execution time:  2381.97 ms
+reference new time: 2382.6
+
+schedule 1
+
+kernel execution time:  779.205 ms
+TTM1: 779.794
+
+kernel execution time:  366.382 ms
+TTM2: 367.081
+
+schedule 2
+
+kernel execution time:  1127.72 ms
+dense: 1128.25
+
+kernel execution time:  1579.85 ms
+TTM after dense: 1580.5
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552
+C1_dimension: 25495389, C2_dimension: 32, vals: 815852448
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  1326.91 ms
+fused time: 1331.56
+
+reference impl time 
+
+kernel execution time:  3535.03 ms
+reference time: 3536.38
+
+kernel execution time:  2387.24 ms
+reference new time: 2387.99
+
+schedule 1
+
+kernel execution time:  780.495 ms
+TTM1: 781.09
+
+kernel execution time:  369.704 ms
+TTM2: 370.292
+
+schedule 2
+
+kernel execution time:  1119.23 ms
+dense: 1119.7
+
+kernel execution time:  1579.78 ms
+TTM after dense: 1580.54
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 64, vals: 128
+D1_dimension: 64, D2_dimension: 128, vals: 8192
+
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  746.399 ms
+fused time: 747.454
+
+reference impl time 
+
+kernel execution time:  549.908 ms
+reference time: 550.683
+
+kernel execution time:  731.657 ms
+reference new time: 732.322
+
+schedule 1
+
+kernel execution time:  194.605 ms
+TTM1: 195.252
+
+kernel execution time:  491.591 ms
+TTM2: 492.148
+
+schedule 2
+
+kernel execution time:  0.049841 ms
+dense: 0.820181
+
+kernel execution time:  372.064 ms
+TTM after dense: 372.449
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  746.043 ms
+fused time: 747.23
+
+reference impl time 
+
+kernel execution time:  561.015 ms
+reference time: 561.669
+
+kernel execution time:  737.535 ms
+reference new time: 738.158
+
+schedule 1
+
+kernel execution time:  194.638 ms
+TTM1: 195.169
+
+kernel execution time:  495.355 ms
+TTM2: 495.903
+
+schedule 2
+
+kernel execution time:  0.148292 ms
+dense: 0.534998
+
+kernel execution time:  374.231 ms
+TTM after dense: 374.667
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  745.881 ms
+fused time: 746.992
+
+reference impl time 
+
+kernel execution time:  551.705 ms
+reference time: 552.359
+
+kernel execution time:  736.019 ms
+reference new time: 736.611
+
+schedule 1
+
+kernel execution time:  194.777 ms
+TTM1: 195.33
+
+kernel execution time:  491.151 ms
+TTM2: 491.732
+
+schedule 2
+
+kernel execution time:  0.144522 ms
+dense: 0.528597
+
+kernel execution time:  374.363 ms
+TTM after dense: 374.752
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+
+ttm-ttm execution
+
+----------------------------------------- Europa
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 32, vals: 760839136
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  2299.49 ms
+fused time: 2301.59
+
+reference impl time 
+
+kernel execution time:  78844.2 ms
+reference time: 78846.6
+
+kernel execution time:  34427 ms
+reference new time: 34429.3
+
+schedule 1
+
+kernel execution time:  6968.36 ms
+TTM1: 6970.4
+
+kernel execution time:  121.497 ms
+TTM2: 123.127
+
+schedule 2
+
+kernel execution time:  64026.1 ms
+dense: 64028
+
+kernel execution time:  15531.3 ms
+TTM after dense: 15533.4
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  40017.6 ms
+fused time: 40019.4
+
+reference impl time 
+
+kernel execution time:  50710.4 ms
+reference time: 50712.8
+
+kernel execution time:  37978.8 ms
+reference new time: 37980.6
+
+schedule 1
+
+kernel execution time:  3848.85 ms
+TTM1: 3850.48
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  40277.5 ms
+fused time: 40279.9
+
+reference impl time 
+
+kernel execution time:  50449.4 ms
+reference time: 50452
+
+kernel execution time:  37881.2 ms
+reference new time: 37883.4
+
+schedule 1
+
+kernel execution time:  3987.96 ms
+TTM1: 3990.09
+
+kernel execution time:  40935.3 ms
+TTM2: 40937.4
+
+schedule 2
+
+kernel execution time:  0.098195 ms
+dense: 1.2874
+
+kernel execution time:  12037.9 ms
+TTM after dense: 12039.5
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  36918.5 ms
+fused time: 36920.9
+
+reference impl time 
+
+kernel execution time:  47892.3 ms
+reference time: 47894.8
+
+kernel execution time:  37901.4 ms
+reference new time: 37903.5
+
+schedule 1
+
+kernel execution time:  3801.16 ms
+TTM1: 3803.21
+
+kernel execution time:  43488.6 ms
+TTM2: 43490.6
+
+schedule 2
+
+kernel execution time:  0.060642 ms
+dense: 1.08588
+
+kernel execution time:  15190.9 ms
+TTM after dense: 15192.3
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  35130 ms
+fused time: 35133.9
+
+reference impl time 
+
+kernel execution time:  47634.1 ms
+reference time: 47636.7
+
+kernel execution time:  37616.7 ms
+reference new time: 37618.9
+
+schedule 1
+
+kernel execution time:  2930.06 ms
+TTM1: 2931.74
+
+kernel execution time:  40710.7 ms
+TTM2: 40713
+
+schedule 2
+
+kernel execution time:  0.07506 ms
+dense: 1.28501
+
+kernel execution time:  12393.3 ms
+TTM after dense: 12394.9
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  12528.5 ms
+fused time: 12529.7
+
+reference impl time 
+
+kernel execution time:  23576.9 ms
+reference time: 23578.1
+
+kernel execution time:  16282.8 ms
+reference new time: 16283.8
+
+schedule 1
+
+kernel execution time:  1332.64 ms
+TTM1: 1333.18
+
+kernel execution time:  17503.1 ms
+TTM2: 17504.2
+
+schedule 2
+
+kernel execution time:  0.025131 ms
+dense: 0.438566
+
+kernel execution time:  3369.58 ms
+TTM after dense: 3370.48
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  12698.5 ms
+fused time: 12699.7
+
+reference impl time 
+
+kernel execution time:  23669.6 ms
+reference time: 23670.8
+
+kernel execution time:  16390.1 ms
+reference new time: 16391.1
+
+schedule 1
+
+kernel execution time:  1343.9 ms
+TTM1: 1344.42
+
+kernel execution time:  17641.6 ms
+TTM2: 17642.6
+
+schedule 2
+
+kernel execution time:  0.02212 ms
+dense: 0.397656
+
+kernel execution time:  3411.14 ms
+TTM after dense: 3412.04
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  844.466 ms
+fused time: 845.618
+
+reference impl time 
+
+kernel execution time:  814.964 ms
+reference time: 815.676
+
+kernel execution time:  918.472 ms
+reference new time: 919.142
+
+schedule 1
+
+kernel execution time:  200.521 ms
+TTM1: 201.112
+
+kernel execution time:  678.038 ms
+TTM2: 678.647
+
+schedule 2
+
+kernel execution time:  0.07066 ms
+dense: 0.524547
+
+kernel execution time:  394.81 ms
+TTM after dense: 395.266
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  2900.7 ms
+fused time: 2903.25
+
+reference impl time 
+
+kernel execution time:  2746.32 ms
+reference time: 2748.86
+
+kernel execution time:  2812.87 ms
+reference new time: 2815.19
+
+schedule 1
+
+kernel execution time:  2429.09 ms
+TTM1: 2431.17
+
+kernel execution time:  2451.88 ms
+TTM2: 2454.06
+
+schedule 2
+
+kernel execution time:  1.43373 ms
+dense: 2.85191
+
+kernel execution time:  1651.7 ms
+TTM after dense: 1652.91
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  3539.09 ms
+fused time: 3541.54
+
+reference impl time 
+
+kernel execution time:  2968.95 ms
+reference time: 2972.61
+
+kernel execution time:  3354.98 ms
+reference new time: 3357.43
+
+schedule 1
+
+kernel execution time:  2697.68 ms
+TTM1: 2699.71
+
+kernel execution time:  2804.11 ms
+TTM2: 2806.99
+
+schedule 2
+
+kernel execution time:  6.38211 ms
+dense: 8.06652
+
+kernel execution time:  1822.02 ms
+TTM after dense: 1823.06
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  3608.92 ms
+fused time: 3611.17
+
+reference impl time 
+
+kernel execution time:  3026.81 ms
+reference time: 3029.09
+
+kernel execution time:  3189.34 ms
+reference new time: 3192.69
+
+schedule 1
+
+kernel execution time:  2659.86 ms
+TTM1: 2661.48
+
+kernel execution time:  2749.47 ms
+TTM2: 2750.96
+
+schedule 2
+
+kernel execution time:  5.54375 ms
+dense: 6.71077
+
+kernel execution time:  1799.52 ms
+TTM after dense: 1800.4
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  3553.08 ms
+fused time: 3555.93
+
+reference impl time 
+
+kernel execution time:  2962.14 ms
+reference time: 2964.25
+
+kernel execution time:  3306.95 ms
+reference new time: 3309.38
+
+schedule 1
+
+kernel execution time:  2723.22 ms
+TTM1: 2724.83
+
+kernel execution time:  2581.33 ms
+TTM2: 2583.4
+
+schedule 2
+
+kernel execution time:  0.772961 ms
+dense: 2.02166
+
+kernel execution time:  1731.42 ms
+TTM after dense: 1732.48
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  3577.13 ms
+fused time: 3580.97
+
+reference impl time 
+
+kernel execution time:  3010.77 ms
+reference time: 3013.04
+
+kernel execution time:  3364.45 ms
+reference new time: 3366.58
+
+schedule 1
+
+kernel execution time:  2740.85 ms
+TTM1: 2742.84
+
+kernel execution time:  2788.11 ms
+TTM2: 2790.79
+
+schedule 2
+
+kernel execution time:  2.57712 ms
+dense: 4.23057
+
+kernel execution time:  1934.52 ms
+TTM after dense: 1935.9
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  3424.23 ms
+fused time: 3426.81
+
+reference impl time 
+
+kernel execution time:  3023.35 ms
+reference time: 3025.97
+
+kernel execution time:  3086.35 ms
+reference new time: 3089.41
+
+schedule 1
+
+kernel execution time:  2913.43 ms
+TTM1: 2915.13
+
+kernel execution time:  2623.7 ms
+TTM2: 2625.65
+
+schedule 2
+
+kernel execution time:  5.28416 ms
+dense: 6.61329
+
+kernel execution time:  1971.48 ms
+TTM after dense: 1972.7
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  3693.12 ms
+fused time: 3695.79
+
+reference impl time 
+
+kernel execution time:  2900.73 ms
+reference time: 2902.96
+
+kernel execution time:  3138.83 ms
+reference new time: 3141.16
+
+schedule 1
+
+kernel execution time:  2673.94 ms
+TTM1: 2675.57
+
+kernel execution time:  2703.37 ms
+TTM2: 2705.31
+
+schedule 2
+
+kernel execution time:  5.31585 ms
+dense: 7.12051
+
+kernel execution time:  1724.31 ms
+TTM after dense: 1726.36
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  3572.56 ms
+fused time: 3575.03
+
+reference impl time 
+
+kernel execution time:  2939.46 ms
+reference time: 2941.84
+
+kernel execution time:  3182.38 ms
+reference new time: 3184.81
+
+schedule 1
+
+kernel execution time:  2731.33 ms
+TTM1: 2733.2
+
+kernel execution time:  2782.07 ms
+TTM2: 2784.32
+
+schedule 2
+
+kernel execution time:  5.52055 ms
+dense: 7.06503
+
+kernel execution time:  1729.87 ms
+TTM after dense: 1730.87
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 23776223, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 32, vals: 760839136
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  1404.79 ms
+fused time: 1406.83
+
+reference impl time 
+
+kernel execution time:  28471.3 ms
+reference time: 28474.9
+
+kernel execution time:  5689.54 ms
+reference new time: 5692.1
+
+schedule 1
+
+kernel execution time:  3526.34 ms
+TTM1: 3528.66
+
+kernel execution time:  21.5542 ms
+TTM2: 23.6182
+
+schedule 2
+
+kernel execution time:  6069.99 ms
+dense: 6071.91
+
+kernel execution time:  6163.35 ms
+TTM after dense: 6165.73
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 23776223, vals: 28421307
+C1_dimension: 23776223, C2_dimension: 32, vals: 760839136
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  1390.55 ms
+fused time: 1392.48
+
+reference impl time 
+
+kernel execution time:  30840.6 ms
+reference time: 30843.4
+
+kernel execution time:  5638.37 ms
+reference new time: 5641.01
+
+schedule 1
+
+kernel execution time:  3642.19 ms
+TTM1: 3644.13
+
+kernel execution time:  24.3447 ms
+TTM2: 25.6449
+
+schedule 2
+
+kernel execution time:  6027.41 ms
+dense: 6029.82
+
+kernel execution time:  6494.21 ms
+TTM after dense: 6497.33
+
+ttm-ttm execution
+
+-----------------------------------------
+
+file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+----------------------------------------------------------------
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns
+A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854
+C1_dimension: 2, C2_dimension: 32, vals: 64
+D1_dimension: 32, D2_dimension: 64, vals: 2048
+
+
+kernel execution time:  3727.32 ms
+fused time: 3729.78
+
+reference impl time 
+
+kernel execution time:  2996.48 ms
+reference time: 2999.42
+
+kernel execution time:  3216.53 ms
+reference new time: 3218.79
+
+schedule 1
+
+kernel execution time:  2902.94 ms
+TTM1: 2904.86
+
+kernel execution time:  2722.22 ms
+TTM2: 2724.59
+
+schedule 2
+
+kernel execution time:  5.8157 ms
+dense: 7.48208
+
+kernel execution time:  1725.24 ms
+TTM after dense: 1726.69
diff --git a/test/test.cpp b/test/test.cpp
index a49f10ff7..851493b7f 100644
--- a/test/test.cpp
+++ b/test/test.cpp
@@ -38,6 +38,20 @@ void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) {
   ASSERT_TRUE(equals(expected, actual));
 }
 
+// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual) {
+//   std::cout << "order: " << expected.getOrder();
+//   std::vector<int> modes{};
+//   for (int mode = 0; mode < expected.getOrder(); mode++) {
+//     if (expected.getDimension(mode) != actual.getDimension(mode)) {
+//       ASSERT_TRUE(false);
+//     }
+
+//     for (int i=0; i<expected.getDimension(mode); i++) {
+//       std::cout << expected(i) << " " << actual(i) << std::endl;
+//     }
+//   }
+// }
+
 std::string testDirectory() {
   return TO_STRING(TACO_TEST_DIR);
 }
diff --git a/test/test.h b/test/test.h
index 3302bf81f..1c8f5172e 100644
--- a/test/test.h
+++ b/test/test.h
@@ -61,6 +61,7 @@ void ASSERT_VECTOR_EQ(std::vector<T> expected,
 
 void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual);
 void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual);
+// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual);
 
 template <typename T>
 void ASSERT_COMPONENTS_EQUALS(vector<vector<vector<int>>> expectedIndices,
diff --git a/test/tests-indexstmt.cpp b/test/tests-indexstmt.cpp
index e2a972430..123bea3e6 100644
--- a/test/tests-indexstmt.cpp
+++ b/test/tests-indexstmt.cpp
@@ -1,10 +1,13 @@
+#include "taco/index_notation/kernel.h"
+#include "taco/type.h"
 #include "test.h"
 #include "test_tensors.h"
 #include "taco/tensor.h"
 #include "taco/index_notation/index_notation.h"
+#include "taco/index_notation/transformations.h"
 
 using namespace taco;
-const IndexVar i("i"), j("j"), k("k");
+const IndexVar i("i"), j("j"), k("k"), l("l"), m("m");
 
 TEST(indexstmt, assignment) {
   Type t(type<double>(), {3});
@@ -84,4 +87,193 @@ TEST(indexstmt, spmm) {
 }
 
 
+TEST(indexstmt, sddmm) {
+  // Operands: three 3x3 double matrices and a dense length-3 workspace vector w.
+  Type matrixType(type<double>(), {3,3});
+  Type workspaceType(type<double>(), {3});
+
+  TensorVar A("A", matrixType, {Sparse, Dense});
+  TensorVar B("B", matrixType, {Sparse, Dense});
+  TensorVar C("C", matrixType, {Dense, Dense});
+  TensorVar w("w", workspaceType, Dense);
+
+  // Concrete index notation. where(...) takes the consumer statement first
+  // and the producer statement second.
+  IndexStmt producer = forall(j, w(j) += B(i,k)*C(k,j));
+  IndexStmt consumer = forall(j, A(i,j) = w(j));
+  IndexStmt stmt = forall(i, forall(k, where(consumer, producer)));
+
+  // Print the statement as written, then again after its loops have been
+  // passed through reorderLoopsTopologically.
+  std::cout << stmt << std::endl;
+  stmt = reorderLoopsTopologically(stmt);
+  std::cout << "topologically reordered loops statement: " << stmt << std::endl;
+
+  // NOTE(review): compute() is called without passing any tensor arguments;
+  // confirm this is what the test intends.
+  Kernel kernel = compile(stmt);
+  kernel.compute();
+}
+
+TEST(indexstmt, sddmmPlusSpmm) {
+  // Expression under test: Y(i,l) = B(i,j) * C(i,k) * D(j,k) * E(j,l),
+  // written below with the loops in the order i, j, k, l.
+  // NOTE(review): an earlier note here recorded a topologically reordered loop
+  // order of i, k, j, l, but for a D(k,j)/F(j,l) spelling of the expression;
+  // compare against the "after topological sort" output printed by this test.
+
+  Type matrixType(type<double>(), {3,3});
+  TensorVar Y("Y", matrixType, {Dense, Dense});
+  TensorVar B("B", matrixType, {Dense, Sparse});
+  TensorVar C("C", matrixType, {Dense, Dense});
+  TensorVar D("D", matrixType, {Dense, Dense});
+  TensorVar E("E", matrixType, {Dense, Dense});
+
+  // Temporary accumulated by the where-based form; built from a default Type.
+  TensorVar A("A", Type());
+
+  // Form 1: one statement nested directly under all four loops.
+  IndexStmt flatStmt =
+      forall(i,
+        forall(j,
+          forall(k,
+            forall(l, Y(i,l) += B(i,j) * C(i,k) * D(j,k) * E(j,l))
+          )
+        )
+      );
+
+  std::cout << "before topological sort" << flatStmt << std::endl;
+  flatStmt = reorderLoopsTopologically(flatStmt);
+  std::cout << "after topological sort" << flatStmt << std::endl;
+
+  Kernel flatKernel = compile(flatStmt);
+
+  // Form 2: the k loop accumulates into A (producer) and the l loop reads A
+  // (consumer). where(...) takes the consumer first, then the producer.
+  IndexStmt producer = forall(k, A += B(i,j)*C(i,k)*D(j,k));
+  IndexStmt consumer = forall(l, Y(i,l) += A * E(j,l));
+  IndexStmt workspaceStmt =
+      forall(i,
+        forall(j,
+          where(consumer, producer)
+        )
+      );
+  Kernel workspaceKernel = compile(workspaceStmt);
+}
+
+
+
+TEST(indexstmt, mttkrpPlusSpmm) {
+  // taco command line for the same expression and formats:
+  // ./bin/taco "A(i,m)=B(i,k,l)*C(k,j)*D(l,j)*E(j,m)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -f=E:dd:0,1
+
+  // Extent of each index variable.
+  unsigned long idim = 11;
+  unsigned long kdim = 5;
+  unsigned long ldim = 7;
+  unsigned long jdim = 8;
+  unsigned long mdim = 6;
+
+  Type typeA(type<double>(), {idim, mdim});
+  Type typeB(type<double>(), {idim, kdim, ldim});
+  Type typeC(type<double>(), {kdim, jdim});
+  Type typeD(type<double>(), {ldim, jdim});
+  Type typeE(type<double>(), {jdim, mdim});
+  Type typeWs(type<double>(), {jdim});
+
+  TensorVar A("A", typeA, {Dense, Dense});
+  TensorVar B("B", typeB, {Sparse, Sparse, Sparse});
+  TensorVar C("C", typeC, {Dense, Dense});
+  TensorVar D("D", typeD, {Dense, Dense});
+  TensorVar E("E", typeE, {Dense, Dense});
+
+  // Workspace vector indexed by j; no format is passed for it.
+  TensorVar ws("ws", typeWs);
+
+  // Form 1: a single statement nested under all five loops.
+  IndexStmt flatStmt =
+      forall(i,
+        forall(k,
+          forall(l,
+            forall(j,
+              forall(m, A(i,m) += B(i,k,l) * C(k,j) * D(l,j) * E(j,m))
+            )
+          )
+        )
+      );
+
+  std::cout << "before topological sort" << flatStmt << std::endl;
+  flatStmt = reorderLoopsTopologically(flatStmt);
+  std::cout << "after topological sort" << flatStmt << std::endl;
+
+  Kernel flatKernel = compile(flatStmt);
+
+  // Form 2: under each i the producer accumulates into ws(j) and the consumer
+  // reads ws(j); where(...) takes the consumer first, then the producer.
+  IndexStmt producer =
+      forall(k,
+        forall(l,
+          forall(j, ws(j) += B(i,k,l) * C(k,j) * D(l,j))
+        )
+      );
+  IndexStmt consumer =
+      forall(j,
+        forall(m, A(i,m) += ws(j) * E(j,m))
+      );
+  IndexStmt workspaceStmt = forall(i, where(consumer, producer));
+  Kernel workspaceKernel = compile(workspaceStmt);
+}
+
+// ./bin/taco "y(i)=A(i,j)*B(j,k)*v(k)" -f=y:d:0 -f=A:dd:0,1 -f=B:dd:0,1 -f=v:d:0
+TEST(indexstmt, mmPlusSpmv) {
+  // Builds y(i) = A(i,j) * B(j,k) * v(k) over dense operands in two forms:
+  // a single nested statement, and a where-based form that routes B*v
+  // through the workspace vector ws(j). Both forms are only compiled.
+  long unsigned int idim = 11, jdim = 8, kdim = 5;
+
+  Type ytype(type<double>(), {idim});
+  Type atype(type<double>(), {idim, jdim});
+  Type btype(type<double>(), {jdim, kdim});
+  Type vtype(type<double>(), {kdim});
+
+  TensorVar y("y", ytype, {Dense});
+  TensorVar A("A", atype, {Dense, Dense});
+  TensorVar B("B", btype, {Dense, Dense});
+  TensorVar v("v", vtype, {Dense});
+
+  TensorVar ws("ws", Type(type<double>(), {jdim}) );
+
+  IndexStmt fused1 =
+  forall(i,
+    forall(j,
+      forall(k,
+        y(i) += A(i,j) * B(j,k) * v(k)  // no forall(m): m does not index any tensor in this expression
+      )
+    )
+  );
+
+  std::cout << "before topological sort" << fused1 << std::endl;
+  fused1 = reorderLoopsTopologically(fused1);
+  std::cout << "after topological sort" << fused1 << std::endl;
+
+  Kernel kernel = compile(fused1);
+
+  IndexStmt fused2 =
+  where(
+    forall(i,
+      forall(j, 
+        y(i) += A(i,j) * ws(j)  // consumer: reads the workspace
+      )
+    )
+    ,
+    forall(j,
+      forall(k,
+        ws(j) += B(j,k) * v(k)  // producer: fills the workspace
+      )
+    )
+  );
+
+  Kernel kernel2 = compile(fused2);
+}
+
 
diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp
index 52bd74ab4..29a7e512e 100644
--- a/test/tests-scheduling-eval.cpp
+++ b/test/tests-scheduling-eval.cpp
@@ -1,42 +1,8 @@
-#include <taco/index_notation/transformations.h>
-#include <codegen/codegen_c.h>
-#include <codegen/codegen_cuda.h>
-#include <fstream>
-#include "test.h"
-#include "test_tensors.h"
-#include "taco/tensor.h"
-#include "taco/index_notation/index_notation.h"
-#include "taco/index_notation/transformations.h"
-#include "codegen/codegen.h"
-#include "taco/lower/lower.h"
-
-using namespace taco;
+#include "util.h"
+
 const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n");
 int WARP_SIZE = 32;
 
-void printToCout(IndexStmt stmt) {
-  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen);
-  ir::Stmt compute = lower(stmt, "compute", false, true);
-  codegen->compile(compute, true);
-}
-
-void printToFile(string filename, IndexStmt stmt) {
-  stringstream source;
-
-  string file_path = "eval_generated/";
-  mkdir(file_path.c_str(), 0777);
-
-  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen);
-  ir::Stmt compute = lower(stmt, "compute",  false, true);
-  codegen->compile(compute, true);
-
-  ofstream source_file;
-  string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c";
-  source_file.open(file_path + filename + file_ending);
-  source_file << source.str();
-  source_file.close();
-}
-
 IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) {
   IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1");
   return stmt.split(i, i0, i1, CHUNK_SIZE)
@@ -44,6 +10,14 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) {
           .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+// SpMV schedule (ISPC variant): splits i into i0/i1 by CHUNK_SIZE, orders the loops i0, i1, j, and parallelizes i0 as CPUSimd with NoRaces.
+IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) {
+  IndexVar i0("i0"), i1("i1");
+  return stmt.split(i, i0, i1, CHUNK_SIZE)
+          .reorder({i0, i1, j})
+          .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
   return stmt.split(i, i0, i1, CHUNK_SIZE)
@@ -54,6 +28,80 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, i
           .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces);
 }
 
+// SpMM schedule (ISPC variant 1): splits i by CHUNK_SIZE, applies pos(j -> jpos) on A(i,j), splits jpos by UNROLL_FACTOR, and parallelizes k as CPUSimd (IgnoreRaces).
+IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
+  return stmt.split(i, i0, i1, CHUNK_SIZE).pos(j, jpos, A(i,j))
+          .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+          .reorder({i0, i1, jpos0, k, jpos1})
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+          .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// SpMM schedule (ISPC+OMP variant 1): same split/pos/split/reorder as scheduleSpMMISPC1, then parallelizes i0 as CPUSpmd (NoRaces) and k as CPUSimd (IgnoreRaces).
+IndexStmt scheduleSpMMISPCOMP1(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
+  return stmt.split(i, i0, i1, CHUNK_SIZE).pos(j, jpos, A(i,j))
+          .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+          .reorder({i0, i1, jpos0, k, jpos1})
+          .parallelize(i0, ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces)
+          .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// SpMM schedule (ISPC variant 1_2): splits i by CHUNK_SIZE, applies pos(j -> jpos) on A(i,j), splits jpos by UNROLL_FACTOR, and parallelizes the outer chunk loop i0 as CPUSimd (IgnoreRaces).
+IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
+  return stmt.split(i, i0, i1, CHUNK_SIZE).pos(j, jpos, A(i,j))
+          .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+          .reorder({i0, i1, jpos0, k, jpos1})
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+          .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// SpMM schedule (ISPC variant 1_3): splits i by CHUNK_SIZE, applies pos(j -> jpos) on A(i,j), splits jpos by UNROLL_FACTOR, and parallelizes the inner chunk loop i1 as CPUSimd (IgnoreRaces).
+IndexStmt scheduleSpMMISPC1_3(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
+  return stmt.split(i, i0, i1, CHUNK_SIZE).pos(j, jpos, A(i,j))
+          .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+          .reorder({i0, i1, jpos0, k, jpos1})
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+          .parallelize(i1, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// SpMM schedule (ISPC variant 2): no loop restructuring; only parallelizes k as CPUSimd (IgnoreRaces). A, CHUNK_SIZE and UNROLL_FACTOR are not used.
+IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt
+          .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// SpMM schedule (ISPC variant 2_2): no loop restructuring; only parallelizes i as CPUSimd (IgnoreRaces). A, CHUNK_SIZE and UNROLL_FACTOR are not used.
+IndexStmt scheduleSpMMISPC2_2(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt
+          .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// SpMM schedule (ISPC variant 3): reorders j and k, then parallelizes k as CPUSimd (IgnoreRaces). The split/pos steps are disabled, so A, CHUNK_SIZE and UNROLL_FACTOR are not used.
+IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt
+          // .split(i, i0, i1, CHUNK_SIZE)
+          // .pos(j, jpos, A(i,j))
+          // .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+          .reorder({j, k})
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+          .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// SpMM schedule (ISPC variant 3_2): reorders j and k, then parallelizes i as CPUSimd (IgnoreRaces). The split/pos steps are disabled, so A, CHUNK_SIZE and UNROLL_FACTOR are not used.
+IndexStmt scheduleSpMMISPC3_2(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt
+          // .split(i, i0, i1, CHUNK_SIZE)
+          // .pos(j, jpos, A(i,j))
+          // .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+          .reorder({j, k})
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+          .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
 IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) {
   Assignment assign = stmt.as<Forall>().getStmt().as<Forall>().getStmt()
                           .as<Forall>().getStmt().as<Assignment>();
@@ -107,6 +155,68 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16,
           .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
 }
 
+// SDDMM (CSR, CPU) schedule: currently returns stmt unchanged; the transformations below are disabled, so B, CHUNK_SIZE and UNROLL_FACTOR are not used.
+IndexStmt scheduleSDDMMCSRCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt;
+  // return stmt.split(i, i0, i1, CHUNK_SIZE)
+  //         .pos(k, kpos, B(i,k))
+  //         .split(kpos, kpos0, kpos1, UNROLL_FACTOR)
+  //         .reorder({i0, i1, kpos0, j, kpos1});
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+          // .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces);
+}
+
+// SDDMM CPU schedule over B(i,j): splits i by CHUNK_SIZE, splits the j positions of B by UNROLL_FACTOR, then parallelizes i0 (CPUThread, NoRaces) and jpos1 (CPUVector, ParallelReduction).
+IndexStmt scheduleSDDMM2CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar outerI("i0"), innerI("i1"), posJ("jpos"), posOuter("jpos0"), posInner("jpos1");
+  stmt = stmt.split(i, outerI, innerI, CHUNK_SIZE).pos(j, posJ, B(i,j));
+  stmt = stmt.split(posJ, posOuter, posInner, UNROLL_FACTOR)
+             .reorder({outerI, innerI, posOuter, k, posInner});
+  stmt = stmt.parallelize(outerI, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+  return stmt.parallelize(posInner, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+}
+
+// SDDMM schedule (ISPC variant) over B(i,k): splits i by CHUNK_SIZE, splits the k positions of B by UNROLL_FACTOR, and parallelizes kpos1 as CPUSimd (ParallelReduction); i0 is not parallelized.
+IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar outerI("i0"), innerI("i1"), posK("kpos"), posOuter("kpos0"), posInner("kpos1");
+  IndexStmt tiled = stmt.split(i, outerI, innerI, CHUNK_SIZE)
+                        .pos(k, posK, B(i,k))
+                        .split(posK, posOuter, posInner, UNROLL_FACTOR);
+  tiled = tiled.reorder({outerI, innerI, posOuter, j, posInner});
+  return tiled.parallelize(posInner, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction);
+}
+
+// SDDMM schedule (ISPC variant 2) over B(i,j): splits i by CHUNK_SIZE, splits the j positions of B by UNROLL_FACTOR, and parallelizes jpos1 as CPUSimd (ParallelReduction); i0 is not parallelized.
+IndexStmt scheduleSDDMM2ISPC(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar outerI("i0"), innerI("i1"), posJ("jpos"), posOuter("jpos0"), posInner("jpos1");
+  IndexStmt tiled = stmt.split(i, outerI, innerI, CHUNK_SIZE)
+                        .pos(j, posJ, B(i,j))
+                        .split(posJ, posOuter, posInner, UNROLL_FACTOR);
+  tiled = tiled.reorder({outerI, innerI, posOuter, k, posInner});
+  return tiled.parallelize(posInner, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction);
+}
+
+// SDDMM schedule (ISPC variant 1) over B(i,k): splits i by CHUNK_SIZE, splits the k positions of B by UNROLL_FACTOR, then parallelizes i0 (CPUThread, NoRaces) and kpos1 (CPUSimd, ParallelReduction).
+IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar outerI("i0"), innerI("i1"), posK("kpos"), posOuter("kpos0"), posInner("kpos1");
+  stmt = stmt.split(i, outerI, innerI, CHUNK_SIZE).pos(k, posK, B(i,k));
+  stmt = stmt.split(posK, posOuter, posInner, UNROLL_FACTOR)
+             .reorder({outerI, innerI, posOuter, j, posInner});
+  stmt = stmt.parallelize(outerI, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+  return stmt.parallelize(posInner, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction);
+}
+
+// SDDMM schedule (ISPC variant 2): currently returns stmt unchanged; the transformations below are disabled, so B, CHUNK_SIZE and UNROLL_FACTOR are not used.
+IndexStmt scheduleSDDMMISPC2(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt;
+          // .split(i, i0, i1, CHUNK_SIZE)
+          // .pos(k, kpos, B(i,k))
+          // .split(kpos, kpos0, kpos1, UNROLL_FACTOR)
+          // .reorder({i0, i1, kpos0, j, kpos1})
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+          // .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction);
+}
+
 IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16) {
   IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2");
   return stmt.fuse(i, j, f)
@@ -116,6 +226,16 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16) {
           .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+// TTV schedule (ISPC variant): fuses i and j into f, takes positions of f in B(i,j,k), splits them by CHUNK_SIZE and parallelizes the chunk loop. NOTE(review): uses ParallelUnit::CPUThread despite the ISPC name - confirm.
+IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16) {
+  IndexVar fused("f"), fusedPos("fpos"), outerChunk("chunk"), innerPos("fpos2");
+  IndexStmt chunked = stmt.fuse(i, j, fused)
+                          .pos(fused, fusedPos, B(i,j,k))
+                          .split(fusedPos, outerChunk, innerPos, CHUNK_SIZE);
+  chunked = chunked.reorder({outerChunk, innerPos, k});
+  return chunked.parallelize(outerChunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) {
   TensorVar result = stmt.as<Forall>().getStmt().as<Forall>().getStmt()
                          .as<Forall>().getStmt().as<Assignment>().getLhs()
@@ -125,6 +245,25 @@ IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) {
                           OutputRaceStrategy::NoRaces);
 }
 
+// TTV CSR schedule without parallelization: finds the result tensor of the assignment under three nested foralls and assembles it with AssembleStrategy::Insert.
+IndexStmt scheduleTTVCPUCSR_ST(IndexStmt stmt) {
+  Forall innermost = stmt.as<Forall>().getStmt().as<Forall>().getStmt().as<Forall>();
+  TensorVar output = innermost.getStmt().as<Assignment>().getLhs().getTensorVar();
+  return stmt.assemble(output, AssembleStrategy::Insert);
+}
+
+// TTV CSR schedule (ISPC variant): assembles the result tensor of the triply nested assignment with AssembleStrategy::Insert, then parallelizes i as CPUSimd (NoRaces).
+IndexStmt scheduleTTVISPCCSR(IndexStmt stmt) {
+  Forall innermost = stmt.as<Forall>().getStmt().as<Forall>().getStmt().as<Forall>();
+  TensorVar output = innermost.getStmt().as<Assignment>().getLhs().getTensorVar();
+  IndexStmt assembled = stmt.assemble(output, AssembleStrategy::Insert);
+  return assembled.parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces);
+}
+
+IndexStmt scheduleTTVISPCCSR2(IndexStmt stmt) {
+  return stmt;  // no scheduling transformations are applied; the statement is returned unchanged
+}
+
 IndexStmt scheduleTTMCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), kpos("kpos"), kpos1("kpos1"), kpos2("kpos2");
   return stmt.fuse(i, j, f)
@@ -149,12 +288,47 @@ IndexStmt scheduleMTTKRPCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16,
           .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleMTTKRPCPU_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i1("i1"), i2("i2");
+  IndexExpr precomputeExpr = stmt.as<Forall>().getStmt().as<Forall>().getStmt()
+                                 .as<Forall>().getStmt().as<Forall>().getStmt()
+                                 .as<Assignment>().getRhs().as<Mul>().getA();
+  TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense);
+  return stmt.split(i, i1, i2, CHUNK_SIZE)
+          .reorder({i1, i2, k, l, j})
+          .precompute(precomputeExpr, j, j, w);
+          // .parallelize(j, ParallelUnit::CPUVector, OutputRaceStrategy::Atomics); // gives error when lowering for IgnoreRaces, NoRaces and Atomics
+          // .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+}
+
+IndexStmt scheduleMTTKRPISPC(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i1("i1"), i2("i2");
+  IndexExpr precomputeExpr = stmt.as<Forall>().getStmt().as<Forall>().getStmt()
+                                 .as<Forall>().getStmt().as<Forall>().getStmt()
+                                 .as<Assignment>().getRhs().as<Mul>().getA();
+  TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense);
+  return stmt.split(i, i1, i2, CHUNK_SIZE)
+          .reorder({i1, i2, k, l, j})
+          .precompute(precomputeExpr, j, j, w)
+          .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleMTTKRPPrecomputedCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i1("i1"), i2("i2"), j_pre("j_pre");
   return stmt.split(i, i1, i2, CHUNK_SIZE)
           .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleMTTKRPPrecomputedCPU_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i1("i1"), i2("i2"), j_pre("j_pre");
+  return stmt.split(i, i1, i2, CHUNK_SIZE);
+}
+
+IndexStmt scheduleMTTKRPPrecomputedISPC_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i1("i1"), i2("i2"), j_pre("j_pre");
+  return stmt.parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i1("i1"), i2("i2");
   return stmt.split(i, i1, i2, CHUNK_SIZE)
@@ -162,6 +336,19 @@ IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16
           .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleMTTKRP4CPU_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i1("i1"), i2("i2");
+  return stmt.split(i, i1, i2, CHUNK_SIZE)
+          .reorder({i1, i2, k, l, m, j});
+}
+
+IndexStmt scheduleMTTKRP4ISPC_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i1("i1"), i2("i2");
+  return stmt.split(i, i1, i2, CHUNK_SIZE)
+          .reorder({i1, i2, k, l, m, j})
+          .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleMTTKRP5CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i1("i1"), i2("i2");
   return stmt.split(i, i1, i2, CHUNK_SIZE)
@@ -576,6 +763,92 @@ TEST(scheduling_eval, spmmCPU) {
   ASSERT_TENSOR_EQ(expected, C);
 }
 
+TEST(scheduling_eval, spmmISPC) {
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+
+  set_ISPC_codegen_enabled(false);
+  set_CUDA_codegen_enabled(false);
+  
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 128;
+  float SPARSITY = .1;
+  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+  Tensor<double> B("B", {NUM_J, NUM_K}, {Dense, Dense});
+  Tensor<double> C("C", {NUM_I, NUM_K}, {Dense, Dense});
+
+  srand(75883);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  A.pack();
+  B.pack();
+
+  set_ISPC_codegen_enabled(true);
+  C(i, k) = A(i, j) * B(j, k);
+
+  IndexStmt stmt = C.getAssignment().concretize();
+  // stmt = scheduleSpMMISPC1(stmt, A);
+  // stmt = scheduleSpMMISPC1_2(stmt, A);
+  stmt = scheduleSpMMISPC1_3(stmt, A);
+  
+  // stmt = scheduleSpMMISPC2(stmt, A);
+  // stmt = scheduleSpMMISPC2_2(stmt, A);
+  
+  // stmt = scheduleSpMMISPC3(stmt, A);
+  // stmt = scheduleSpMMISPC3_2(stmt, A);
+
+  //printToFile("spmm_cpu", stmt);
+
+  C.compile(stmt);
+  C.assemble();
+  C.compute();
+
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected("expected", {NUM_I, NUM_K}, {Dense, Dense});
+  expected(i, k) = A(i, j) * B(j, k);
+  IndexStmt stmt_taco = expected.getAssignment().concretize();
+  stmt_taco = scheduleSpMMCPU(stmt_taco, A);
+
+  expected.compile(stmt_taco);
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, C);
+
+  // float ERROR_MARGIN = 0.01;
+  // ASSERT_TENSOR_VAL(expected, y);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      if (expected(i,k) <= C(i,k) + ERROR_MARGIN && expected(i,k) >= C(i,k) - ERROR_MARGIN) {
+        // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n";
+      }
+      else {
+        std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << C(i,k) << " <- actual\n";
+        ASSERT_TRUE(false);
+      };
+    }
+  }
+
+  for (int i=0; i<10; i++) {
+    TOOL_BENCHMARK_TIMER(C.compute(), "Compute ISPC: ", timevalue);
+    TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue);
+  }
+}
+
 struct spgemm : public TestWithParam<std::tuple<Format,Format,bool>> {};
 
 TEST_P(spgemm, scheduling_eval) {
@@ -805,7 +1078,7 @@ TEST(scheduling_eval, sddmmCPU) {
   IndexStmt stmt = A.getAssignment().concretize();
   stmt = scheduleSDDMMCPU(stmt, B);
 
-  //printToFile("sddmm_cpu", stmt);
+  printToFile("sddmm_cpu_ryan2", stmt);
 
   A.compile(stmt);
   A.assemble();
@@ -819,55 +1092,69 @@ TEST(scheduling_eval, sddmmCPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
-TEST(scheduling_eval, spmvCPU) {
-  if (should_use_CUDA_codegen()) {
+TEST(scheduling_eval, sddmmSPMMFusedCPU) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
     return;
   }
+
   int NUM_I = 1021/10;
   int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
   float SPARSITY = .3;
-  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
-  Tensor<double> x("x", {NUM_J}, Format({Dense}));
-  Tensor<double> y("y", {NUM_I}, Format({Dense}));
+  Tensor<double> A("A", {NUM_I, NUM_K}, {Dense, Dense});
+  Tensor<double> B("B", {NUM_I, NUM_K}, CSR);
+  Tensor<double> C("C", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_J, NUM_K}, {Dense, Dense});
 
-  srand(120);
+  srand(268238);
   for (int i = 0; i < NUM_I; i++) {
     for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
       float rand_float = (float)rand()/(float)(RAND_MAX);
       if (rand_float < SPARSITY) {
-        A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
       }
     }
   }
 
   for (int j = 0; j < NUM_J; j++) {
-    float rand_float = (float)rand()/(float)(RAND_MAX);
-    x.insert({j}, (double) ((int) (rand_float*3/SPARSITY)));
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
   }
 
-  x.pack();
-  A.pack();
+  B.pack();
+  C.pack();
+  D.pack();
 
-  y(i) = A(i, j) * x(j);
+  A(i,k) = B(i,k) * C(i,j) * D(j,k);
 
-  IndexStmt stmt = y.getAssignment().concretize();
-  stmt = scheduleSpMVCPU(stmt);
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleSDDMMCPU(stmt, B);
 
-  //printToFile("spmv_cpu", stmt);
+  printToFile("sddmm_cpu_ryan2", stmt);
 
-  y.compile(stmt);
-  y.assemble();
-  y.compute();
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
 
-  Tensor<double> expected("expected", {NUM_I}, Format({Dense}));
-  expected(i) = A(i, j) * x(j);
+  Tensor<double> expected("expected", {NUM_I, NUM_K}, {Dense, Dense});
+  expected(i,k) = B(i,k) * C(i,j) * D(j,k);
   expected.compile();
   expected.assemble();
   expected.compute();
-  ASSERT_TENSOR_EQ(expected, y);
+  ASSERT_TENSOR_EQ(expected, A);
 }
 
-TEST(scheduling_eval, ttvCPU) {
+
+TEST(scheduling_eval, sddmmcsrCPU) {
   if (should_use_CUDA_codegen()) {
     return;
   }
@@ -875,7 +1162,495 @@ TEST(scheduling_eval, ttvCPU) {
   int NUM_J = 1039/10;
   int NUM_K = 1057/10;
   float SPARSITY = .3;
-  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
+  Tensor<double> A("A", {NUM_I, NUM_K}, CSR);
+  Tensor<double> B("B", {NUM_I, NUM_K}, CSR);
+  Tensor<double> C("C", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_J, NUM_K}, {Dense, Dense});
+
+  srand(268238);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  D.pack();
+
+  A(i,k) = B(i,k) * C(i,j) * D(j,k);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleSDDMMCSRCPU(stmt, B);
+
+  printToFile("sddmm_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  Tensor<double> expected("expected", {NUM_I, NUM_K}, CSR);
+  expected(i,k) = B(i,k) * C(i,j) * D(j,k);
+  
+  IndexStmt stmt_ref = expected.getAssignment().concretize();
+  printToFile("sddmm_cpu_ref", stmt_ref);
+
+  expected.compile(stmt_ref);
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+// Builds the expression Y(i,j) = A(i,j) * X(i,k) * X(j,k) on CPU; the compile/compare steps are still disabled.
+TEST(scheduling_eval, sddmm2CPU) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 1021/10;
+  int NUM_J = 1021/10;
+  int NUM_K = 18;
+  float SPARSITY = .3;
+  Tensor<double> Y("Y", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)});
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)});
+  Tensor<double> X("X", {NUM_I, NUM_K}, {Dense, Dense});
+  // X's first mode is indexed by both i and j below; that is valid because NUM_I == NUM_J.
+  srand(268238);
+
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  // Fill every entry of X; its first mode is declared with extent NUM_I.
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      X.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  A.pack();
+  X.pack();
+  // Both X accesses put k on the second mode (extent NUM_K), matching X's declared shape.
+  Y(i,j) = A(i,j) * X(i,k) * X(j,k);
+  // Everything below is commented out, so this test currently only constructs the index expression.
+  // IndexStmt stmt = A.getAssignment().concretize();
+  // // stmt = scheduleSDDMMCPU(stmt, A);
+
+  // printToFile("sddmm2_cpu", stmt);
+
+  // A.compile(stmt);
+  // A.assemble();
+  // A.compute();
+
+  // Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  // expected(i,j) = A(i,j) * X(i,k) * X(j,k);
+  // expected.compile();
+  // expected.assemble();
+  // expected.compute();
+  // ASSERT_TENSOR_EQ(expected, A);
+}
+
+
+
+// SDDMM compiled with ISPC codegen, checked against the CPU schedule. Run: bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC
+TEST(scheduling_eval, sddmmISPC) {
+  // `time` is not referenced by name below; presumably it is read inside TOOL_BENCHMARK_TIMER — TODO confirm.
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  // Operands are built with both the CUDA and ISPC codegen switches off.
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(false);
+
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_K}, {Dense, Dense});
+  Tensor<double> B("B", {NUM_I, NUM_K}, CSR);
+  Tensor<double> C("C", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_J, NUM_K}, {Dense, Dense});
+
+  srand(268238);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  D.pack();
+  // ISPC codegen is switched on only while the kernel under test is defined and compiled.
+  set_ISPC_codegen_enabled(true);
+  A(i,k) = B(i,k) * C(i,j) * D(j,k);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleSDDMMISPC(stmt, B);
+
+  //printToFile("sddmm_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  // A.compute() is not called here; its first run is the timed one below.
+  // Reference: same expression with ISPC codegen off, built from expected's own assignment.
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected("expected", {NUM_I, NUM_K}, {Dense, Dense});
+  expected(i,k) = B(i,k) * C(i,j) * D(j,k);
+  IndexStmt stmt_taco = expected.getAssignment().concretize();
+  stmt_taco = scheduleSDDMMCPU(stmt_taco, B);
+  expected.compile(stmt_taco);
+  expected.assemble();
+  // expected.compute() is likewise first run under the timer.
+
+  TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue);
+  TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue);
+
+  ASSERT_TENSOR_EQ(expected, A);
+
+
+  // Element-wise tolerance check. ERROR_MARGIN is not declared inside this test,
+  // so it presumably comes from file scope — TODO confirm.
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      if (expected(i,k) <= A(i,k) + ERROR_MARGIN && expected(i,k) >= A(i,k) - ERROR_MARGIN) {
+        // within tolerance: nothing to report
+      }
+      else {
+        std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << A(i,k) << " <- actual\n";
+        ASSERT_TRUE(false);
+      };
+    }
+  }
+  std::cout << "test scheduling_eval.sddmmISPC passed\n";
+
+}
+
+
+// A(i,j) = B(i,j) * C(i,k) * C(j,k) with ISPC codegen vs. the CPU schedule. Run: bin/taco-test --gtest_filter=scheduling_eval.sddmm2ISPC
+TEST(scheduling_eval, sddmm2ISPC) {
+  // `time` is not referenced by name below; presumably it is read inside TOOL_BENCHMARK_TIMER — TODO confirm.
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  // Operands are built with both the CUDA and ISPC codegen switches off.
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(false);
+
+  int NUM_I = 1021/10;
+  int NUM_K = 1039/10;
+  int NUM_J = 1021/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> B("B", {NUM_I, NUM_J}, CSR);
+  Tensor<double> C("C", {NUM_I, NUM_K}, {Dense, Dense});
+  // C's first mode is indexed by both i and j below; that is valid because NUM_I == NUM_J.
+  srand(268238);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+
+  B.pack();
+  C.pack();
+  // ISPC codegen is switched on only while the kernel under test is defined and compiled.
+  set_ISPC_codegen_enabled(true);
+  A(i,j) = B(i,j) * C(i,k) * C(j,k);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleSDDMM2ISPC(stmt, B);
+
+  //printToFile("sddmm_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  // A.compute() is not called here; its first run is the timed one below.
+  // Reference: same expression with ISPC codegen off, built from expected's own assignment.
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  expected(i,j) = B(i,j) * C(i,k) * C(j,k);
+  IndexStmt stmt_taco = expected.getAssignment().concretize();
+  stmt_taco = scheduleSDDMM2CPU(stmt_taco, B);
+  expected.compile(stmt_taco);
+  expected.assemble();
+  // expected.compute() is likewise first run under the timer.
+
+  TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue);
+  TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue);
+
+  ASSERT_TENSOR_EQ(expected, A);
+
+
+  // Element-wise tolerance check. ERROR_MARGIN is not declared inside this test,
+  // so it presumably comes from file scope — TODO confirm.
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      if (expected(i,j) <= A(i,j) + ERROR_MARGIN && expected(i,j) >= A(i,j) - ERROR_MARGIN) {
+        // within tolerance: nothing to report
+      }
+      else {
+        std::cout << "unmatched values: expected -> " << expected(i,j) << " != " << A(i,j) << " <- actual\n";
+        ASSERT_TRUE(false);
+      };
+    }
+  }
+  std::cout << "test scheduling_eval.sddmm2ISPC passed\n";
+
+}
+
+
+TEST(scheduling_eval, spmvCPU) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+  Tensor<double> x("x", {NUM_J}, Format({Dense}));
+  Tensor<double> y("y", {NUM_I}, Format({Dense}));
+
+  srand(120);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    x.insert({j}, (double) ((int) (rand_float*3/SPARSITY)));
+  }
+
+  x.pack();
+  A.pack();
+
+  y(i) = A(i, j) * x(j);
+
+  IndexStmt stmt = y.getAssignment().concretize();
+  stmt = scheduleSpMVCPU(stmt);
+
+  //printToFile("spmv_cpu", stmt);
+
+  y.compile(stmt);
+  y.assemble();
+  y.compute();
+
+  Tensor<double> expected("expected", {NUM_I}, Format({Dense}));
+  expected(i) = A(i, j) * x(j);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, y);
+}
+
+// SpMV y(i) = A(i,j) * x(j) compiled with ISPC codegen, compared against and timed next to the CPU schedule.
+TEST(scheduling_eval, spmvISPC) {
+  // `time` is not referenced by name below; presumably it is read inside TOOL_BENCHMARK_TIMER — TODO confirm.
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  // Operands are built with both the ISPC and CUDA codegen switches off.
+  set_ISPC_codegen_enabled(false);
+  set_CUDA_codegen_enabled(false);
+
+  int NUM_I = 200021/10;
+  int NUM_J = 200039/10;
+  float SPARSITY = .2;
+  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+  Tensor<double> x("x", {NUM_J}, Format({Dense}));
+  Tensor<double> y("y", {NUM_I}, Format({Dense}));
+
+  srand(120);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    x.insert({j}, (double) ((int) (rand_float*3/SPARSITY)));
+  }
+
+  x.pack();
+  A.pack();
+  // ISPC codegen is switched on only while the kernel under test is defined and compiled.
+  set_ISPC_codegen_enabled(true);
+
+  y(i) = A(i, j) * x(j);
+
+  IndexStmt stmt = y.getAssignment().concretize();
+  // stmt = scheduleSpMVISPC(stmt);
+
+  printToFile("spmv_cpu", stmt);
+
+  y.compile(stmt);
+  y.assemble();
+  // y.compute() is not called here; its first run is the timed one below.
+
+  set_ISPC_codegen_enabled(false);
+
+  // Reference result: the same SpMV expression lowered with ISPC codegen
+  // disabled and scheduleSpMVCPU applied. Note that y above is compiled from
+  // the unscheduled concretized statement, because the scheduleSpMVISPC call
+  // is commented out.
+  //
+
+
+  Tensor<double> expected("expected", {NUM_I}, Format({Dense}));
+  expected(i) = A(i, j) * x(j);
+  IndexStmt stmt_taco = expected.getAssignment().concretize();
+  stmt_taco = scheduleSpMVCPU(stmt_taco);
+
+  expected.compile(stmt_taco);
+  expected.assemble();
+  // expected.compute() is likewise first run under the timer.
+
+
+  TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue);
+  TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue);
+
+
+  ASSERT_TENSOR_EQ(expected, y);
+
+  // Element-wise tolerance check over the NUM_I entries of y and expected. ERROR_MARGIN is not
+  // declared inside this test, so it presumably comes from file scope — TODO confirm.
+  for (int row = 0; row < NUM_I; row++) {
+    if (expected(row) <= y(row) + ERROR_MARGIN && expected(row) >= y(row) - ERROR_MARGIN) {
+      // within tolerance: nothing to report
+    }
+    else {
+      std::cout << "unmatched values: expected -> " << expected(row) << " != " << y(row) << " <- actual\n";
+      ASSERT_TRUE(false);
+    };
+  }
+
+  std::cout << "test scheduling_eval.spmvISPC passed\n";
+
+  for (int i=0; i<10; i++) {
+    TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue);
+    TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue);
+  }
+
+
+}
+
+TEST(scheduling_eval, ttvCPU) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+  Tensor<double> c("c", {NUM_K}, Format({Dense}));
+
+  srand(9536);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+
+  for (int k = 0; k < NUM_K; k++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    c.insert({k}, (double) ((int) (rand_float*3)));
+  }
+
+  B.pack();
+  c.pack();
+
+  A(i,j) = B(i,j,k) * c(k);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleTTVCPU(stmt, B);
+
+  printToFile("ttv_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  expected(i,j) = B(i,j,k) * c(k);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+
+TEST(scheduling_eval, ttvISPC) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(false);
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
   Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
   Tensor<double> c("c", {NUM_K}, Format({Dense}));
 
@@ -899,25 +1674,30 @@ TEST(scheduling_eval, ttvCPU) {
   B.pack();
   c.pack();
 
+  set_ISPC_codegen_enabled(true);
   A(i,j) = B(i,j,k) * c(k);
 
   IndexStmt stmt = A.getAssignment().concretize();
-  stmt = scheduleTTVCPU(stmt, B);
+  stmt = scheduleTTVISPC(stmt, B);
 
-  //printToFile("ttv_cpu", stmt);
+  printToFile("ttv_ispc", "__ttv_ispc", stmt);
 
   A.compile(stmt);
   A.assemble();
   A.compute();
 
+  set_ISPC_codegen_enabled(false);
   Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
   expected(i,j) = B(i,j,k) * c(k);
+  IndexStmt stmt_taco = expected.getAssignment().concretize();
+  stmt_taco = scheduleTTVCPU(stmt_taco, B);
   expected.compile();
   expected.assemble();
   expected.compute();
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+
 TEST(scheduling_eval, ttvCPU_CSR) {
   if (should_use_CUDA_codegen()) {
     return;
@@ -928,7 +1708,7 @@ TEST(scheduling_eval, ttvCPU_CSR) {
   int NUM_K = 1057/10;
   float SPARSITY = .3;
   Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Sparse});
-  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse});
   Tensor<double> c("c", {NUM_K}, Format({Dense}));
 
   srand(9536);
@@ -956,11 +1736,13 @@ TEST(scheduling_eval, ttvCPU_CSR) {
   IndexStmt stmt = A.getAssignment().concretize();
   stmt = scheduleTTVCPUCSR(stmt);
 
+  printToFile("ttv_cpu_csr", stmt);
+
   A.compile(stmt);
   A.assemble();
   A.compute();
 
-  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Sparse});
   expected(i,j) = B(i,j,k) * c(k);
   expected.compile();
   expected.assemble();
@@ -968,6 +1750,82 @@ TEST(scheduling_eval, ttvCPU_CSR) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+TEST(scheduling_eval, ttvISPC_CSR) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  // TTV A(i,j) = B(i,j,k) * c(k) with a {Dense, Sparse} output: scheduled ISPC kernel vs. single-threaded CPU reference.
+  int NUM_I = 10000;
+  int NUM_J = 1039/10;
+  int NUM_K = 128;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Sparse});
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse});
+  Tensor<double> c("c", {NUM_K}, Format({Dense}));
+
+  srand(9536);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+
+  for (int k = 0; k < NUM_K; k++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    c.insert({k}, (double) ((int) (rand_float*3)));
+  }
+
+  B.pack();
+  c.pack();
+  // Kernel under test: ISPC codegen on, scheduleTTVISPCCSR applied.
+  set_ISPC_codegen_enabled(true);
+  A(i,j) = B(i,j,k) * c(k);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleTTVISPCCSR(stmt);
+  printToFile("ttv_ispc_csr", "__ttv_ispc_csr", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+  // Reference: ISPC codegen off, scheduleTTVCPUCSR_ST applied.
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Sparse});
+  expected(i,j) = B(i,j,k) * c(k);
+  IndexStmt taco_stmt = expected.getAssignment().concretize();
+  taco_stmt = scheduleTTVCPUCSR_ST(taco_stmt);
+  expected.compile(taco_stmt);
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+  // Second ISPC variant: the concretized statement with no schedule applied. A2 is only timed, not compared.
+  Tensor<double> A2("A2", {NUM_I, NUM_J}, {Dense, Sparse});
+  set_ISPC_codegen_enabled(true);
+  A2(i,j) = B(i,j,k) * c(k);
+
+  IndexStmt stmt2 = A2.getAssignment().concretize();
+
+  A2.compile(stmt2);
+  A2.assemble();
+  A2.compute();
+  // `time` is not referenced by name below; presumably it is read inside TOOL_BENCHMARK_TIMER — TODO confirm.
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+
+  for (int i=0; i<3; i++) {
+    TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO1: ", timevalue);
+    TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC1: ", timevalue);
+    TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue);
+  }
+  // Switch ISPC codegen back off so the setting enabled for A2 does not carry over into later tests.
+  set_ISPC_codegen_enabled(false);
+}
+
 TEST(scheduling_eval, ttmCPU) {
   if (should_use_CUDA_codegen()) {
     return;
@@ -1010,39 +1868,318 @@ TEST(scheduling_eval, ttmCPU) {
 
   //printToFile("ttm_cpu", stmt);
 
-  A.compile(stmt);
-  A.assemble();
-  A.compute();
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  Tensor<double> expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense});
+  expected(i,j,l) = B(i,j,k) * C(k,l);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+TEST(scheduling_eval, ttmISPC) {  // Checks A(i,j,l) = B(i,j,k) * C(k,l) under scheduleTTMCPU against a default-compiled reference.
+  if (should_use_CUDA_codegen()) {
+    return;  // Not run when CUDA codegen is selected.
+  }
+  int NUM_I = 1021/40;
+  int NUM_J = 1039/40;
+  int NUM_K = 1057/40;
+  int NUM_L = 1232/40;
+  float SPARSITY = .1;
+  Tensor<double> A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+  Tensor<double> C("C", {NUM_K, NUM_L}, {Dense, Dense});
+  // Constant seed: the rand() sequence used to fill B and C is the same on every run.
+  srand(935);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {  // Only draws below SPARSITY become stored entries of B.
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY)));  // Value is truncated to a small integer.
+        }
+      }
+    }
+  }
+  // C is dense: every (k, l) position receives a truncated value of rand_float*3.
+  for (int k = 0; k < NUM_K; k++) {
+    for (int l = 0; l < NUM_L; l++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({k, l}, (double) ((int) (rand_float*3)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+
+  A(i,j,l) = B(i,j,k) * C(k,l);
+  // NOTE(review): despite the test name, this uses scheduleTTMCPU and never calls set_ISPC_codegen_enabled — confirm intent.
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleTTMCPU(stmt, B);
+
+  //printToFile("ttm_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+  // Reference result: the same expression compiled without an explicit schedule.
+  Tensor<double> expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense});
+  expected(i,j,l) = B(i,j,k) * C(k,l);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+TEST(scheduling_eval, mttkrpCPU) {  // Checks A(i,j) = B(i,k,l) * C(k,j) * D(l,j) under scheduleMTTKRPCPU against a default-compiled reference.
+  if (should_use_CUDA_codegen()) {
+    return;  // Not run when CUDA codegen is selected.
+  }
+  int NUM_I = 1021/20;
+  int NUM_J = 1039/20;
+  int NUM_K = 1057/20;
+  int NUM_L = 1232/20;
+  float SPARSITY = .1;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse});
+  Tensor<double> C("C", {NUM_K, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_L, NUM_J}, {Dense, Dense});
+  // Constant seed: the rand() sequence used to fill B, C and D is the same on every run.
+  srand(549694);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      for (int l = 0; l < NUM_L; l++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {  // Only draws below SPARSITY become stored entries of B.
+          B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY)));  // Value is truncated to a small integer.
+        }
+      }
+    }
+  }
+  // C is filled at every (k, j) position.
+  for (int k = 0; k < NUM_K; k++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({k, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+  // D is filled at every (l, j) position.
+  for (int l = 0; l < NUM_L; l++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({l, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  D.pack();
+
+  A(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleMTTKRPCPU(stmt, B);
+  //printToFile("mttkrp_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+  // Reference result: the same expression compiled without an explicit schedule.
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  expected(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+TEST(scheduling_eval, temp) {  // SDDMM run on the webbase-1M matrix; writes A.mtx and makes no assertions.
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  // Formats: doubly compressed sparse row (dcsr), row-major (rm) and column-major (cm) dense.
+  Format dcsr({Sparse,Sparse});
+  Format   rm({Dense,Dense});
+  Format   cm({Dense,Dense}, {1,0});
+
+  // The input is a Matrix Market file that is not part of the repository; download it from
+  // https://www.cise.ufl.edu/research/sparse/MM/Williams/webbase-1M.tar.gz
+  // The path is machine-specific, so skip the test (like the codegen guard above)
+  // when the file cannot be opened rather than erroring out while reading it.
+  const std::string matrixPath = "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx";
+  if (!std::ifstream(matrixPath).good()) {
+    return;
+  }
+  Tensor<double> B = read(matrixPath, dcsr);
+  // Generate a random dense matrix and store it in row-major (dense) format.
+  Tensor<double> C({B.getDimension(0), 1000}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  C.pack();
+
+  // Generate another random dense matrix and store it in column-major format.
+  Tensor<double> D({1000, B.getDimension(1)}, cm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  D.pack();
+
+  // Declare the output matrix to be a sparse matrix with the same dimensions as
+  // input matrix B, to be also stored as a doubly compressed sparse row matrix.
+  Tensor<double> A(B.getDimensions(), dcsr);
+
+  // Define the SDDMM computation using index notation.
+  IndexVar i, j, k;
+  A(i,j) = B(i,j) * C(i,k) * D(k,j);
+
+  // At this point, we have defined how entries in the output matrix should be
+  // computed from entries in the input matrices but have not actually performed
+  // the computation yet. To do so, we must first tell taco to generate code that
+  // can be executed to compute the SDDMM operation.
+  A.compile();
+  // We can now call the functions taco generated to assemble the indices of the
+  // output matrix and then actually compute the SDDMM.
+  A.assemble();
+  A.compute();
+  // Write the output of the computation to file (stored in the Matrix Market format).
+  write("A.mtx", A);
+}
+
+TEST(scheduling_eval, mttkrpISPC) {  // Compares ISPC-generated MTTKRP kernels with their CPU-scheduled counterparts, then times them.
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  set_ISPC_codegen_enabled(false);
+  set_CUDA_codegen_enabled(false);
+  int NUM_I = 10000; // 1021/20;
+  int NUM_J = 256;
+  int NUM_K = 1057/20;
+  int NUM_L = 1232/20;
+  float SPARSITY = .1;
+  Tensor<double> B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse});
+  Tensor<double> C("C", {NUM_K, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_L, NUM_J}, {Dense, Dense});
+  // Constant seed: the rand() sequence used to fill B, C and D is the same on every run.
+  srand(549694);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      for (int l = 0; l < NUM_L; l++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+
+  for (int k = 0; k < NUM_K; k++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({k, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+
+  for (int l = 0; l < NUM_L; l++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({l, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  D.pack();
 
-  Tensor<double> expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense});
-  expected(i,j,l) = B(i,j,k) * C(k,l);
-  expected.compile();
-  expected.assemble();
-  expected.compute();
-  ASSERT_TENSOR_EQ(expected, A);
+  set_ISPC_codegen_enabled(true);
+  // A1: scheduleMTTKRPISPC, compiled while ISPC codegen is enabled.
+  Tensor<double> A1("A1", {NUM_I, NUM_J}, {Dense, Dense});
+  A1(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  IndexStmt stmt1 = A1.getAssignment().concretize();
+  stmt1 = scheduleMTTKRPISPC(stmt1, B);
+  // printToFile("mttkrp1_cpu_ispc", stmt1);
+  A1.compile(stmt1);
+  A1.assemble();
+  A1.compute();
+  // expected1: scheduleMTTKRPCPU with ISPC codegen disabled; used as the reference below.
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense});
+  expected1(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  IndexStmt taco_stmt1 = expected1.getAssignment().concretize();
+  taco_stmt1 = scheduleMTTKRPCPU(taco_stmt1, B);
+  expected1.compile(taco_stmt1);
+  expected1.assemble();
+  expected1.compute();
+  ASSERT_TENSOR_EQ(expected1, A1);
+  // A2: scheduleMTTKRPPrecomputedISPC_ST, compiled while ISPC codegen is enabled.
+  set_ISPC_codegen_enabled(true);
+  Tensor<double> A2("A2", {NUM_I, NUM_J}, {Dense, Dense});
+  A2(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  IndexStmt stmt2 = A2.getAssignment().concretize();  // Use A2's own assignment (was A1's), so the compiled statement targets A2.
+  stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B);
+  // printToFile("mttkrp_cpu_ispc", stmt);
+  A2.compile(stmt2);
+  A2.assemble();
+  A2.compute();
+  ASSERT_TENSOR_EQ(expected1, A2);
+  // expected2: scheduleMTTKRPPrecomputedCPU_ST with ISPC codegen disabled.
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense});
+  expected2(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  IndexStmt taco_stmt2 = expected2.getAssignment().concretize();
+  taco_stmt2 = scheduleMTTKRPPrecomputedCPU_ST(taco_stmt2, B);
+  expected2.compile(taco_stmt2);
+  expected2.assemble();
+  expected2.compute();
+  ASSERT_TENSOR_EQ(expected1, expected2);
+  // Re-run each compute() three times under the benchmark timer macro.
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+
+  for (int i=0; i<3; i++) {
+    TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue);
+    TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue);
+    TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue);
+    TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue);
+  }
 }
 
-TEST(scheduling_eval, mttkrpCPU) {
+
+TEST(scheduling_eval, mttkrp4ISPC) {
   if (should_use_CUDA_codegen()) {
     return;
   }
-  int NUM_I = 1021/20;
-  int NUM_J = 1039/20;
+  set_ISPC_codegen_enabled(false);
+  set_CUDA_codegen_enabled(false);
+  int NUM_I = 1000; // 1021/20;
+  int NUM_J = 16;
   int NUM_K = 1057/20;
   int NUM_L = 1232/20;
+  int NUM_M = 1124/20;
   float SPARSITY = .1;
-  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense});
-  Tensor<double> B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse});
+  Tensor<double> B("B", {NUM_I, NUM_K, NUM_L, NUM_M}, {Dense, Sparse, Sparse, Sparse});
   Tensor<double> C("C", {NUM_K, NUM_J}, {Dense, Dense});
   Tensor<double> D("D", {NUM_L, NUM_J}, {Dense, Dense});
+  Tensor<double> E("E", {NUM_M, NUM_J}, {Dense, Dense});
 
   srand(549694);
   for (int i = 0; i < NUM_I; i++) {
     for (int k = 0; k < NUM_K; k++) {
       for (int l = 0; l < NUM_L; l++) {
-        float rand_float = (float) rand() / (float) (RAND_MAX);
-        if (rand_float < SPARSITY) {
-          B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        for (int m = 0; m < NUM_M; m++) {
+          float rand_float = (float) rand() / (float) (RAND_MAX);
+          if (rand_float < SPARSITY) {
+            B.insert({i, k, l, m}, (double) ((int) (rand_float * 3 / SPARSITY)));
+          }
         }
       }
     }
@@ -1062,27 +2199,83 @@ TEST(scheduling_eval, mttkrpCPU) {
     }
   }
 
+  for (int m = 0; m < NUM_M; m++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      E.insert({m, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+
   B.pack();
   C.pack();
   D.pack();
+  E.pack();
+
+  set_ISPC_codegen_enabled(true);
+  Tensor<double> A1("A1", {NUM_I, NUM_J}, {Dense, Dense});
+  A1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j);
+  IndexStmt stmt1 = A1.getAssignment().concretize();
+  stmt1 = scheduleMTTKRP4ISPC_ST(stmt1, B);
+  // printToFile("mttkrp1_cpu_ispc", stmt1);
+  A1.compile(stmt1);
+  A1.assemble();
+  A1.compute();
+
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense});
+  expected1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j);
+  IndexStmt taco_stmt1 = expected1.getAssignment().concretize();
+  taco_stmt1 = scheduleMTTKRP4CPU_ST(taco_stmt1, B);
+  expected1.compile(taco_stmt1);
+  expected1.assemble();
+  expected1.compute();
+  ASSERT_TENSOR_EQ(expected1, A1);
+
+  // set_ISPC_codegen_enabled(true);
+  // Tensor<double> A2("A2", {NUM_I, NUM_J}, {Dense, Dense});
+  // A2(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  // IndexStmt stmt2 = A1.getAssignment().concretize();
+  // stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B);
+  // // printToFile("mttkrp_cpu_ispc", stmt);
+  // A2.compile(stmt2);
+  // A2.assemble();
+  // A2.compute();
+  // ASSERT_TENSOR_EQ(expected1, A2);
+  
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense});
+  expected2(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j);
+
+  IndexExpr BE = B(i,k,l,m) * E(m,j);
+  IndexExpr BDE = BE * D(l, j);
+  expected2(i,j) = BDE * C(k,j);
+  IndexStmt taco_stmt2 = expected2.getAssignment().concretize();
+  TensorVar BE_workspace("BE_workspace", Type(Float64, {Dimension(j)}), taco::dense);
+  TensorVar BDE_workspace("BDE_workspace", Type(Float64, {Dimension(j)}), taco::dense);
+
+  IndexStmt precomputed_stmt = forall(i, forall(k,
+          where(forall(j, expected2(i,j) += BDE_workspace(j) * C(k,j)),
+            forall(l, where(forall(j, BDE_workspace(j) += BE_workspace(j) * D(l,j)),
+                forall(m, forall(j, BE_workspace(j) += B(i,k,l,m) * E(m,j))))))));
+
+  // IndexStmt scheduled2 = scheduleMTTKRPPrecomputedCPU(precomputed_stmt, B, 64);
+  // expected2.compile(scheduled2);
+  // expected2.assemble();
+  // expected2.compute();
+  // ASSERT_TENSOR_EQ(expected1, expected2);
+
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+
+  for (int i=0; i<3; i++) {
+    TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue);
+    TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue);
+    // TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue);
+    // TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue);
+  }
+}
 
-  A(i,j) = B(i,k,l) * C(k,j) * D(l,j);
-
-  IndexStmt stmt = A.getAssignment().concretize();
-  stmt = scheduleMTTKRPCPU(stmt, B);
-  //printToFile("mttkrp_cpu", stmt);
-
-  A.compile(stmt);
-  A.assemble();
-  A.compute();
 
-  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
-  expected(i,j) = B(i,k,l) * C(k,j) * D(l,j);
-  expected.compile();
-  expected.assemble();
-  expected.compute();
-  ASSERT_TENSOR_EQ(expected, A);
-}
 
 TEST(scheduling_eval, spmvGPU) {
   if (!should_use_CUDA_codegen()) {
@@ -1463,7 +2656,336 @@ TEST(scheduling_eval, mttkrpGPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
-TEST(generate_evaluation_files, DISABLED_cpu) {
+TEST(generate_evaluation_files, ispc) {  // Lowers several scheduled kernels with ISPC codegen enabled and writes them under eval_prepared_ispc/.
+  std::cout << "Hi Adhitha!\n" << std::endl ;
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(true);
+  // Parameter sets: each inner vector is passed to a schedule function and joined with "_" into the kernel name.
+  vector<vector<int>> spmv_parameters = {{32}};
+  vector<vector<int>> spmspv_parameters = {{8}};  // Not referenced in this test.
+
+  // 4 to 512 and 4, 8, 16
+  vector<vector<int>> spmm_dcsr_parameters = {{16, 8}};  // Not referenced in this test.
+  vector<vector<int>> spmm_parameters = {{16,4}};
+
+  vector<vector<int>> mttkrp_parameters = {};
+  mttkrp_parameters.push_back({64,0});
+
+  vector<vector<int>> sddmm_parameters = {{8, 8}};  // Not referenced in this test.
+  vector<vector<int>> ttv_parameters = {{32}};
+
+  int NUM_I = 100;
+  int NUM_J = 100;
+  int NUM_K = 100;
+  int NUM_L = 100;
+  // In every section the first codegen stream is written to a ".h" file and the second to a "__"-prefixed ".ispc" file.
+  string c_file_ending = ".h";
+  string file_ending = ".ispc";
+  string file_path = "eval_prepared_ispc/";
+  mkdir(file_path.c_str(), 0777);  // Return value is ignored.
+
+  // spmv
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> x("x", {NUM_J}, {Dense});
+    Tensor<double> y("y", {NUM_I}, {Dense});
+    y(i) = A(i, j) * x(j);
+    std::cout << "concretizing the assignment statement\n";
+    IndexStmt stmt = y.getAssignment().concretize();
+    std::cout << "Printing the original IndexStmt: " << stmt << std::endl;
+
+    for (auto paramSet : spmv_parameters) {
+      std::cout << "param set: " << paramSet[0] << std::endl;
+      IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]);
+      std::cout << "scheduled IndexStmt: " << scheduled << std::endl;
+      ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"),  false, true);
+      std::cout << "computed statement: \n" << compute << std::endl;
+      codegen->compile(compute, false);  // NOTE(review): literal false here, while every other section passes true for its first kernel — confirm.
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmv_csr_ispc_taco" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__spmv_csr_ispc_taco" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+    
+  }
+
+  // spmm
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> X("X", {NUM_J, NUM_K}, {Dense, Dense});
+    Tensor<double> Y("Y", {NUM_I, NUM_K}, {Dense, Dense});
+    Y(i, k) = A(i, j) * X(j, k);
+    IndexStmt stmt = Y.getAssignment().concretize();
+    bool isFirst = true;  // True only for the first kernel compiled into this codegen instance.
+    for (auto paramSet : spmm_parameters) {
+      IndexStmt scheduled = scheduleSpMMISPC1(stmt, A, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmm_csr_ispc_taco1" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__spmm_csr_ispc_taco1" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+  // spmm omp
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> X("X", {NUM_J, NUM_K}, {Dense, Dense});
+    Tensor<double> Y("Y", {NUM_I, NUM_K}, {Dense, Dense});
+    Y(i, k) = A(i, j) * X(j, k);
+    IndexStmt stmt = Y.getAssignment().concretize();
+    bool isFirst = true;
+    for (auto paramSet : spmm_parameters) {
+      IndexStmt scheduled = scheduleSpMMISPCOMP1(stmt, A, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmm_omp_ispc_taco1" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__spmm_omp_ispc_taco1" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+  // spmm2
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> X("X", {NUM_J, NUM_K}, {Dense, Dense});
+    Tensor<double> Y("Y", {NUM_I, NUM_K}, {Dense, Dense});
+    Y(i, k) = A(i, j) * X(j, k);
+    IndexStmt stmt = Y.getAssignment().concretize();
+    bool isFirst = true;
+    for (auto paramSet : spmm_parameters) {
+      IndexStmt scheduled = scheduleSpMMISPC2(stmt, A, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmm_csr_ispc_taco2" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__spmm_csr_ispc_taco2" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+  // spmm3
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> X("X", {NUM_J, NUM_K}, {Dense, Dense});
+    Tensor<double> Y("Y", {NUM_I, NUM_K}, {Dense, Dense});
+    Y(i, k) = A(i, j) * X(j, k);
+    IndexStmt stmt = Y.getAssignment().concretize();
+    bool isFirst = true;
+    for (auto paramSet : spmm_parameters) {
+      IndexStmt scheduled = scheduleSpMMISPC3(stmt, A, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute3_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmm_csr_ispc_taco3" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__spmm_csr_ispc_taco3" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+  // ttv (scheduled with scheduleTTVCPU)
+  {
+    stringstream source;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
+    Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+    Tensor<double> c("c", {NUM_K}, Format({Dense}));
+    A(i,j) = B(i,j,k) * c(k);
+    IndexStmt stmt = A.getAssignment().concretize();
+    bool isFirst = true;
+    for (auto paramSet : ttv_parameters) {
+      IndexStmt scheduled = scheduleTTVCPU(stmt, B, paramSet[0]);
+      ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "ttv_cpu" + c_file_ending);
+    source_file << source.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__ttv_cpu" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+
+  // mttkrp3 (scheduled with scheduleMTTKRPCPU)
+  {
+    stringstream source;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense});
+    Tensor<double> B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse});
+    Tensor<double> C("C", {NUM_K, NUM_J}, {Dense, Dense});
+    Tensor<double> D("D", {NUM_L, NUM_J}, {Dense, Dense});
+    A(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+    IndexStmt stmt = A.getAssignment().concretize();
+    bool isFirst = true;
+    for (auto paramSet : mttkrp_parameters) {
+      IndexStmt scheduled = scheduleMTTKRPCPU(stmt, B, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "mttkrp3_cpu" + c_file_ending);
+    source_file << source.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__mttkrp3_cpu" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+
+  return;
+}
+
+
+
+TEST(generate_ispc_sddmm_evaluation_files, ispc) {  // Writes two generated SDDMM kernels under eval_prepared_ispc/sddmm/.
+  std::cout << "Hi Adhitha!\n" << std::endl ;
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(true);
+
+  // Only the SDDMM parameter sets are used by this test. Each set supplies the
+  // two trailing integer arguments of scheduleSDDMMISPC1/2 and, joined with
+  // "_", the suffix of the generated function name.
+  vector<vector<int>> sddmm_parameters = {{8, 8}};
+
+  int NUM_I = 100;
+  int NUM_J = 100;
+  int NUM_K = 100;
+
+  // NOTE(review): c_file_ending is never read below; both streams of each
+  // kernel are written with file_ending (".ispc"). Confirm whether the first
+  // stream was meant to use c_file_ending instead.
+  string c_file_ending = ".h";
+  string file_ending = ".ispc";
+  string file_path = "eval_prepared_ispc/sddmm/";
+
+  // mkdir() does not create missing parent directories, so create the parent
+  // first; otherwise running this test on its own leaves nothing to write
+  // into. Return values are still ignored (the directories may already exist).
+  mkdir("eval_prepared_ispc/", 0777);
+  mkdir(file_path.c_str(), 0777);
+
+  // sddmm, variant 1: A(i,k) = B(i,k) * C(i,j) * D(j,k) with CSR B
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_K}, {Dense, Dense});
+    Tensor<double> B("B", {NUM_I, NUM_K}, CSR);
+    Tensor<double> C("C", {NUM_I, NUM_J}, {Dense, Dense});
+    Tensor<double> D("D", {NUM_J, NUM_K}, {Dense, Dense});
+    A(i,k) = B(i,k) * C(i,j) * D(j,k);
+    IndexStmt stmt = A.getAssignment().concretize();
+    bool isFirst = true;
+    for (auto paramSet : sddmm_parameters) {
+      IndexStmt scheduled = scheduleSDDMMISPC1(stmt, B, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "sddmm_cpu_ispc_taco1" + file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco1" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+
+  // sddmm, variant 2: Y(i,j) = A(i,j) * X(i,k) * X(j,k) with CSR A
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> Y("Y", {NUM_I, NUM_K}, {Dense, Dense});
+    Tensor<double> A("A", {NUM_I, NUM_K}, CSR);
+    Tensor<double> X("X", {NUM_I, NUM_J}, {Dense, Dense});
+    Y(i,j) = A(i,j) * X(i,k) * X(j,k);
+    IndexStmt stmt = Y.getAssignment().concretize();
+    bool isFirst = true;
+    for (auto paramSet : sddmm_parameters) {
+      IndexStmt scheduled = scheduleSDDMMISPC2(stmt, A, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "sddmm_cpu_ispc_taco2" + file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco2" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+
+  return;
+}
+
+
+
+
+TEST(generate_evaluation_files, cpu) {
   if (should_use_CUDA_codegen()) {
     return;
   }
@@ -1779,10 +3301,63 @@ TEST(generate_evaluation_files, DISABLED_cpu) {
   }
 }
 
-TEST(generate_evaluation_files, DISABLED_gpu) {
-  if (!should_use_CUDA_codegen()) {
-    return;
+TEST(generate_evaluation_files, spmv_ispc) {  // Writes generated SpMV kernels under eval_prepared_ispc/spmv/.
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(true);
+
+  std::cout << "executing generate_evaluation_file.ispc\n";
+
+  int NUM_I = 100;
+  int NUM_J = 100;
+  // NOTE(review): the parameter values only affect the generated function names; scheduleSpMVCPU below does not receive them.
+  vector<vector<int>> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE}
+  for (int i = 3; i <= 20; i++) {
+    spmv_parameters.push_back({i, 512});
+  }
+
+  string file_ending_c = ".c";
+  string file_ending_ispc = ".ispc";
+  string file_path = "eval_prepared_ispc/spmv/";
+  mkdir("eval_prepared_ispc/", 0777);  // mkdir() does not create missing parents, so make the parent first.
+  mkdir(file_path.c_str(), 0777);  // Return values are ignored; the directories may already exist.
+  // spmv
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> x("x", {NUM_J}, Format({Dense}));
+    Tensor<double> y("y", {NUM_I}, Format({Dense}));
+    IndexExpr precomputed = A(i, j) * x(j);
+    y(i) = precomputed;
+    IndexStmt stmt = y.getAssignment().concretize();
+    bool isFirst = true;
+    for (auto paramSet : spmv_parameters) {
+      IndexStmt scheduled = scheduleSpMVCPU(stmt);
+      ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file1;
+    source_file1.open(file_path + "spmv_ispc" + file_ending_c);
+    source_file1 << source1.str();
+    source_file1.close();
+
+    ofstream source_file2;
+    source_file2.open(file_path + "__spmv_ispc" + file_ending_ispc);
+    source_file2 << source2.str();
+    source_file2.close();
   }
+}
+
+TEST(generate_evaluation_files, gpu) {
+  // if (!should_use_CUDA_codegen()) {
+  //   return;
+  // }
+  set_CUDA_codegen_enabled(true);
+  set_ISPC_codegen_enabled(false);
+
+  std::cout << "executing generate_evaluation_file.gpu\n";
 
   vector<vector<int>> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE}
   for (int i = 3; i <= 20; i++) {
diff --git a/test/tests-scheduling-fuse.cpp b/test/tests-scheduling-fuse.cpp
new file mode 100644
index 000000000..41fb86f6f
--- /dev/null
+++ b/test/tests-scheduling-fuse.cpp
@@ -0,0 +1,2891 @@
+#include "taco/cuda.h"
+#include "taco/tensor.h"
+#include "test.h"
+#include "util.h"
+#include <climits>
+#include "gtest/gtest.h"
+#include <cstdint>
+#include <papi.h>
+
+#define NUM_THREADS_TO_USE 1
+// #define NUM_THREADS_TO_USE 32
+
+void handle_error (int retval)  // Prints the PAPI status code and its message text, then terminates the process.
+{
+     printf("PAPI error %d: %s\n", retval, PAPI_strerror(retval));  // message text is obtained from PAPI_strerror(retval)
+     exit(1);  // exit status 1; control never returns to the caller
+}
+
+TEST(scheduling_eval, spmvFusedWithSyntheticData) {  // times the fused A(i) = B(i,j) * C(j,k) * v(k) kernel on small generated operands
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;  // nothing is run when CUDA or ISPC code generation is selected
+  }
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+  // Fixed seed (0): C and v receive the same values on every run with a given standard library.
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  // csr: {dense, sparse} levels for the matrices; rm: a single dense level for the vectors.
+  Format csr({dense, sparse});
+  Format  rm({dense});
+
+  // B is generated in place below; nothing is read from an .mtx file despite the message printed here.
+  std::cout << "reading B mat mtx\n";
+
+  int NUM_I = 5; // 1021/10;
+  int NUM_J = 5; // 1039/10;
+  int NUM_K = 8;
+  float SPARSITY = .3;  // an entry of B is inserted only when its random draw falls below this threshold
+  Tensor<double> B("B", {NUM_I, NUM_J}, csr);
+  srand(75883);  // fixed seed for the rand() calls that decide B's entries
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  B.pack();
+  // C is declared with the csr format, but the loops below insert a value at every (i, j).
+
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> C("C", {NUM_J, NUM_K}, csr);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  C.pack();
+  // v is the dense vector operand; the "packing D mat" message below refers to v.
+  Tensor<double> v("v", {NUM_K}, rm);
+  for (int i = 0; i < v.getDimension(0); ++i) {
+      v.insert({i}, unif(gen));
+  }
+  std::cout << "packing D mat\n";
+  v.pack();
+  // A is the output; ref is declared but only referenced by the commented-out reference path.
+  Tensor<double> A("A", {NUM_I}, rm);
+  Tensor<double> ref("ref", {NUM_I}, rm);
+  IndexVar i, j, k, l, m;  // l and m are not used in this test
+  A(i) = B(i,j) * C(j,k) * v(k);
+  // Transformations are applied to the assignment in the order written below; the order is significant.
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("SpMVfused", stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1);  // NOTE(review): meaning of "f" and 1 is set by loopFusionOverFission, not visible here
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt);
+
+  A.compile(stmt);
+  // Run the assembly step of the compiled code for A; the compute step is
+  // invoked (and timed) further down. The kernel here is not MTTKRP.
+  A.assemble();
+
+  // Disabled reference paths kept for comparison: the unfused kernel (ref) and a two-kernel split (ref1, ref2).
+  // ref(i) = B(i,j) * C(j,k) * v(k);
+  // IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  // refStmt = makeConcreteNotation(refStmt);
+  // refStmt = insertTemporaries(refStmt);
+  // refStmt = parallelizeOuterLoop(refStmt);
+  // ref.compile(refStmt);
+  // ref.assemble();
+
+  // Tensor<double> ref1({NUM_J}, rm);
+  // Tensor<double> ref2({NUM_I}, rm);
+  // ref1(j) = C(j,k) * v(k);
+  // ref2(i) = B(i,j) * ref1(j);
+
+  // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  // ref1Stmt = makeConcreteNotation(ref1Stmt);
+  // ref1Stmt = insertTemporaries(ref1Stmt);
+  // ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  // ref1.compile(ref1Stmt);
+  // ref1.assemble();
+
+  // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  // ref2Stmt = makeConcreteNotation(ref2Stmt);
+  // ref2Stmt = insertTemporaries(ref2Stmt);
+  // ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  // ref2.compile(ref2Stmt);
+  // ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;  // presumably read by TOOL_BENCHMARK_TIMER; confirm against the macro definition
+  // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue);
+  TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue);
+  // ASSERT_TENSOR_EQ(ref, A);
+
+  // // check results
+  // for (int q = 0; q < A.getDimension(0); ++q) {
+  //   if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) {
+  //     std::cout << "error: results don't match A("<< q << "): " 
+  //       << A(q) << ", ref: " << ref(q) << std::endl;
+  //     ASSERT_TRUE(false);
+  //   }
+  // }
+  // // ASSERT_TENSOR_EQ(A, ref);
+  // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue);
+  // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue);
+  // ASSERT_TENSOR_EQ(ref, ref2);
+
+  // for (int q = 0; q < ref2.getDimension(0); ++q) {
+  //   for (int w = 0; w < ref2.getDimension(1); ++w) {
+  //     if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+  //       std::cout << "error: results don't match A("<< q << "," << w << "): " 
+  //         << ref2(q,w) << ", ref: " << ref(q,w) << std::endl;
+  //       ASSERT_TRUE(false);
+  //     }
+  //   }
+  // }
+  // NOTE(review): with every check above commented out, this test makes no assertion about A's contents.
+}
+
+TEST(scheduling_eval, spmvFused) {  // A(i) = B(i,j) * C(j,k) * v(k) with B and C both read from one Matrix Market file
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+  // Statistics log, opened in append mode. NOTE(review): absolute, machine-specific path.
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmv-spmv.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nspmv-spmv execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+  // Fixed seed (0): v receives the same values on every run with a given standard library.
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format  rm({dense});
+
+
+  // Index into matfiles selecting the input matrix (1 is cage3.mtx).
+  int filenum = 1;
+  // NOTE(review): absolute, machine-specific input paths; the selected file must exist for read() below.
+  std::vector<std::string> matfiles = {
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15
+    "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx"
+  };
+  std::vector<std::string> matfilesrw = {  // not referenced again in this test
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx"
+  };
+
+  // B is loaded from the file selected by filenum.
+  std::cout << "reading B mat mtx\n";
+
+
+  int kDim = 8;  // not used in this test
+  float SPARSITY = .3;  // not used in this test
+  std::string matfile = matfiles[filenum];
+  std::cout << "reading B mat mtx\n";
+  Tensor<double> B = read(matfile, csr, true);  // third argument: presumably "pack on read" — TODO confirm against read()'s declaration
+  B.setName("B");
+  B.pack();
+
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  // C is loaded from the same file as B (the message printed below still says "B").
+  std::cout << "reading B mat mtx\n";
+  Tensor<double> C = read(matfile, csr, true);
+  C.setName("C");
+  C.pack();
+
+  // v is a dense vector sized to C's second dimension and filled from the seeded generator.
+  Tensor<double> v("v", {C.getDimension(1)}, rm);
+  for (int i = 0; i < v.getDimension(0); ++i) {
+      v.insert({i}, unif(gen));
+  }
+  std::cout << "packing D mat\n";
+  v.pack();
+
+  if (statfile.is_open()) {
+    statfile 
+      << "A(i) = B(i,j) * C(j,k) * v(k);" << std::endl
+      << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+      << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+      << "D1_dimension: " << v.getDimension(0) << ", vals: " << v.getStorage().getValues().getSize() << std::endl  // the "D1" label describes v
+      << std::endl;
+  }
+
+  Tensor<double> A("A", {B.getDimension(0)}, rm);
+  Tensor<double> ref("ref", {B.getDimension(0)}, rm);
+  IndexVar i, j, k, l, m;  // l and m are not used in this test
+  A(i) = B(i,j) * C(j,k) * v(k);
+  // Reference kernel: the same expression, lowered without the reorder/fusion steps applied to A below.
+  ref(i) = B(i,j) * C(j,k) * v(k);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+  // Fused kernel: the same expression with reorderLoopsTopologically and loopFusionOverFission applied first.
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("SpMVfused", stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1);
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt);
+  A.compile(stmt);
+  A.assemble();
+
+  // NOTE(review): A is compiled and assembled here, but A.compute() is never called in this test.
+  // Tensor<double> ref1({NUM_J}, rm);
+  // Tensor<double> ref2({NUM_I}, rm);
+  // ref1(j) = C(j,k) * v(k);
+  // ref2(i) = B(i,j) * ref1(j);
+
+  // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  // ref1Stmt = makeConcreteNotation(ref1Stmt);
+  // ref1Stmt = insertTemporaries(ref1Stmt);
+  // ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  // ref1.compile(ref1Stmt);
+  // ref1.assemble();
+
+  // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  // ref2Stmt = makeConcreteNotation(ref2Stmt);
+  // ref2Stmt = insertTemporaries(ref2Stmt);
+  // ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  // ref2.compile(ref2Stmt);
+  // ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+    std::string sofused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.so";
+  // NOTE(review): the reference tensor is run with the shared object named spmv_fused.so — confirm this pairing is intended.
+  TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nReference Kernel: ", timevalue);
+
+  
+  std::cout << "b1 dim: " << B.getTacoTensorT()->dimensions[1] << std::endl;
+  // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nFused Kernel: ", timevalue);
+  // ASSERT_TENSOR_EQ(ref, A);
+
+  // // check results
+  // for (int q = 0; q < A.getDimension(0); ++q) {
+  //   if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) {
+  //     std::cout << "error: results don't match A("<< q << "): " 
+  //       << A(q) << ", ref: " << ref(q) << std::endl;
+  //     ASSERT_TRUE(false);
+  //   }
+  // }
+  // // ASSERT_TENSOR_EQ(A, ref);
+  // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue);
+  // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue);
+  // ASSERT_TENSOR_EQ(ref, ref2);
+
+  // for (int q = 0; q < ref2.getDimension(0); ++q) {
+  //   for (int w = 0; w < ref2.getDimension(1); ++w) {
+  //     if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+  //       std::cout << "error: results don't match A("<< q << "," << w << "): " 
+  //         << ref2(q,w) << ", ref: " << ref(q,w) << std::endl;
+  //       ASSERT_TRUE(false);
+  //     }
+  //   }
+  // }
+  // NOTE(review): with every check above commented out, this test makes no assertion about the results.
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+}
+
+TEST(scheduling_eval, sddmmFusedWithSyntheticData) {  // checks fused A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l) against unfused and two-kernel results
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+  // Fixed seed (0): C, D and F receive the same values on every run with a given standard library.
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  // csr: {dense, sparse}, used for B and ref1; rm: {dense, dense}, used for every other matrix here.
+  Format csr({dense, sparse});
+  Format  rm({dense, dense});
+  int ldim = 4;
+  int kdim = 8;
+
+  // B is generated in place below (and then written out); nothing is read from an .mtx file despite the message.
+  std::cout << "reading B mat mtx\n";
+
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  float SPARSITY = .3;  // an entry of B is inserted only when its random draw falls below this threshold
+  Tensor<double> B("B", {NUM_I, NUM_J}, csr);
+  srand(75883);  // fixed seed for the rand() calls that decide B's entries
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  B.pack();
+  write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B);  // NOTE(review): writes to an absolute, machine-specific path
+
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> C({B.getDimension(0), kdim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  C.pack();
+
+  Tensor<double> D({B.getDimension(1), kdim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing D mat\n";
+  D.pack();
+
+  Tensor<double> F({B.getDimension(1), ldim}, rm);
+  for (int i = 0; i < F.getDimension(0); ++i) {
+    for (int j = 0; j < F.getDimension(1); ++j) {
+      F.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing F mat\n";
+  F.pack();
+
+  Tensor<double> A({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref({B.getDimension(0), ldim}, rm);
+  IndexVar i, j, k, l;
+  A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+  // Fused schedule; printToFile is called after each transformation with a step-specific name.
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("fusedMMConcrete", stmt);
+  
+  stmt = reorderLoopsTopologically(stmt);
+  printToFile("fusedMMOrdered", stmt);
+  
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  printToFile("fusedMMFused", stmt);
+
+  stmt = insertTemporaries(stmt);
+  printToFile("fusedMMWithTemps", stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("fusedMMFusedPar", stmt);
+
+  A.compile(stmt);
+  // Run the assembly step of the compiled code for A; the compute step is
+  // invoked (and timed) further down. The kernel here is not MTTKRP.
+  A.assemble();
+
+  // Reference: the same expression, lowered without the reorder/fusion steps.
+  ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+  // Two-kernel baseline: ref1(i,j) = B*C*D stored as csr, then ref2(i,l) = ref1(i,j) * F(j,l).
+  Tensor<double> ref1({B.getDimension(0), B.getDimension(1)}, csr);
+  Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+  ref1(i,j)=B(i,j)*C(i,k)*D(j,k);
+  ref2(i,l)=ref1(i,j)*F(j,l);
+
+  IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  ref1Stmt = makeConcreteNotation(ref1Stmt);
+  ref1Stmt = insertTemporaries(ref1Stmt);
+  ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  ref1.compile(ref1Stmt);
+  ref1.assemble();
+
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;  // presumably read by TOOL_BENCHMARK_TIMER; confirm against the macro definition
+  TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue);
+  TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue);
+
+  // check results: relative error of A against ref. NOTE(review): divides by |ref(q,w)|, which may be zero (inf/NaN ratio).
+  for (int q = 0; q < A.getDimension(0); ++q) {
+    for (int w = 0; w < A.getDimension(1); ++w) {
+      if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+        std::cout << "error: results don't match A("<< q << "," << w << "): " 
+          << A(q,w) << ", ref: " << ref(q,w) << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+  }
+  // ASSERT_TENSOR_EQ(A, ref);
+  TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue);
+  // Same relative-error check for the two-kernel result ref2 (the printed message still says "A").
+  for (int q = 0; q < ref2.getDimension(0); ++q) {
+    for (int w = 0; w < ref2.getDimension(1); ++w) {
+      if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+        std::cout << "error: results don't match A("<< q << "," << w << "): " 
+          << ref2(q,w) << ", ref: " << ref(q,w) << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+  }
+
+}
+
+
+IndexStmt scheduleSDDMMCPU_forfuse(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {  // returns stmt with the split/pos/reorder/parallelize chain below applied
+  IndexVar i, j, k, l, m;  // NOTE(review): declared locally with no names; confirm they denote the same index variables that occur in stmt (l, m unused)
+  IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1");  // variables introduced by the split/pos calls below
+  return stmt.split(i, i0, i1, CHUNK_SIZE)  // i -> (i0, i1) using CHUNK_SIZE
+          .pos(k, kpos, B(i,k))  // k -> kpos, taken with respect to the access B(i,k)
+          .split(kpos, kpos0, kpos1, UNROLL_FACTOR)  // kpos -> (kpos0, kpos1) using UNROLL_FACTOR
+          .reorder({i0, i1, kpos0, j, kpos1})  // requested loop order
+          .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)  // i0 is tagged for CPU threads
+          .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);  // kpos1 is tagged for CPU vector lanes
+}  // in the portion of the file visible to this review, the only reference to this helper is a commented-out call
+
+TEST(scheduling_eval, sddmmFused) {  // times fused, two-kernel and unfused A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l) on matrices read from file
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+  // Statistics log, opened in append mode. NOTE(review): absolute, machine-specific path.
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nsddmm-spmm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+  // Fixed seed (0); the generator is shared across loop iterations, so later matrices continue the same sequence.
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format rm({dense, dense});
+  int ldim = 128;
+  int kdim = 128;
+
+  // vector<int> filenums = {2,3,4,5,6,7,8,9,10,12,15};
+  // Indices into matfiles to benchmark; only entry 0 (cora.mtx) is active.
+  vector<int> filenums = {0};
+
+  for (auto filenum : filenums) {
+  // NOTE(review): the loop body is not indented; it runs down to the "end of for loop" brace.
+  // int filenum = 5;
+  // NOTE(review): both path lists are rebuilt on every iteration and contain absolute, machine-specific paths.
+  std::vector<std::string> matfiles = {
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // index 7
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // index 10
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // index 12
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // index 14
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // index 15
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // index 17
+    "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx"
+  };
+  std::vector<std::string> matfilesrw = {  // only referenced by the commented-out write() below
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx"
+  };
+
+  std::string matfile = matfiles[filenum];
+  std::cout << "reading B mat mtx\n";
+  Tensor<double> B = read(matfile, csr, true);
+  B.setName("B");
+  B.pack();
+  // write(matfilesrw[filenum], B);
+
+  if (statfile.is_open()) {
+    statfile << matfile << std::endl;
+  }
+  // C, D and F are {dense, dense} operands sized from B and filled from the seeded generator.
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> C({B.getDimension(0), kdim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  C.pack();
+
+  Tensor<double> D({B.getDimension(1), kdim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing D mat\n";
+  D.pack();
+
+  Tensor<double> F({B.getDimension(1), ldim}, rm);
+  for (int i = 0; i < F.getDimension(0); ++i) {
+    for (int j = 0; j < F.getDimension(1); ++j) {
+      F.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing F mat\n";
+  F.pack();
+
+  Tensor<double> A({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref({B.getDimension(0), ldim}, rm);
+  IndexVar i, j, k, l, m;  // m is not used in this test
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1");  // jpos*, k0 and k1 appear only in commented-out code
+  A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+  if (statfile.is_open()) {
+    statfile 
+      << "ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);" << std::endl
+      << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+      << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+      << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+      << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl  // the "E" label describes F
+      << std::endl;
+  }
+  // Fused schedule for A: reorder, fuse, split i by 16, then insert temporaries and parallelize.
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  stmt = stmt
+    .split(i, i0, i1, 16);
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+
+  A.compile(stmt);
+  A.assemble();
+
+  // Unfused reference for the same expression, with i split by 16 and an explicit loop order.
+  ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = refStmt
+    .split(i, i0, i1, 16)
+    .reorder({i0, i1, j, k, l});
+  stmt = insertTemporaries(stmt);  // NOTE(review): reassigns stmt after A.compile(stmt) while refStmt is being built; looks like a copy-paste slip — confirm
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+  // Two-kernel baseline: ref1(i,j) = B*C*D stored as csr, then ref2(i,l) = ref1(i,j) * F(j,l).
+  Tensor<double> ref1({B.getDimension(0), B.getDimension(1)}, csr);
+  Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+  ref1(i,j)=B(i,j)*C(i,k)*D(j,k);
+  ref2(i,l)=ref1(i,j)*F(j,l);
+
+  IndexStmt ref1Stmt = ref1.getAssignment().concretize(); // anyway Ryan's kernel is used here
+  
+  ref1Stmt = ref1Stmt.split(i, i0, i1, 16);
+          // .pos(j, jpos, B(i,j));
+          // .split(k, k0, k1, 8);
+          // .reorder({i0, i1, jpos0, k, jpos1});
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+          // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+  // ref1Stmt.split(i, );
+  // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B);
+  // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  // ref1Stmt = makeConcreteNotation(ref1Stmt);
+  ref1Stmt = insertTemporaries(ref1Stmt);
+  ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  ref1.compile(ref1Stmt);
+  ref1.assemble();
+
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); // Ryan's SpMM kernel is used here
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  
+  std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so";  // NOTE(review): declared but not passed to A.compute below
+  TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "fused time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  statfile << "\nseparate execution\n";  // NOTE(review): written without the is_open() guard used elsewhere in this test
+  
+  // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so";
+  TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "sddmm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so";
+  TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue);  // second timing of ref1, logged under the same "sddmm time: " label
+  if (statfile.is_open()) {
+    statfile << "sddmm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+  
+  std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "spmm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  statfile << "\nreference execution \n";  // NOTE(review): also written without the is_open() guard
+
+  std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so";
+  TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "taco reference time: ";
+    statfile << timevalue << std::endl;  // streams the whole TimeResults, unlike the .mean logged for the other kernels
+  } else { std::cout << " stat file is not open\n"; }
+  // Raw value arrays of the three results; A is compared element by element against ref and ref2 below.
+  double* A_vals = (double*) (A.getTacoTensorT()->vals);
+  double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+  double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+
+  // int* A2_pos = (double*) (ref.getTacoTensorT()->vals);
+
+  // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) {
+  //   if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+  //     std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+  //       << "refvals: " << ref_vals[q] << std::endl;
+  //     ASSERT_TRUE(false);
+  //   }
+  // }
+  // NOTE(review): the bound is an int product compared with size_t q, and a zero reference value makes the ratio inf/NaN.
+  for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) {
+    if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+  for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) {
+    if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref2_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+  // // for (int q= 0; q< A_vals
+  // for (int q = 0; q < A.getDimension(0); ++q) {
+  //   for (int w = 0; w < A.getDimension(1); ++w) {
+  //     if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+  //       std::cout << "error: results don't match A("<< q << "," << w << "): " 
+  //         << A(q,w) << ", ref: " << ref(q,w) << std::endl;
+  //       ASSERT_TRUE(false);
+  //     }
+  //   }
+  // }
+  // ASSERT_TENSOR_EQ(A, ref);
+
+  } // end of for loop
+
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+}
+
+
+
+
+TEST(scheduling_eval, hadamardFused) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+  // Benchmarks the fused kernel A(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l) and checks it against unfused references.
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/hadamard-gemm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nhadamard-gemm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format rm({dense, dense});
+  int kdim = 128;
+  int ldim = 128;
+
+  // Indices into matfiles below; list more indices here to benchmark more matrices.
+  vector<int> filenums = {0};
+
+  for (auto filenum : filenums) {
+
+  // NOTE: both path tables hold machine-specific absolute paths; the trailing numbers are table indices.
+
+  std::vector<std::string> matfiles = {
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 4
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 7
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 10
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 12
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 14
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 15
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 17
+    "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx"
+  };
+  std::vector<std::string> matfilesrw = {
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx"
+  };
+
+  std::string matfile = matfiles.at(filenum);  // bounds-checked: a bad index throws instead of reading out of range
+  std::cout << "reading B mat mtx\n";
+  Tensor<double> B = read(matfile, csr, true);
+  B.setName("B");
+  B.pack();
+  // write(matfilesrw[filenum], B);
+
+  if (statfile.is_open()) {
+    statfile << matfile << std::endl;
+  }
+
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> C({B.getDimension(1), kdim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  C.pack();
+
+  Tensor<double> D({B.getDimension(1), kdim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing D mat\n";
+  D.pack();
+
+  Tensor<double> F({kdim, ldim}, rm);
+  for (int i = 0; i < F.getDimension(0); ++i) {
+    for (int j = 0; j < F.getDimension(1); ++j) {
+      F.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing F mat\n";
+  F.pack();
+
+  Tensor<double> A({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref({B.getDimension(0), ldim}, rm);
+  IndexVar i, j, k, l, m;
+  IndexVar i0("i0"), i1("i1"), l0("l0"), l1("l1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1");
+  A(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l);
+  if (statfile.is_open()) {
+    statfile 
+      << "A(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l);" << std::endl
+      << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+      << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+      << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+      << "F1_dimension: " << F.getDimension(0) << ", F2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl
+      << std::endl;
+  }
+
+  // Schedule for the fused kernel A: loopFusionOverFission, then split i by 16 and parallelize the outer loop.
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  stmt = stmt.reorder({i, j, k, l});
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  stmt = stmt
+    .split(i, i0, i1, 16);
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("fusedMMFusedPar", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+
+  // Reference: the same expression scheduled without loopFusionOverFission.
+  ref(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = refStmt
+    .split(i, i0, i1, 16)
+    .reorder({i0, i1, j, k, l});
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+  // Unfused pipeline: ref1 holds the intermediate B*C*D product, ref2 multiplies it by F.
+  Tensor<double> ref1({B.getDimension(0), kdim}, rm);
+  Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+  ref1(i,k)=B(i,j)*C(j,k)*D(j,k);
+  ref2(i,l)=ref1(i,k)*F(k,l);
+
+  // IndexStmt ref1Stmt = ref1.getAssignment().concretize();
+  
+  // ref1Stmt = ref1Stmt.split(i, i0, i1, 16);
+  //         // .pos(j, jpos, B(i,j));
+  //         // .split(k, k0, k1, 8);
+  //         // .reorder({i0, i1, jpos0, k, jpos1});
+  //         // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+  //         // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+  // // ref1Stmt.split(i, );
+  // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B);
+  IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  ref1Stmt = makeConcreteNotation(ref1Stmt);
+  ref1Stmt = ref1Stmt
+    .split(i, i0, i1, 16)
+    .reorder({i0, i1, j, k});
+    // .pos(j, jpos, B(i,j))
+    // .split(jpos, jpos0, jpos1, 32)
+    // .split(k, k0, k1, 32)
+    // .reorder({i0, i1, jpos0, k0, jpos1, k1});
+  ref1Stmt = insertTemporaries(ref1Stmt);
+  ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  ref1.compile(ref1Stmt);
+  ref1.assemble();
+
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = ref2Stmt
+    .split(i, i0, i1, 32)
+    .split(k, k0, k1, 32)
+    .split(l, l0, l1, 32)
+    .reorder({i0, k0, l0, i1, k1, l1});
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;  // NOTE(review): presumably read by TOOL_BENCHMARK_TIMER; confirm
+  
+  TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "fused time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+  
+  // // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so";
+  TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nHadamard Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "hadamard time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  // std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so";
+  // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue);
+  // if (statfile.is_open()) {
+  //   statfile << "sddmm time: ";
+  //   statfile << timevalue.mean << std::endl;
+  // } else { std::cout << " stat file is not open\n"; }
+  
+  // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "gemm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so";
+  TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "taco reference time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  double* A_vals = (double*) (A.getTacoTensorT()->vals);
+  double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+  double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+
+  // Compare the fused result element-wise (relative error) with ref, then with ref2; 64-bit count avoids int overflow.
+
+  for (int64_t q = 0; q < static_cast<int64_t>(A.getDimension(0)) * A.getDimension(1); q++) {
+    if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+  for (int64_t q = 0; q < static_cast<int64_t>(A.getDimension(0)) * A.getDimension(1); q++) {
+    if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref2_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+  } // end of for loop
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+}
+
+
+
+
+
+
+TEST(scheduling_eval, mttkrpFusedWithSyntheticData) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+  // Checks the fused schedule for A(i,m) = B(i,k,l)*D(l,j)*C(k,j)*E(j,m) on synthetic data.
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  // Predeclare the storage formats that the inputs and output will be stored as.
+  // To define a format, you must specify whether each dimension is dense or 
+  // sparse and (optionally) the order in which dimensions should be stored. The 
+  // formats declared below correspond to compressed sparse fiber (csf) and 
+  // row-major dense (rm).
+  Format csf({Sparse,Sparse,Sparse});
+  Format rm({Dense,Dense});
+  Format sd({Dense,Dense});
+
+  int NUM_I = 1021/20;
+  int NUM_J = 1039/20;
+  int NUM_K = 1057/20;
+  int NUM_L = 1232/20;
+  int NUM_M = 1231/20;
+  float SPARSITY = .1;
+  Tensor<double> A("A", {NUM_I, NUM_M}, sd);
+  Tensor<double> B("B", {NUM_I, NUM_K, NUM_L}, csf);
+  Tensor<double> C("C", {NUM_K, NUM_J}, rm);
+  Tensor<double> D("D", {NUM_L, NUM_J}, rm);
+  Tensor<double> E("E", {NUM_J, NUM_M}, rm);
+  Tensor<double> ref({NUM_I, NUM_M}, sd);
+  // Fill B with a fixed seed; a coordinate is inserted only when its random draw falls below SPARSITY.
+  srand(549694);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      for (int l = 0; l < NUM_L; l++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+  B.pack();
+  write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B);
+
+  // Generate a random dense matrix and store it in row-major (dense) format. 
+  // Matrices correspond to order-2 tensors in taco.
+  for (int k = 0; k < NUM_K; k++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({k, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+  C.pack();
+
+  for (int l = 0; l < NUM_L; l++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({l, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+  D.pack();
+  // E is filled from the uniform_real_distribution over [0, 1) declared above rather than from rand().
+  for (int i = 0; i < E.getDimension(0); ++i) {
+    for (int j = 0; j < E.getDimension(1); ++j) {
+      E.insert({i,j}, unif(gen));
+    }
+  }
+  E.pack();
+
+  // Define the computation (MTTKRP followed by a multiplication with E) using index notation.
+  IndexVar i, k, l, j, m;
+  A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m);
+
+  // Schedule A step by step, dumping the statement with printToFile after each transformation.
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("fusedMTTKRPConcrete", stmt);
+  
+  stmt = reorderLoopsTopologically(stmt);
+  printToFile("fusedMTTKRPOrdered", stmt);
+  
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  printToFile("fusedMTTKRPFused", stmt);
+
+  stmt = insertTemporaries(stmt);
+  printToFile("fusedMTTKRPWithTemps", stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("fusedMTTKRPFusedPar", stmt);
+
+  
+  // At this point, we have defined how entries in the output matrix should be
+  // computed from entries in the input tensor and matrices but have not actually
+  // performed the computation yet. To do so, we must first tell taco to generate
+  // code that can be executed to compute the scheduled statement.
+  A.compile(stmt);
+  // We can now call the functions taco generated to assemble the indices of the
+  // output matrix; the computation itself runs inside the timed section below.
+  A.assemble();
+
+  // ref evaluates the same expression but skips reorderLoopsTopologically and loopFusionOverFission.
+  ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();  
+
+  // Tensor<double> ref2({NUM_I, NUM_J}, sd);
+  // ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j);
+  // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  // ref2Stmt = makeConcreteNotation(ref2Stmt);
+  // ref2Stmt = insertTemporaries(ref2Stmt);
+  // ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  // ref2.compile(ref2Stmt);
+  // ref2.assemble(); 
+
+  // Tensor<double> ref3({NUM_I, NUM_M}, sd);
+  // ref3(i,m) = ref2(i,j) * E(j,m);
+  // IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+  // ref3Stmt = makeConcreteNotation(ref3Stmt);
+  // ref3Stmt = insertTemporaries(ref3Stmt);
+  // ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+  // ref3.compile(ref3Stmt);
+  // ref3.assemble();  
+  // Time both kernels, then check that the fused output A matches ref.
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue);
+  TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused MTTKRP+SPMM: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference MTTKRP+SPMM: ", timevalue);
+  // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nReference MTTKRP: ", timevalue);
+  // TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\nReference SPMM: ", timevalue);
+  ASSERT_TENSOR_EQ(ref, A);
+  // ASSERT_TENSOR_EQ(ref, ref3);
+
+}
+
+
+TEST(scheduling_eval, mttkrpFused) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+  // Benchmarks fused A(i,m) = B(i,k,l)*D(l,j)*C(k,j)*E(j,m) on a tensor read from file against unfused variants.
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/mttkrp-spmm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nmttkrp-spmm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  // Predeclare the storage formats that the inputs and output will be stored as.
+  // To define a format, you must specify whether each dimension is dense or
+  // sparse and (optionally) the order in which dimensions should be stored. Here
+  // csf keeps the first mode dense and the other two sparse; rm and sd are both
+  // row-major dense.
+  Format csf({Dense,Sparse,Sparse});
+  Format rm({Dense,Dense});
+  Format sd({Dense,Dense});
+  int jDim = 32;
+  int mDim = 64;
+
+  int matfilenum = 3;
+
+  // Load a sparse order-3 tensor from file (stored in the FROSTT format) in the
+  // csf format declared above. matfilenum selects the entry; the default (nell-2)
+  // can be downloaded from: http://frostt.io/tensors/nell-2/
+  std::vector<std::string> matfiles = {
+    "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns", 
+    "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2
+    "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3
+    "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4
+    "/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns", // 6
+    "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns" // 8
+  };
+  std::vector<std::string> matfilesrw = {
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns", // 2 
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns", //  3
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns", //   4
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns",  // 6
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns"
+  };
+  std::string matfile = matfiles.at(matfilenum);  // bounds-checked: a bad index throws instead of reading out of range
+  Tensor<double> B = read(matfile, csf, true);
+  // write(matfilesrw[matfilenum], B);
+
+  // Generate a random dense matrix and store it in row-major (dense) format. 
+  // Matrices correspond to order-2 tensors in taco.
+  Tensor<double> C({B.getDimension(1), jDim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  C.pack();
+
+  // Generate another random dense matrix and store it in row-major format.
+  Tensor<double> D({B.getDimension(2), jDim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  D.pack();
+
+  Tensor<double> E({jDim, mDim}, rm);
+  for (int i = 0; i < E.getDimension(0); ++i) {
+    for (int j = 0; j < E.getDimension(1); ++j) {
+      E.insert({i,j}, unif(gen));
+    }
+  }
+  E.pack();
+
+  if (statfile.is_open()) {
+    statfile 
+      << matfile << std::endl
+      << "A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)" << std::endl
+      << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(2) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+      << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+      << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+      << "E1_dimension: " << E.getDimension(0) << ", E2_dimension: " << E.getDimension(1) << ", vals: " << E.getStorage().getValues().getSize() << std::endl
+      << std::endl;
+  }
+
+  // Declare the output matrix to be a dense matrix with mDim columns and the same
+  // number of rows as the number of slices along the first dimension of input
+  // tensor B, to be also stored as a row-major dense matrix.
+  Tensor<double> A({B.getDimension(0), mDim}, sd);
+  Tensor<double> ref({B.getDimension(0), mDim}, sd);
+
+  // Define the computation (MTTKRP followed by a multiplication with E) using index notation.
+  IndexVar i, k, l, j, m;
+  IndexVar i1("i1"), i2("i2"), j1("j1"), j2("j2"), m1("m1"), m2("m2");
+
+  A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m);
+
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  // stmt = stmt.reorder({i,j,k,l,m});
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  stmt = stmt.split(i, i1, i2, 16);
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("fusedMTTKRPFusedPar", stmt);
+  A.compile(stmt);
+  A.assemble();
+
+  // Reference: the same expression scheduled without loopFusionOverFission.
+  ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = refStmt
+    .split(i, i1, i2, 16);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+  // Unfused pipeline: ref2 holds the intermediate B*D*C product, ref3 (below) multiplies it by E.
+  Tensor<double> ref2({B.getDimension(0), jDim}, sd);
+  ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j);
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = ref2Stmt
+    .split(i, i1, i2, 16);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble(); 
+  // ref2_ryan: the same expression as ref2, scheduled by hand with a precompute into workspace w.
+  Tensor<double> ref2_ryan({B.getDimension(0), jDim}, sd);
+  ref2_ryan(i,j) = B(i,k,l) * D(l,j) * C(k,j);
+
+  IndexStmt ref2RyanStmt = makeReductionNotation(ref2_ryan.getAssignment());
+  ref2RyanStmt = makeConcreteNotation(ref2RyanStmt);
+  
+  IndexExpr precomputeExpr = ref2RyanStmt.as<Forall>().getStmt().as<Forall>().getStmt()
+                                 .as<Forall>().getStmt().as<Forall>().getStmt()
+                                 .as<Assignment>().getRhs().as<Mul>().getA();
+  TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense);
+  ref2RyanStmt = ref2RyanStmt.split(i, i1, i2, 16)
+          .reorder({i1, i2, k, l, j})
+          .precompute(precomputeExpr, j, j, w)
+          .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+  ref2RyanStmt = insertTemporaries(ref2RyanStmt);
+  // ref2RyanStmt = parallelizeOuterLoop(ref2RyanStmt);
+  ref2_ryan.compile(ref2RyanStmt);
+  ref2_ryan.assemble(); 
+
+  Tensor<double> ref3({B.getDimension(0), mDim}, sd);
+  ref3(i,m) = ref2(i,j) * E(j,m);
+  IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+  ref3Stmt = makeConcreteNotation(ref3Stmt);
+  ref3Stmt = ref3Stmt
+    .split(i, i1, i2, 16)
+    .split(j, j1, j2, 16)
+    .split(m, m1, m2, 16)
+    .reorder({i1, j1, m1, i2, j2, m2})
+    .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+  ref3Stmt = insertTemporaries(ref3Stmt);
+  ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+  ref3.compile(ref3Stmt);
+  ref3.assemble(); 
+
+  // ref2 must be computed before ref3, which reads it; results are compared with relative error.
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;  // NOTE(review): presumably read by TOOL_BENCHMARK_TIMER; confirm
+
+  TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nDefault MTTKRP: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "default mttkrp time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  TOOL_BENCHMARK_TIMER(ref2_ryan.compute(statfile), "\n\nRyan MTTKRP workspace: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "ryan mttkrp workspace time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+  double* ref2_ryan_vals = (double*) (ref2_ryan.getTacoTensorT()->vals);
+  for (int64_t q = 0; q < static_cast<int64_t>(B.getDimension(0)) * jDim; q++) {  // 64-bit count avoids int overflow
+    if ( abs(ref2_vals[q] - ref2_ryan_vals[q])/abs(ref2_ryan_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << ref2_vals[q] << " "
+        << "refvals: " << ref2_ryan_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+  TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM time: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "GeMM time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+
+  TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference MTTKRP+GEMM: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "reference asymptotic blowup time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals);
+  double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+  for (int64_t q = 0; q < static_cast<int64_t>(B.getDimension(0)) * mDim; q++) {
+    if ( abs(ref3_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+  TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused MTTKRP+GEMM: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "fused mttkrp+gemm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+  double* A_vals = (double*) (A.getTacoTensorT()->vals);
+  for (int64_t q = 0; q < static_cast<int64_t>(B.getDimension(0)) * mDim; q++) {
+    if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+
+}
+
+TEST(scheduling_eval, ttmFusedWithSyntheticData) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  // Checks the fused schedule for A(i,j,m) = B(i,j,k)*C(k,l)*D(l,m) on synthetic data against unfused variants.
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  Format csf({Sparse,Sparse,Sparse});
+  Format custom({Sparse,Sparse,Dense});
+  Format rm({Dense,Dense});
+
+  int NUM_I = 5;
+  int NUM_J = 5;
+  int NUM_K = 5;
+  int NUM_L = 64;
+  int NUM_M = 1024;
+  float SPARSITY = .1;
+
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, csf);
+  srand(549694);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+  B.pack();
+  write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B);
+
+  // Generate a random dense matrix and store it in row-major (dense) format. 
+  // Matrices correspond to order-2 tensors in taco.
+  Tensor<double> C({B.getDimension(2), NUM_L}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  C.pack();
+
+  // Generate another random dense matrix and store it in row-major format.
+  Tensor<double> D({NUM_L, NUM_M}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  D.pack();
+
+  Tensor<double> A({B.getDimension(0), B.getDimension(1), NUM_M}, custom);
+  Tensor<double> ref({B.getDimension(0), B.getDimension(1), NUM_M}, custom);
+
+  // Define the chained tensor-times-matrix computation using index notation.
+  IndexVar i, j, k, l, m;
+  A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m);
+
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("fusedTTMTTKRPConcrete", stmt);
+  
+  stmt = reorderLoopsTopologically(stmt);
+  printToFile("fusedTTMOrdered", stmt);
+  
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  printToFile("fusedTTMFused", stmt);
+
+  stmt = insertTemporaries(stmt);
+  printToFile("fusedTTMWithTemps", stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("fusedTTMFinal", stmt);
+
+  
+  // At this point, we have defined how entries in the output tensor should be
+  // computed from entries in the input tensor and matrices but have not actually
+  // performed the computation yet. To do so, we must first tell taco to generate
+  // code that can be executed to compute the scheduled statement.
+  A.compile(stmt);
+  // We can now call the functions taco generated to assemble the indices of the
+  // output tensor; the computation itself runs inside the timed section below.
+  A.assemble();
+
+  // Reference: the same expression scheduled without reordering or loopFusionOverFission.
+  ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  printToFile("tacoFusedTTM", refStmt);
+  ref.compile(refStmt);
+  ref.assemble(); 
+  // Two-step variant: ref1 = B*C, then ref2 = ref1*D.
+  Tensor<double> ref1({B.getDimension(0), B.getDimension(1), NUM_L}, custom);
+  ref1(i,j,l) = B(i,j,k) * C(k,l);
+  IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  ref1Stmt = makeConcreteNotation(ref1Stmt);
+  ref1Stmt = insertTemporaries(ref1Stmt);
+  ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  ref1.compile(ref1Stmt);
+  ref1.assemble();  
+
+  Tensor<double> ref2({B.getDimension(0), B.getDimension(1), NUM_M}, custom);
+  ref2(i,j,m) = ref1(i,j,l) * D(l,m);
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble(); 
+  // Dense-first variant: ref3 = C*D, then ref4 = B*ref3.
+  Tensor<double> ref3({B.getDimension(2), NUM_M}, rm);
+  ref3(k,m) = C(k,l) * D(l,m);
+  IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+  ref3Stmt = makeConcreteNotation(ref3Stmt);
+  ref3Stmt = insertTemporaries(ref3Stmt);
+  ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+  ref3.compile(ref3Stmt);
+  ref3.assemble();  
+
+  Tensor<double> ref4({B.getDimension(0), B.getDimension(1), NUM_M}, custom);
+  ref4(i,j,m) = B(i,j,k) * ref3(k,m);
+  IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment());
+  ref4Stmt = makeConcreteNotation(ref4Stmt);
+  ref4Stmt = insertTemporaries(ref4Stmt);
+  ref4Stmt = parallelizeOuterLoop(ref4Stmt);
+  ref4.compile(ref4Stmt);
+  ref4.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;  // NOTE(review): presumably read by TOOL_BENCHMARK_TIMER; confirm
+  // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue);
+  TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused TTM->TTM: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference TTM->TTM: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nTTM1: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nTTM2: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\ndense: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref4.compute(), "\n\nTTM after dense: ", timevalue);
+  ASSERT_TENSOR_EQ(ref, A);
+  ASSERT_TENSOR_EQ(ref, ref2);
+  ASSERT_TENSOR_EQ(ref, ref4);
+
+  for (int q = 0; q < A.getDimension(0); ++q) {
+    for (int w = 0; w < A.getDimension(1); ++w) {
+      for (int z = 0; z < A.getDimension(2); ++z) {
+        // std::cout << "(" << q << "," << w << "," << z << ")" 
+        //   << "a: " << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl;
+        if ( abs(A(q,w,z) - ref(q,w,z))/abs(ref(q,w,z)) > ERROR_MARGIN) {
+          std::cout << "error: results don't match A: " 
+            << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl;
+          ASSERT_TRUE(false);
+        }
+      }
+    }
+  }
+
+}
+
+// Benchmark of the chained tensor-times-matrix expression
+//   A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)
+// on a sparse order-3 tensor read from disk. The loopFusionOverFission schedule
+// is timed against two other schedules of the same expression and against two
+// two-kernel pipelines (ref1 -> ref2 and ref3 -> ref4).
+// NOTE(review): every result comparison below is commented out, so this test
+// only records timings and asserts nothing about the computed values.
+TEST(scheduling_eval, ttmFused) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+
+  int retval, EventSet = PAPI_NULL;
+  retval = PAPI_hl_region_begin("dummy");  // empty PAPI region, opened and closed before any timed work
+  if ( retval != PAPI_OK ) handle_error(1);
+
+  retval = PAPI_hl_region_end("dummy");
+  if ( retval != PAPI_OK ) handle_error(1);
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/ttm-ttm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nttm-ttm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  Format csf({Dense,Sparse,Sparse});
+  Format custom({Dense,Sparse,Dense});
+  Format rm({Dense,Dense});
+  int ldim = 32;
+  int mdim = 64;
+  // Scratch buffer rewritten before each timed kernel; std::vector releases it on every exit path.
+  int64_t dummy_array_size = 2e6;
+  std::vector<int64_t> dummy_array_to_flush_cache(dummy_array_size);
+
+  vector<int> matfilenums = {5};
+
+  for (auto matfilenum : matfilenums) {
+
+    // Load a sparse order-3 tensor from a FROSTT-format .tns file (see
+    // http://frostt.io/tensors/) and store it with the {Dense,Sparse,Sparse}
+    // format declared above. matfilenum indexes into matfiles.
+    std::vector<std::string> matfiles = {
+      "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2
+      "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3
+      "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4
+      "/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns", // 5 
+      "/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns", // 6
+      "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns"
+    };
+    std::vector<std::string> matfilesrw = {
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns"
+    };
+    statfile << "\nfile: " << matfiles[matfilenum] << std::endl;
+    statfile << "----------------------------------------------------------------\n";
+
+    std::string matfile = matfiles[matfilenum];
+    Tensor<double> B = read(matfile, csf);
+    B.setName("B");
+    B.pack();
+    // write(matfilesrw[matfilenum], B);
+
+    // Generate a random dense matrix and store it in row-major (dense) format. 
+    // Matrices correspond to order-2 tensors in taco.
+    Tensor<double> C("C", {B.getDimension(2), ldim}, rm);
+    for (int i = 0; i < C.getDimension(0); ++i) {
+      for (int j = 0; j < C.getDimension(1); ++j) {
+        C.insert({i,j}, unif(gen));
+      }
+    }
+    C.pack();
+
+    // Generate another random dense matrix and store it in row-major format.
+    Tensor<double> D("D", {ldim, mdim}, rm);
+    for (int i = 0; i < D.getDimension(0); ++i) {
+      for (int j = 0; j < D.getDimension(1); ++j) {
+        D.insert({i,j}, unif(gen));
+      }
+    }
+    D.pack();
+
+    if (statfile.is_open()) {
+      statfile 
+        << matfile << std::endl
+        << "A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)" << std::endl
+        << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(2) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+        << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+        << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+        << std::endl;
+    }
+
+    Tensor<double> A({B.getDimension(0), B.getDimension(1), mdim}, custom);
+    Tensor<double> ref({B.getDimension(0), B.getDimension(1), mdim}, custom);
+    Tensor<double> refn({B.getDimension(0), B.getDimension(1), mdim}, custom);
+
+    // Define the chained TTM->TTM computation using index notation.
+    IndexVar i, j, k, l, m;
+    IndexVar i0,i1, j0, j1, k0, k1, l0, l1, m0, m1;
+    A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m);
+
+    IndexStmt stmt = makeReductionNotation(A.getAssignment());
+    stmt = makeConcreteNotation(stmt);
+    stmt = reorderLoopsTopologically(stmt);
+    stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+    stmt = stmt.split(i, i0, i1, 16);
+    stmt = insertTemporaries(stmt);
+    stmt = parallelizeOuterLoop(stmt);
+    printToFile("fusedTTMFinal", stmt);
+
+    A.compile(stmt);
+    A.assemble();
+
+    ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO
+    IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+    refStmt = makeConcreteNotation(refStmt);
+    refStmt = refStmt
+      .split(i, i0, i1, 16);
+    refStmt = insertTemporaries(refStmt);
+    refStmt = parallelizeOuterLoop(refStmt);
+    printToFile("tacoFusedTTM", refStmt);
+    ref.compile(refStmt);
+    ref.assemble();
+
+    refn(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO
+    IndexStmt refnStmt = makeReductionNotation(refn.getAssignment());
+    refnStmt = makeConcreteNotation(refnStmt);
+    refnStmt = refnStmt
+      .split(i, i0, i1, 16)
+      .reorder({i0, i1, j, k, l, m});
+    refnStmt = insertTemporaries(refnStmt);
+    refnStmt = parallelizeOuterLoop(refnStmt);
+    printToFile("tacoFusedTTM", refnStmt); // NOTE(review): same name as the ref dump above - confirm the reuse is intended
+    refn.compile(refnStmt);
+    refn.assemble();
+
+    Tensor<double> ref1({B.getDimension(0), B.getDimension(1), ldim}, custom);
+    ref1(i,j,l) = B(i,j,k) * C(k,l); // TTM1
+    IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+    ref1Stmt = makeConcreteNotation(ref1Stmt);
+    // ref1Stmt = ref1Stmt.split(i, i0, i1, 16);
+    ref1Stmt = insertTemporaries(ref1Stmt);
+    ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+    ref1.compile(ref1Stmt);
+    ref1.assemble();
+
+    Tensor<double> ref2({B.getDimension(0), B.getDimension(1), mdim}, custom);
+    ref2(i,j,m) = ref1(i,j,l) * D(l,m); // TTM2
+    IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+    ref2Stmt = makeConcreteNotation(ref2Stmt);
+    // ref2Stmt = ref2Stmt.split(i, i0, i1, 16);
+    ref2Stmt = insertTemporaries(ref2Stmt);
+    ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+    ref2.compile(ref2Stmt);
+    ref2.assemble();
+
+    Tensor<double> ref3({B.getDimension(2), mdim}, rm);
+    ref3(k,m) = C(k,l) * D(l,m); // GeMM
+    IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+    ref3Stmt = makeConcreteNotation(ref3Stmt);
+    ref3Stmt = ref3Stmt
+      .split(k, k0, k1, 32)
+      .split(l, l0, l1, 32)
+      .split(m, m0, m1, 32)
+      .reorder({k0, l0, m0, k1, l1, m1});
+    ref3Stmt = insertTemporaries(ref3Stmt);
+    ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+    ref3.compile(ref3Stmt);
+    ref3.assemble();
+
+    Tensor<double> ref4({B.getDimension(0), B.getDimension(1), mdim}, custom);
+    ref4(i,j,m) = B(i,j,k) * ref3(k,m); // TTM on the GeMM result
+    IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment());
+    ref4Stmt = makeConcreteNotation(ref4Stmt);
+    // ref4Stmt = ref4Stmt
+    //   .split(i, i0, i1, 16);
+    //   // .split(k, k0, k1, 16)
+    //   .split(m, m0, m1, 16)
+    //   .reorder({i0, i1, j, m0, k, m1});
+    ref4Stmt = insertTemporaries(ref4Stmt);
+    ref4Stmt = parallelizeOuterLoop(ref4Stmt);
+    ref4.compile(ref4Stmt);
+    ref4.assemble();
+
+    std::cout << "compute start\n";
+    taco::util::TimeResults timevalue;
+    bool time                = true;
+
+    int r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue);
+    std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.so";
+    retval = PAPI_hl_region_begin("fusedTTM"); if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(A.compute(statfile, sofile_fused), "\n\nFused TTM->TTM: ", timevalue);
+    retval = PAPI_hl_region_end("fusedTTM"); if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "fused time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    statfile << "\nreference impl time \n";
+
+    std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.so";
+    retval = PAPI_hl_region_begin("referenceTTM"); if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference TTM->TTM: ", timevalue);
+    retval = PAPI_hl_region_end("referenceTTM"); if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "reference time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    std::string sofile_original2 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original2.so";
+    retval = PAPI_hl_region_begin("ref2TTM"); if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(refn.compute(statfile, sofile_original2), "\n\nReference new TTM->TTM: ", timevalue);
+    retval = PAPI_hl_region_end("ref2TTM"); if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "reference new time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    statfile << "\nschedule 1\n";
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    std::string sofile_ttm11 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_1.so";
+    retval = PAPI_hl_region_begin("ttm1_1"); if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_ttm11), "\n\nTTM1: ", timevalue);
+    retval = PAPI_hl_region_end("ttm1_1"); if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "TTM1: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    std::string sofile_ttm2 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm2.so";
+    retval = PAPI_hl_region_begin("ttm2"); if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_ttm2), "\n\nTTM2: ", timevalue);
+    retval = PAPI_hl_region_end("ttm2"); if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "TTM2: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    statfile << "\nschedule 2\n";
+
+    retval = PAPI_hl_region_begin("gemm"); if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\ndense: ", timevalue);
+    retval = PAPI_hl_region_end("gemm"); if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "dense: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    std::string sofile_ttm12 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_2.so";
+    retval = PAPI_hl_region_begin("ttm1_2"); if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_ttm12), "\n\nTTM after dense: ", timevalue);
+    retval = PAPI_hl_region_end("ttm1_2"); if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "TTM after dense: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    bool istrue = false;
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      if (dummy_array_to_flush_cache[i] != r) {
+        istrue = true;
+      }
+    }
+    std::cout << "istrue: " << istrue << std::endl;
+
+    double* A_vals = (double*) (A.getTacoTensorT()->vals);
+    double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+    double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+    double* ref4_vals = (double*) (ref4.getTacoTensorT()->vals);
+
+    // int* A2_pos = (double*) (ref.getTacoTensorT()->vals);
+
+    // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) {
+    //   if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+    //     std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+    //       << "refvals: " << ref_vals[q] << std::endl;
+    //     ASSERT_TRUE(false);
+    //   }
+    // }
+
+    // std::cout << "our fused vs taco original fused check\n";
+    // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) {
+    //   if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+    //     std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+    //       << "refvals: " << ref_vals[q] << std::endl;
+    //     ASSERT_TRUE(false);
+    //   }
+    // }
+    // std::cout << "taco original fused vs TTM1, TTM2 check\n";
+    // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) {
+    //   if ( abs(ref_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) {
+    //     std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " "
+    //       << "refvals: " << ref2_vals[q] << std::endl;
+    //     ASSERT_TRUE(false);
+    //   }
+    // }
+    // std::cout << "taco original fused vs GeMM, TTM1 check\n";
+    // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) {
+    //   if ( abs(ref_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) {
+    //     std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " "
+    //       << "refvals: " << ref4_vals[q] << std::endl;
+    //     ASSERT_TRUE(false);
+    //   }
+    // }
+
+  } // end of forloop
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+}
+
+
+
+
+// Checks the loopFusionOverFission schedule of
+//   A(i,l) = B(i,j) * C(j,k) * D(k,l)
+// against a schedule of the same expression built without it, on small
+// synthetic inputs: B and C are CSR matrices filled from a seeded rand(), D is
+// a random dense matrix. Every entry of A must match ref within ERROR_MARGIN.
+TEST(scheduling_eval, spmmFusedWithSyntheticData) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format  rm({dense, dense});
+  int ldim = 32;
+  int kdim = 64;
+
+  // The sparse operands are generated in memory below; no .mtx file is read here.
+  std::cout << "reading B mat mtx\n";
+
+  int NUM_I = 128;
+  int NUM_J = 96;
+  int NUM_K = 64;
+  float SPARSITY = .3;
+  Tensor<double> B("B", {NUM_I, NUM_J}, csr);
+  srand(75883);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  B.pack();
+  // Second sparse operand, populated with the same value scheme as B.
+  Tensor<double> C("C", {NUM_J, NUM_K}, csr);
+  for (int j = 0; j < NUM_J; j++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        C.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  C.pack();
+  // write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B);
+  // Dense operand D (the log lines around it still say "c mat" / "C mat").
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> D({C.getDimension(1), ldim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  D.pack();
+
+  // Tensor<double> E({B.getDimension(1), kdim}, rm);
+  // for (int i = 0; i < D.getDimension(0); ++i) {
+  //   for (int j = 0; j < D.getDimension(1); ++j) {
+  //     D.insert({i,j}, unif(gen));
+  //   }
+  // }
+  // std::cout << "packing D mat\n";
+  // D.pack();
+
+  // Tensor<double> F({B.getDimension(1), ldim}, rm);
+  // for (int i = 0; i < F.getDimension(0); ++i) {
+  //   for (int j = 0; j < F.getDimension(1); ++j) {
+  //     F.insert({i,j}, unif(gen));
+  //   }
+  // }
+  // std::cout << "packing F mat\n";
+  // F.pack();
+
+  Tensor<double> A({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref({B.getDimension(0), ldim}, rm);
+  IndexVar i, j, k, l;
+  A(i,l)=B(i,j)*C(j,k)*D(k,l);
+
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("fusedMMConcrete", stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  printToFile("fusedMMOrdered", stmt);
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  printToFile("fusedMMFused", stmt);
+  stmt = insertTemporaries(stmt);
+  printToFile("fusedMMWithTemps", stmt);
+  stmt = parallelizeOuterLoop(stmt);
+  printToFile("fusedMMFusedPar", stmt);
+
+  A.compile(stmt);
+  // Assemble the output's index structure now; the values are computed later
+  // under the benchmark timer.
+  A.assemble();
+
+  ref(i,l)=B(i,j)*C(j,k)*D(k,l);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+
+  // Tensor<double> ref1({B.getDimension(0), B.getDimension(1)}, csr);
+  // Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+  // ref1(i,j)=B(i,j)*C(i,k)*D(j,k);
+  // ref2(i,l)=ref1(i,j)*F(j,l);
+
+  // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  // ref1Stmt = makeConcreteNotation(ref1Stmt);
+  // ref1Stmt = insertTemporaries(ref1Stmt);
+  // ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  // ref1.compile(ref1Stmt);
+  // ref1.assemble();
+
+  // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  // ref2Stmt = makeConcreteNotation(ref2Stmt);
+  // ref2Stmt = insertTemporaries(ref2Stmt);
+  // ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  // ref2.compile(ref2Stmt);
+  // ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue);
+  TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue);
+
+  // Element-wise relative-error check. NOTE(review): divides by abs(ref(q,w)); confirm ref has no zero entries.
+  for (int q = 0; q < A.getDimension(0); ++q) {
+    for (int w = 0; w < A.getDimension(1); ++w) {
+      if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+        std::cout << "error: results don't match A("<< q << "," << w << "): " 
+          << A(q,w) << ", ref: " << ref(q,w) << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+  }
+  // // ASSERT_TENSOR_EQ(A, ref);
+  // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue);
+  // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue);
+
+  // for (int q = 0; q < ref2.getDimension(0); ++q) {
+  //   for (int w = 0; w < ref2.getDimension(1); ++w) {
+  //     if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+  //       std::cout << "error: results don't match A("<< q << "," << w << "): " 
+  //         << ref2(q,w) << ", ref: " << ref(q,w) << std::endl;
+  //       ASSERT_TRUE(false);
+  //     }
+  //   }
+  // }
+}
+
+
+TEST(scheduling_eval, spmmFused) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+
+  // int retval, EventSet = PAPI_NULL;
+  // retval = PAPI_hl_region_begin("dummy");
+  // if ( retval != PAPI_OK ) handle_error(1);
+
+  /* Do some computation */
+
+  // retval = PAPI_hl_region_end("dummy");
+  // if ( retval != PAPI_OK ) handle_error(1);
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmm-gemm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nspmm-spmm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format rm({dense, dense});
+  int kdim = 128;
+  int ldim = 64;
+
+  // vector<int> filenums = {2,3,4,5,6,7,8,9,10,12,15};
+  vector<int> filenums = {0};
+
+  for (auto filenum : filenums) {
+
+
+    statfile << "filenum: " << filenum << std::endl;
+    statfile << "---------------------------------\n";
+    // int filenum = 7;
+
+    std::vector<std::string> matfiles = {
+      "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx",
+      "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15
+      "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k.mtx",
+    };
+    std::vector<std::string> matfilesrw = {
+      "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx",
+      "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx"
+    };
+
+    std::string matfile = matfiles[filenum];
+    std::cout << "reading B mat mtx\n";
+    Tensor<double> B = read(matfile, csr);
+    B.pack();
+    // write(matfilesrw[filenum], B);
+
+    if (statfile.is_open()) {
+      statfile << matfile << std::endl;
+    }
+
+    std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+    std::cout << "adding c mat\n";
+    // Tensor<double> C = read(matfiles2[filenum], csr, true);
+    // std::cout << "packing C mat\n";
+
+    std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+    std::cout << "adding c mat\n";
+    Tensor<double> C("C", {B.getDimension(1), kdim}, rm);
+    for (int i = 0; i < C.getDimension(0); ++i) {
+      for (int j = 0; j < C.getDimension(1); ++j) {
+        C.insert({i,j}, unif(gen));
+      }
+    }
+    std::cout << "packing C mat\n";
+    C.pack();
+
+    Tensor<double> D({C.getDimension(1), ldim}, rm);
+    for (int i = 0; i < D.getDimension(0); ++i) {
+      for (int j = 0; j < D.getDimension(1); ++j) {
+        D.insert({i,j}, unif(gen));
+      }
+    }
+    std::cout << "packing D mat\n";
+    D.pack();
+
+    // Tensor<double> F({B.getDimension(1), ldim}, rm);
+    // for (int i = 0; i < F.getDimension(0); ++i) {
+    //   for (int j = 0; j < F.getDimension(1); ++j) {
+    //     F.insert({i,j}, unif(gen));
+    //   }
+    // }
+    // std::cout << "packing F mat\n";
+    // F.pack();
+
+    Tensor<double> A({B.getDimension(0), ldim}, rm);
+    Tensor<double> ref({B.getDimension(0), ldim}, rm);
+    Tensor<double> refn({B.getDimension(0), ldim}, rm);
+    IndexVar i, j, k, l;
+    IndexVar i0, i1, j0, j1, k0, k1, l0, l1;
+
+    A(i,l)=B(i,j)*C(j,k)*D(k,l);
+    if (statfile.is_open()) {
+      statfile 
+        << "ref(i,l)=B(i,j)*C(i,k)*D(j,k);" << std::endl
+        << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+        << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+        << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+        // << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl
+        << std::endl;
+    }
+
+    // IndexStmt stmt = A.getAssignment().concretize();
+    IndexStmt stmt = makeReductionNotation(A.getAssignment());
+    stmt = makeConcreteNotation(stmt);
+    stmt = reorderLoopsTopologically(stmt);
+    stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+    stmt = stmt.split(i, i0, i1, 16);
+    stmt = insertTemporaries(stmt);
+    stmt = parallelizeOuterLoop(stmt);
+
+    A.compile(stmt);
+    A.assemble();
+
+
+    ref(i,l)=B(i,j)*C(j,k)*D(k,l);
+    refn(i,l)=B(i,j)*C(j,k)*D(k,l);
+    // IndexStmt refStmt = ref.getAssignment().concretize();
+
+    // ref1Stmt = ref1Stmt.split(i, i0, i1, 16);
+            // .pos(j, jpos, B(i,j));
+            // .split(k, k0, k1, 8);
+            // .reorder({i0, i1, jpos0, k, jpos1});
+            // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+            // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+    IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+    refStmt = makeConcreteNotation(refStmt);
+    refStmt = refStmt
+      .split(i, i0, i1, 16)
+      .split(k, k0, k1, 32)
+      .split(l, l0, l1, 32)
+      .reorder({i0, i1, j, k0, l0, k1, l1});
+    refStmt = insertTemporaries(refStmt);
+    refStmt = parallelizeOuterLoop(refStmt);
+    ref.compile(refStmt);
+    ref.assemble();
+
+    IndexStmt refnStmt = makeReductionNotation(refn.getAssignment());
+    refnStmt = makeConcreteNotation(refnStmt);
+    refnStmt = refnStmt
+      .split(i, i0, i1, 16);
+    refnStmt = insertTemporaries(refnStmt);
+    refnStmt = parallelizeOuterLoop(refnStmt);
+    refn.compile(refnStmt);
+    refn.assemble();
+
+    // SpMM , GEMM
+
+    Tensor<double> ref1({B.getDimension(0), kdim}, rm);
+    Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+    Tensor<double> ref2_2({B.getDimension(0), ldim}, rm);
+    
+    ref1(i,k)=B(i,j)*C(j,k);
+    ref2(i,l)=ref1(i,k)*D(k,l);
+    ref2_2(i,l)=ref1(i,k)*D(k,l);
+
+    IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+    ref1Stmt = makeConcreteNotation(ref1Stmt);
+    ref1Stmt = insertTemporaries(ref1Stmt);
+    ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+    ref1.compile(ref1Stmt);
+    ref1.assemble();
+
+    IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+    ref2Stmt = makeConcreteNotation(ref2Stmt);
+    ref2Stmt = insertTemporaries(ref2Stmt);
+    ref2Stmt = ref2Stmt.split(i, i0, i1, 16);
+    ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+    ref2.compile(ref2Stmt);
+    ref2.assemble();
+
+    IndexStmt ref2Stmt2 = makeReductionNotation(ref2_2.getAssignment());
+    ref2Stmt2 = makeConcreteNotation(ref2Stmt2);
+    ref2Stmt2 = ref2Stmt2
+      .split(i, i0, i1, 32)
+      .split(k,k0,k1, 32)
+      .split(l, l0, l1, 32)
+      .reorder({i0, k0, l0, i1, k1, l1})
+      .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+    ref2Stmt2 = insertTemporaries(ref2Stmt2);
+    // ref2Stmt2 = parallelizeOuterLoop(ref2Stmt2);
+    ref2_2.compile(ref2Stmt2);
+    ref2_2.assemble();
+
+
+    // -------------- GeMM and SpMM 
+
+    Tensor<double> ref3({C.getDimension(0), ldim}, rm);
+    Tensor<double> ref4({C.getDimension(0), ldim}, rm);
+    ref3(j,l)=C(j,k)*D(k,l); // GEMM
+    ref4(i,l) = B(i,j)*ref3(j,l); // SpMM
+
+    IndexStmt ref3Stmt = ref3.getAssignment().concretize();
+    ref3Stmt = ref3Stmt
+      .split(j, j0, j1, 32) // changed to 32
+      .split(k, k0, k1, 32)
+      .split(l, l0, l1, 32)
+      .reorder({j0, k0, l0, j1, k1, l1})
+      .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+    ref2Stmt2 = insertTemporaries(ref2Stmt2);
+    ref3.compile(ref3Stmt);
+    ref3.assemble();
+    
+    IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); // SpMM operation
+    ref4Stmt = makeConcreteNotation(ref4Stmt);
+    ref4Stmt = ref4Stmt.split(i, i0, i1, 16);
+    ref4Stmt = insertTemporaries(ref4Stmt);
+    ref4Stmt = parallelizeOuterLoop(ref4Stmt);
+    ref4.compile(ref4Stmt);
+    ref4.assemble();
+
+
+    std::cout << "compute start\n";
+    taco::util::TimeResults timevalue;
+    bool time                = true;
+
+    statfile << "\n--------- 1st pattern computation TTM, GEMM\n";
+    
+    // retval = PAPI_hl_region_begin("spmm");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nSpMM Kernel: ", timevalue);
+    // retval = PAPI_hl_region_end("spmm");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "SpMM time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    std::string sofile_spmm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+    // retval = PAPI_hl_region_begin("spmmtemplate");
+    // if ( retval != PAPI_OK ) handle_error(1);   
+    TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel: ", timevalue);
+    // retval = PAPI_hl_region_end("spmmtemplate");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "SpMM template time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+    
+    // retval = PAPI_hl_region_begin("gemm");
+    // if ( retval != PAPI_OK ) handle_error(1); 
+    TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue);
+    // retval = PAPI_hl_region_end("gemm");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "GeMM time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    // retval = PAPI_hl_region_begin("gemmtemplate");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(ref2_2.compute(statfile), "\n\nref GeMM template Kernel: ", timevalue);
+    // retval = PAPI_hl_region_end("gemmtemplate");
+    // if ( retval != PAPI_OK ) handle_error(1);    
+    if (statfile.is_open()) {
+      statfile << "ref 2 GeMM template time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    // std::string sofile_gemm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/spmm_template.so";
+    statfile << "\n--------- 2nd pattern computation GEMM, SpMM\n";
+    // retval = PAPI_hl_region_begin("gemmtemplate2");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM template ref3 Kernel: ", timevalue);
+    // retval = PAPI_hl_region_end("gemmtemplate2");
+    // if ( retval != PAPI_OK ) handle_error(1);  
+    if (statfile.is_open()) {
+      statfile << "ref3 GeMM template time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    // retval = PAPI_hl_region_begin("spmm2");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel ref4: ", timevalue);
+    // retval = PAPI_hl_region_end("spmm2");
+    // if ( retval != PAPI_OK ) handle_error(1);  
+    if (statfile.is_open()) {
+      statfile << "SpMM template time ref4: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+
+    statfile << "\n-------- reference pattern computation\n";
+
+    // retval = PAPI_hl_region_begin("ref");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue);
+    // retval = PAPI_hl_region_end("ref");
+    // if ( retval != PAPI_OK ) handle_error(1);     
+    if (statfile.is_open()) {
+      statfile << "taco reference time: ";
+      statfile << timevalue << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    // retval = PAPI_hl_region_begin("refnew");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(refn.compute(statfile), "\n\nReference new Kernel: ", timevalue);
+    // retval = PAPI_hl_region_end("refnew");
+    // if ( retval != PAPI_OK ) handle_error(1);     
+    if (statfile.is_open()) {
+      statfile << "taco reference new time: ";
+      statfile << timevalue << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+
+    // retval = PAPI_hl_region_begin("sparselnr");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue);
+    // retval = PAPI_hl_region_end("sparselnr");
+    // if ( retval != PAPI_OK ) handle_error(1);
+    if (statfile.is_open()) {
+      statfile << "fused time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+
+    double* A_vals = (double*) (A.getTacoTensorT()->vals);
+    double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+    double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+    double* ref4_vals = (double*) (ref2.getTacoTensorT()->vals);
+
+    // int* A2_pos = (double*) (ref.getTacoTensorT()->vals);
+
+    // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) {
+    //   if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+    //     std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+    //       << "refvals: " << ref_vals[q] << std::endl;
+    //     ASSERT_TRUE(false);
+    //   }
+    // }
+
+    for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) {
+      if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+        std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+          << "refvals: " << ref_vals[q] << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+    for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) {
+      if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) {
+        std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+          << "refvals: " << ref2_vals[q] << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+    for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) {
+      if ( abs(A_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) {
+        std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+          << "refvals: " << ref4_vals[q] << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+
+  } // end of file num for loop
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+  
+  // unsigned int native = 0x0;
+
+  // retval = PAPI_library_init(PAPI_VER_CURRENT);
+
+  // if (retval != PAPI_VER_CURRENT) {
+  //   printf("PAPI library init error!\n");
+  //   exit(1);
+  // } else {
+  //   printf("PAPI library init success\n");
+  // }
+
+  // if (PAPI_create_eventset(&EventSet) != PAPI_OK) {
+  //   handle_error(1);
+  // }
+
+  // /* Add the native event */
+  // native = ()
+
+    // retval = PAPI_hl_region_begin("computation1");
+    // if ( retval != PAPI_OK )
+    //     handle_error(1);
+
+    // /* Do some computation */
+
+    // retval = PAPI_hl_region_end("computation1");
+    // if ( retval != PAPI_OK )
+    //     handle_error(1);
+
+    // retval = PAPI_hl_region_begin("computation2");
+    // if ( retval != PAPI_OK )
+    //     handle_error(1);
+
+    // /* Do some computation */
+
+    // retval = PAPI_hl_region_end("computation2");
+    // if ( retval != PAPI_OK )
+    //     handle_error(1);
+}
+
+
+
+
+
+
+TEST(scheduling_eval, sddmmspmmFused) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+  // Times the fused SDDMM+SpMM+GeMM kernel (A) against the unfused chain ref1 -> ref2 -> ref3 and the unfused reference (ref), then checks the results agree.
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+  // NOTE(review): the stats file, matrices and .so kernels below are absolute paths under /home/min/a/kadhitha; they must exist for this test to run.
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm-gemm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nsddmm-spmm-gemm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format rm({dense, dense});
+
+  int kdim = 64;
+  int ldim = 64;
+  int mdim = 64;
+
+  // vector<int> filenums{2, 3,4,5,6,7,8,9,10,12,15};
+  vector<int> filenums{0};
+
+  for (auto filenum : filenums) {
+
+  // Candidate input matrices; each filenum is used as an index into matfiles.
+  std::vector<std::string> matfiles = {
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15
+    "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx"
+  };
+  std::vector<std::string> matfilesrw = { // only referenced by the commented-out write() call below
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx"
+  };
+
+  std::string matfile = matfiles[filenum];
+  std::cout << "reading B mat mtx\n";
+  Tensor<double> B = read(matfile, csr, true);
+  B.setName("B");
+  B.pack();
+  // write(matfilesrw[filenum], B);
+
+  if (statfile.is_open()) {
+    statfile << matfile << std::endl;
+  }
+
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> C({B.getDimension(0), kdim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  C.pack();
+
+  Tensor<double> D({B.getDimension(1), kdim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing D mat\n";
+  D.pack();
+
+  Tensor<double> F({B.getDimension(1), ldim}, rm);
+  for (int i = 0; i < F.getDimension(0); ++i) {
+    for (int j = 0; j < F.getDimension(1); ++j) {
+      F.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing F mat\n";
+  F.pack();
+
+  Tensor<double> G({ldim, mdim}, rm);
+  for (int i = 0; i < G.getDimension(0); ++i) {
+    for (int j = 0; j < G.getDimension(1); ++j) {
+      G.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing G mat\n";
+  G.pack();
+
+  Tensor<double> A({B.getDimension(0), mdim}, rm);
+  Tensor<double> ref({B.getDimension(0), mdim}, rm);
+  IndexVar i, j, k, l, m;
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1");
+  IndexVar l0("l0"), l1("l1"), m0("m0"), m1("m1");
+  // A is the kernel under test: the whole expression is scheduled as one statement and passed through loopFusionOverFission.
+  A(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+
+  if (statfile.is_open()) {
+    statfile 
+      << "ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);" << std::endl
+      << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+      << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+      << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+      << "F1_dimension: " << F.getDimension(0) << ", F2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl
+      << "G1_dimension: " << G.getDimension(0) << ", G2_dimension: " << G.getDimension(1) << ", vals: " << G.getStorage().getValues().getSize() << std::endl
+      << std::endl;
+  }
+
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 2);
+  stmt = stmt.split(i, i0, i1, 16);
+
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt);
+  printToFile("sddmmSpMMGeMM", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+
+  // ref: the same expression as A, scheduled without the loopFusionOverFission step.
+  ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = refStmt.split(i, i0, i1, 16);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+  // ref1/ref2/ref3: the same computation split into three kernels (SDDMM, then SpMM, then GeMM); ref3 holds the final result.
+  Tensor<double> ref1({B.getDimension(0), B.getDimension(1)}, csr);
+  Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref3({B.getDimension(0), mdim}, rm);
+  ref1(i,j)=B(i,j)*C(i,k)*D(j,k);
+  ref2(i,l)=ref1(i,j)*F(j,l);
+  ref3(i,m)=ref2(i,l)*G(l,m);
+
+  IndexStmt ref1Stmt = ref1.getAssignment().concretize();
+
+  ref1Stmt = ref1Stmt.split(i, i0, i1, 16);
+  //         // .pos(j, jpos, B(i,j));
+  //         // .split(k, k0, k1, 8);
+  //         // .reorder({i0, i1, jpos0, k, jpos1});
+  //         // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+  //         // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+  // // ref1Stmt.split(i, );
+  // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B);
+  // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  // ref1Stmt = makeConcreteNotation(ref1Stmt);
+  ref1Stmt = insertTemporaries(ref1Stmt);
+  ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  ref1.compile(ref1Stmt);
+  ref1.assemble();
+
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble();
+
+  // ref3(i,m)=ref2(i,l)*G(l,m);
+  IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+  ref3Stmt = makeConcreteNotation(ref3Stmt);
+  ref3Stmt = ref3Stmt
+    .split(i, i0, i1, 32)
+    .split(l, l0, l1, 32)
+    .split(m, m0, m1, 32)
+    .reorder({i0, l0, m0, i1, l1, m1});
+  ref3Stmt = insertTemporaries(ref3Stmt);
+  ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+  ref3.compile(ref3Stmt);
+  ref3.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true; // the TOOL_BENCHMARK_TIMER macro in test/util.h tests a variable named `time`
+
+  // std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so";
+  TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "fused time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so";
+  TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "sddmm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so";
+  TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM ryan Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "sddmm ryan time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM ryan Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "spmm ryan time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "gemm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so";
+  TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "taco reference time: ";
+    statfile << timevalue << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  double* A_vals = (double*) (A.getTacoTensorT()->vals);
+  double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+  double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals);
+
+  // Element-wise relative error against ref. When both values are 0 the quotient is NaN, which compares false and so passes.
+  // The element count is computed in size_t: rows * columns can exceed INT_MAX for the larger matrices listed above.
+  for (size_t q = 0, total = static_cast<size_t>(A.getDimension(0)) * A.getDimension(1); q < total; q++) {
+    if ( std::abs(A_vals[q] - ref_vals[q])/std::abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+  // Same check for the result of the three-kernel chain.
+  for (size_t q = 0, total = static_cast<size_t>(A.getDimension(0)) * A.getDimension(1); q < total; q++) {
+    if ( std::abs(ref3_vals[q] - ref_vals[q])/std::abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+
+
+  } // end of filenum loop
+
+  // int filenum = 3;
+
+
+  // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) {
+  //   if ( abs(A_vals[q] - ref3_vals[q])/abs(ref3_vals[q]) > ERROR_MARGIN) {
+  //     std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+  //       << "refvals: " << ref3_vals[q] << std::endl;
+  //     ASSERT_TRUE(false);
+  //   }
+  // }
+  // for (int q= 0; q< A_vals
+  // for (int q = 0; q < A.getDimension(0); ++q) {
+  //   for (int w = 0; w < A.getDimension(1); ++w) {
+  //     if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+  //       std::cout << "error: results don't match A("<< q << "," << w << "): " 
+  //         << A(q,w) << ", ref: " << ref(q,w) << std::endl;
+  //       ASSERT_TRUE(false);
+  //     }
+  //   }
+  // }
+  // ASSERT_TENSOR_EQ(A, ref);
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+}
\ No newline at end of file
diff --git a/test/tests-scheduling-ispc-eval.cpp b/test/tests-scheduling-ispc-eval.cpp
new file mode 100644
index 000000000..139597f9c
--- /dev/null
+++ b/test/tests-scheduling-ispc-eval.cpp
@@ -0,0 +1,2 @@
+
+
diff --git a/test/tests-transformation.cpp b/test/tests-transformation.cpp
index abfec3d45..9a472906f 100644
--- a/test/tests-transformation.cpp
+++ b/test/tests-transformation.cpp
@@ -255,6 +255,8 @@ INSTANTIATE_TEST_CASE_P(parallelize, apply,
 
 struct reorderLoopsTopologically : public TestWithParam<NotationTest> {};
 
+
+//
 TEST_P(reorderLoopsTopologically, test) {
   IndexStmt actual = taco::reorderLoopsTopologically(GetParam().actual);
   ASSERT_NOTATION_EQ(GetParam().expected, actual);
diff --git a/test/util.h b/test/util.h
new file mode 100644
index 000000000..f96087ba1
--- /dev/null
+++ b/test/util.h
@@ -0,0 +1,137 @@
+#ifndef TACO_TEST_SCHEDULE_UTIL_H
+#define TACO_TEST_SCHEDULE_UTIL_H
+
+// Shared helpers for the scheduling evaluation tests: an error tolerance,
+// timing macros, and functions that dump the code generated for a statement.
+
+#include <sys/stat.h>  // mkdir
+
+#include <iostream>
+#include <taco/index_notation/transformations.h>
+#include <codegen/codegen_c.h>
+#include <codegen/codegen_ispc.h>
+#include <codegen/codegen_cuda.h>
+#include <fstream>
+#include <memory>
+#include <random>
+#include <sstream>
+#include <string>
+#include "taco/cuda.h"
+#include "test.h"
+#include "test_tensors.h"
+#include "taco/tensor.h"
+#include "taco/index_notation/index_notation.h"
+#include "taco/index_notation/transformations.h"
+#include "codegen/codegen.h"
+#include "taco/lower/lower.h"
+#include "taco/util/timers.h"
+
+// NOTE(review): a using-directive in a header leaks into every includer; it is
+// kept here so that code including this header keeps compiling unchanged.
+using namespace taco;
+
+// Relative error tolerated when the tests compare kernel results.
+#define ERROR_MARGIN (1.0e-2)
+
+// Runs CODE once. If `time` (looked up at the expansion site) is true, the run
+// is timed with taco::util::Timer, the result is printed to cout after NAME,
+// and it is assigned to TIMER. Otherwise CODE runs untimed and TIMER is not
+// written. cout and endl must be usable unqualified where the macro expands.
+#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) {                  \
+    if (time) {                                                  \
+      taco::util::Timer timer;                                   \
+      timer.start();                                             \
+      CODE;                                                      \
+      timer.stop();                                              \
+      taco::util::TimeResults result = timer.getResult();        \
+      cout << NAME << " " << result << " ms" << endl;            \
+      TIMER=result;                                              \
+    }                                                            \
+    else {                                                       \
+      CODE;                                                      \
+    }                                                            \
+}
+
+// Same as TOOL_BENCHMARK_TIMER, except the timing line goes to `statfile`
+// (also looked up at the expansion site) when it is open, and to cout
+// otherwise.
+#define TOOL_BENCHMARK_TIMER2(CODE,NAME,TIMER) {                  \
+    if (time) {                                                  \
+      taco::util::Timer timer;                                   \
+      timer.start();                                             \
+      CODE;                                                      \
+      timer.stop();                                              \
+      taco::util::TimeResults result = timer.getResult();        \
+      if (statfile.is_open()) {                                  \
+        statfile << NAME << " " << result << " ms" << endl;      \
+      } else {                                                   \
+        cout << NAME << " " << result << " ms" << endl;          \
+      }                                                          \
+      TIMER=result;                                              \
+    }                                                            \
+    else {                                                       \
+      CODE;                                                      \
+    }                                                            \
+}
+
+static void printToCout(IndexStmt stmt);
+static void printToFile(std::string filename, IndexStmt stmt);
+static void printToFile(std::string filename, std::string additional_filename, IndexStmt stmt);
+
+// Writes `contents` to `path`, truncating any existing file. A failure to open
+// the file is reported on cerr instead of being silently ignored.
+static void writeGeneratedFile(const std::string& path, const std::string& contents) {
+  std::ofstream out(path);
+  if (!out.is_open()) {
+    std::cerr << "warning: could not open " << path << " for writing" << std::endl;
+    return;
+  }
+  out << contents;
+}
+
+// Lowers `stmt` under the name "compute" and prints the generated
+// implementation to cout.
+static void printToCout(IndexStmt stmt) {
+  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(std::cout, ir::CodeGen::ImplementationGen);
+  ir::Stmt compute = lower(stmt, "compute", false, true);
+  codegen->compile(compute, true);
+}
+
+// Generates the implementation for `stmt` and writes it to
+// eval_generated/<filename>.c, or .cu when should_use_CUDA_codegen() is true.
+static void printToFile(std::string filename, IndexStmt stmt) {
+  std::stringstream source;
+
+  const std::string file_path = "eval_generated/";
+  // Best effort: the result is ignored, so an existing directory is not an error.
+  mkdir(file_path.c_str(), 0777);
+
+  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen);
+  ir::Stmt compute = lower(stmt, "compute", false, true);
+  codegen->compile(compute, true);
+
+  const std::string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c";
+  writeGeneratedFile(file_path + filename + file_ending, source.str());
+}
+
+// Two-stream variant: the code generator is given two output streams; the
+// first is written to eval_generated/<filename>.c (or .cu) and the second to
+// eval_generated/<additional_filename>.ispc.
+static void printToFile(std::string filename, std::string additional_filename, IndexStmt stmt) {
+  std::stringstream source1;
+  std::stringstream source2;
+
+  const std::string file_path = "eval_generated/";
+  // Best effort: the result is ignored, so an existing directory is not an error.
+  mkdir(file_path.c_str(), 0777);
+
+  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+  ir::Stmt compute = lower(stmt, "compute", false, true);
+  codegen->compile(compute, true);
+
+  const std::string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c";
+  writeGeneratedFile(file_path + filename + file_ending, source1.str());
+  writeGeneratedFile(file_path + additional_filename + ".ispc", source2.str());
+}
+
+#endif // TACO_TEST_SCHEDULE_UTIL_H
\ No newline at end of file
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 922f7e52e..41699d3fd 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -4,6 +4,7 @@ foreach(TOOL_SOURCE ${TOOL_SOURCES})
   get_filename_component(TOOL ${TOOL_SOURCE} NAME_WE)
   add_executable("${TOOL}-tool" ${TOOL_SOURCE})
   target_link_libraries("${TOOL}-tool" taco)
+  target_link_libraries("${TOOL}-tool" papi)
   target_include_directories("${TOOL}-tool" PRIVATE "${CMAKE_BINARY_DIR}/include")
   SET_TARGET_PROPERTIES("${TOOL}-tool" PROPERTIES OUTPUT_NAME ${TOOL})
   install(TARGETS "${TOOL}-tool" DESTINATION bin)
diff --git a/tools/taco.cpp b/tools/taco.cpp
index cd351a203..7384874ec 100644
--- a/tools/taco.cpp
+++ b/tools/taco.cpp
@@ -9,6 +9,7 @@
 #include "taco.h"
 
 #include "taco/error.h"
+#include "taco/index_notation/index_notation.h"
 #include "taco/parser/lexer.h"
 #include "taco/parser/parser.h"
 #include "taco/parser/schedule_parser.h"
@@ -20,6 +21,7 @@
 #include "taco/lower/lower.h"
 #include "taco/codegen/module.h"
 #include "codegen/codegen_c.h"
+#include "codegen/codegen_ispc.h"
 #include "codegen/codegen_cuda.h"
 #include "codegen/codegen.h"
 #include "taco/util/strings.h"
@@ -188,6 +190,8 @@ static void printUsageInfo() {
   cout << endl;
   printFlag("print-nocolor", "Print without colors.");
   cout << endl;
+  printFlag("ispc", "Generate ISPC code for Intel CPUs");
+  cout << endl;
   printFlag("cuda", "Generate CUDA code for NVIDIA GPUs");
   cout << endl;
   printFlag("schedule", "Specify parallel execution schedule");
@@ -262,7 +266,7 @@ static void printSchedulingHelp() {
               "an output race strategy `strat`. Since the other transformations "
               "expect serial code, parallelize must come last in a series of "
               "transformations.  Possible parallel hardware units are: "
-              "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector. "
+              "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUSimd, CPUSpmd. "
               "Possible output race strategies are: "
               "IgnoreRaces, NoRaces, Atomics, Temporary, ParallelReduction.");
 }
@@ -279,6 +283,8 @@ static void printVersionInfo() {
     cout << "Built with Python support." << endl;
   if(TACO_FEATURE_CUDA)
     cout << "Built with CUDA support." << endl;
+  if(TACO_FEATURE_ISPC)
+    cout << "Built with ISPC support." << endl;
   cout << endl;
   cout << "Built on: " << TACO_BUILD_DATE << endl;
   cout << "CMake build type: " << TACO_BUILD_TYPE << endl;
@@ -308,7 +314,10 @@ static void printCommandLine(ostream& os, int argc, char* argv[]) {
   }
 }
 
-static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) {
+static int setSchedulingCommands(vector<vector<string>> scheduleCommands, 
+  parser::Parser& parser, IndexStmt& stmt, Assignment assignment) {
+
+  std::cout << "setting scheduling commands\n";
   auto findVar = [&stmt](string name) {
     ProvenanceGraph graph(stmt);
     for (auto v : graph.getAllIndexVars()) {
@@ -321,9 +330,15 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
     abort(); // to silence a warning: control reaches end of non-void function
   };
 
-  bool isGPU = false;
+  int isGPU = 0;
+  int isISPC = 0;
 
   for(vector<string> scheduleCommand : scheduleCommands) {
+    std::cout << "running scheduling command: ";
+    for (const auto& token : scheduleCommand) {
+      std::cout << token << " ";
+    }
+    std::cout << std::endl;
     string command = scheduleCommand[0];
     scheduleCommand.erase(scheduleCommand.begin());
 
@@ -352,6 +367,16 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
       IndexVar fused(f);
       stmt = stmt.fuse(findVar(i), findVar(j), fused);
 
+    } else if (command == "loopfuse") {
+      taco_uassert(scheduleCommand.size() == 2) 
+        << "'loopfuse' scheduling directive takes 2 parameters: loopfuse(b, 2)";
+      std::string side = scheduleCommand[0];
+      taco_uassert(side == "b" || side == "f") 
+        << "first parameter must be either 'f' or 'b'";
+
+      int iters = std::stoi(scheduleCommand[1]); // std::stoi throws if the text is not a number or does not fit in int
+
+      stmt = loopFusionOverFission(stmt, assignment, side, iters);
     } else if (command == "split") {
       taco_uassert(scheduleCommand.size() == 4)
           << "'split' scheduling directive takes 4 parameters: split(i, i1, i2, splitFactor)";
@@ -536,7 +561,15 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
         parallel_unit = ParallelUnit::CPUThread;
       } else if (unit == "CPUVector") {
         parallel_unit = ParallelUnit::CPUVector;
-      } else {
+      } else if (unit == "CPUSimd") {
+        isISPC = true;
+        parallel_unit = ParallelUnit::CPUSimd;
+      } 
+      else if (unit == "CPUSpmd") {
+        parallel_unit = ParallelUnit::CPUSpmd;
+        isISPC = true;
+      }
+      else {
         taco_uerror << "Parallel hardware not defined.";
         goto end;
       }
@@ -557,6 +590,8 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
         goto end;
       }
 
+      std::cerr << "stmt before parallelizing the statement: " << stmt << std::endl;  // stderr: keep stdout for generated code
+      std::cerr << "ParallelUnit: " << ParallelUnit_NAMES[static_cast<int>(parallel_unit)] << ", outputRaceStrategy: " << OutputRaceStrategy_NAMES[static_cast<int>(output_race_strategy)] << std::endl;
       stmt = stmt.parallelize(findVar(i), parallel_unit, output_race_strategy);
 
     } else if (command == "assemble") {
@@ -612,7 +647,13 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
     end:;
   }
 
-  return isGPU;
+  // Backend code for the caller: 1 = CUDA, 2 = ISPC, 0 = neither.
+  // The GPU flag takes precedence when both flags are set.
+  // NOTE(review): confirm the return type is `int`; if it is still `bool`,
+  // the value 2 collapses to true and the caller would read it as 1 (CUDA).
+  if (isGPU) return 1;
+  if (isISPC) return 2;
+  return 0;
 }
 
 int main(int argc, char* argv[]) {
@@ -641,6 +682,7 @@ int main(int argc, char* argv[]) {
   bool color               = true;
   bool readKernels         = false;
   bool cuda                = false;
+  bool ispc                = false;
 
   bool setSchedule         = false;
 
@@ -949,6 +991,10 @@ int main(int argc, char* argv[]) {
     else if ("-cuda" == argName) {
       cuda = true;
     }
+    else if ("-ispc" == argName) {
+      std::cerr << "ispc true\n";  // trace on stderr so stdout stays clean
+      ispc = true;
+    }
     else if ("-schedule" == argName) {
       vector<string> descriptor = util::split(argValue, ",");
       if (descriptor.size() > 2 || descriptor.empty()) {
@@ -1001,6 +1047,8 @@ int main(int argc, char* argv[]) {
     }
   }
 
+  std::cout << "cuda: " << cuda << ", ispc: " << ispc << std::endl;
+
   // Print compute is the default if nothing else was asked for
   if (!printAssemble && !printEvaluate && !printIterationGraph &&
       !writeCompute && !writeAssemble && !writeKernels && !readKernels &&
@@ -1009,9 +1057,11 @@ int main(int argc, char* argv[]) {
   }
 
   // pre-parse expression, to determine existence and order of loaded tensors
+  std::cout << "pre-parse expression, to determine existence and order of loaded tensors\n";
   map<string,TensorBase> loadedTensors;
   TensorBase temp_tensor;
   parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42);
+  std::cout << exprStr << std::endl;
   try {
     temp_parser.parse();
     temp_tensor = temp_parser.getResultTensor();
@@ -1112,33 +1162,61 @@ int main(int argc, char* argv[]) {
   taco_set_parallel_schedule(sched, chunkSize);
   taco_set_num_threads(nthreads);
 
-  IndexStmt stmt =
-      makeConcreteNotation(makeReductionNotation(tensor.getAssignment()));
+  // Transform the parsed assignment step by step (assignment -> reduction
+  // notation -> concrete notation), tracing each intermediate form on stderr.
+  Assignment assignment = tensor.getAssignment();
+  std::cerr << "tensor.getAssignment(): " << assignment << std::endl;
+
+  IndexStmt reduced = makeReductionNotation(assignment);
+  std::cerr << "reducedNotation: " << reduced << std::endl;
+  IndexStmt stmt = makeConcreteNotation(reduced);
+  std::cerr << "concrete index statement: " << stmt << std::endl;
   stmt = reorderLoopsTopologically(stmt);
 
+  std::cout << "topologically reordered loops statement: " << stmt << std::endl;
+
   if (setSchedule) {
-    cuda |= setSchedulingCommands(scheduleCommands, parser, stmt);
+    // Backend implied by the schedule: 1 turns on CUDA, 2 turns on ISPC, anything else changes neither flag.
+    const int backend = setSchedulingCommands(scheduleCommands, parser, stmt, tensor.getAssignment());
+    cuda |= (backend == 1);
+    ispc |= (backend == 2);
   }
   else {
+    // stmt = loopFusionOverFission(stmt, tensor.getAssignment());
     stmt = insertTemporaries(stmt);
     stmt = parallelizeOuterLoop(stmt);
   }
+  std::cout << "after setting the scheduling commands\n";
+  std::cout << stmt << std::endl;
 
   if (cuda) {
     if (!CUDA_BUILT && benchmark) {
       return reportError("TACO must be built for CUDA (cmake -DCUDA=ON ..) to benchmark", 2);
     }
     set_CUDA_codegen_enabled(true);
+    set_ISPC_codegen_enabled(false);  // CUDA and ISPC codegen are mutually exclusive
+  }
+  else if (ispc) {  // set by -ispc or by a schedule that returned backend code 2
+    if (!ISPC_BUILT && benchmark) {
+      return reportError("TACO must be built for ISPC (cmake -DISPC=ON ..) to benchmark", 2);
+    }
+    set_CUDA_codegen_enabled(false);
+    set_ISPC_codegen_enabled(true);
   }
   else {
     set_CUDA_codegen_enabled(false);
+    set_ISPC_codegen_enabled(false);
   }
 
+  std::cerr << "running scalar promote\n" << std::endl;  // trace on stderr; stdout is the code stream
   stmt = scalarPromote(stmt);
+  std::cerr << "\nafter scalar promote: \n" << stmt << std::endl << std::endl;
+
   if (printConcrete) {
     cout << stmt << endl;
   }
 
+  // lower index statement to ir statement
   Kernel kernel;
   if (benchmark) {
     if (time) cout << endl;
@@ -1221,9 +1299,15 @@ int main(int argc, char* argv[]) {
     }
   }
   else {
+    std::cout << "lowering stmt: " << stmt << std::endl;
     compute = lower(stmt, prefix+"compute",  computeWithAssemble, true);
     assemble = lower(stmt, prefix+"assemble", true, false);
     evaluate = lower(stmt, prefix+"evaluate", true, true);
+
+    // Dump the lowered kernels for debugging; stderr keeps them out of the generated code on stdout.
+    std::cerr << "\n\ncompute kernel\n------------\n" << compute << std::endl << std::endl;
+
+    std::cerr << "\n\nevaluate kernel\n------------\n" << evaluate << std::endl << std::endl;
   }
 
   string packComment =
@@ -1278,6 +1362,7 @@ int main(int argc, char* argv[]) {
   }
 
   bool hasPrinted = false;
+
   std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen);
   codegen->setColor(color);
   if (printAssemble) {
@@ -1298,6 +1383,7 @@ int main(int argc, char* argv[]) {
     }
 
     if (compute.defined()) {
+      std::cout << "Code generation\n";
       codegen->compile(compute, false);
     }
     else {
@@ -1355,7 +1441,7 @@ int main(int argc, char* argv[]) {
   }
 
   IterationGraph iterationGraph;
-  if (printIterationGraph) {
+  if (printIterationGraph) { // print iteration graph
     iterationGraph = IterationGraph::make(tensor.getAssignment());
   }