diff --git a/.gitignore b/.gitignore
index 16389f34e..215b56e9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,7 @@ CMakeCache.txt
 doc
 
 apps/tensor_times_vector/tensor_times_vector
+
+.cache
+.vscode
+compile_commands.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6a80d9d1..4f8b54eee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,10 +11,10 @@ project(taco
 )
 option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF)
 option(PYTHON "Build TACO for python environment" OFF)
-option(OPENMP "Build with OpenMP execution support" OFF)
+option(OPENMP "Build with OpenMP execution support" ON)
 option(COVERAGE "Build with code coverage analysis" OFF)
 set(TACO_FEATURE_CUDA 0)
-set(TACO_FEATURE_OPENMP 0)
+set(TACO_FEATURE_OPENMP 0)  # default only, so -DOPENMP=OFF is not reported as OpenMP-enabled; NOTE(review): presumably raised to 1 inside if(OPENMP), mirroring the CUDA default / if(CUDA) pattern - confirm
 set(TACO_FEATURE_PYTHON 0)
 if(CUDA)
   message("-- Searching for CUDA Installation")
diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h
index 36eb34f1a..44431ef46 100644
--- a/include/taco/codegen/module.h
+++ b/include/taco/codegen/module.h
@@ -17,7 +17,7 @@ class Module {
 public:
   /// Create a module for some target
   Module(Target target=getTargetFromEnvironment())
-    : lib_handle(nullptr), moduleFromUserSource(false), target(target) {
+    : lib_handle(nullptr), so_lib_handle(nullptr), moduleFromUserSource(false), target(target) {  // both library handles start out null
     setJITLibname();
     setJITTmpdir();
   }
@@ -44,11 +44,16 @@ class Module {
   /// before calling. If there's no function of this name then a nullptr is
   /// returned.
   void* getFuncPtr(std::string name);
+  void* getFuncPtr(std::string& sofile, std::string name);  ///< Overload that also takes `sofile`; NOTE(review): presumably looks `name` up in that shared object - confirm. Takes (sofile, name), whereas callFuncPackedRaw takes (name, sofile, ...).
 
   /// Call a raw function in this module and return the result
+  int callFuncPackedRaw(std::string name, std::string& sofile, void** args);  ///< Raw call variant that additionally takes `sofile`; NOTE(review): `sofile` is a non-const reference, so temporaries and literals cannot be passed - confirm this is intended
   int callFuncPackedRaw(std::string name, void** args);
   
   /// Call a raw function in this module and return the result
+  int callFuncPackedRaw(std::string name, std::string& sofile, std::vector<void*> args) {  // vector convenience form of the `sofile` raw call
+    return callFuncPackedRaw(name, sofile, args.data());  // forwards the vector's storage to the void** overload
+  }
   int callFuncPackedRaw(std::string name, std::vector<void*> args) {
     return callFuncPackedRaw(name, args.data());
   }
@@ -57,6 +62,10 @@ class Module {
   int callFuncPacked(std::string name, void** args) {
     return callFuncPackedRaw("_shim_"+name, args);
   }
+
+  int callFuncPacked(std::string name, std::string& sofile, void** args) {  // `sofile` variant of the packed call
+    return callFuncPackedRaw("_shim_"+name, sofile,args);  // prefixes the name with "_shim_" before delegating to the raw call
+  }
   
   /// Call a function using the taco_tensor_t interface and return the result
   int callFuncPacked(std::string name, std::vector<void*> args) {
@@ -72,6 +81,7 @@ class Module {
   std::string libname;
   std::string tmpdir;
   void* lib_handle;
+  void* so_lib_handle;  // initialised to nullptr by the constructor; NOTE(review): presumably the handle used by the `sofile` overloads - confirm
   std::vector<Stmt> funcs;
   
   // true iff the module was created from user-provided source
diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h
index 7aa2579ad..4d6ec6830 100644
--- a/include/taco/index_notation/transformations.h
+++ b/include/taco/index_notation/transformations.h
@@ -223,6 +223,9 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt);
  */
 IndexStmt reorderLoopsTopologically(IndexStmt stmt);
 
+IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment,
+  std::string side, int iters);  // NOTE(review): unlike the neighbouring declarations this one has no doc comment; the meaning and accepted values of `side` and `iters` are not stated here - document
+
 /**
  * Performs scalar promotion so that reductions are done by accumulating into 
  * scalar temporaries whenever possible.
diff --git a/include/taco/tensor.h b/include/taco/tensor.h
index b91782256..883718fb6 100644
--- a/include/taco/tensor.h
+++ b/include/taco/tensor.h
@@ -413,6 +413,8 @@ class TensorBase {
 
   /// Compile the tensor expression.
   void compile();
+  void compute(std::ofstream& statfile);  ///< NOTE(review): declared between the compile() overloads without its own doc comment; presumably computes and records statistics in `statfile` - confirm
+  void compute(std::ofstream& statfile, std::string& sofile);  ///< Same, additionally taking `sofile`; NOTE(review): needs std::ofstream to be declared (e.g. via <fstream>) - confirm the header includes it
 
   void compile(IndexStmt stmt, bool assembleWhileCompute=false);
 
diff --git a/out/taco-uml/._taco.svg b/out/taco-uml/._taco.svg
new file mode 100755
index 000000000..e88dbd51b
Binary files /dev/null and b/out/taco-uml/._taco.svg differ
diff --git a/out/taco-uml/taco.svg b/out/taco-uml/taco.svg
new file mode 100644
index 000000000..57f7a18d1
--- /dev/null
+++ b/out/taco-uml/taco.svg
@@ -0,0 +1,878 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" contentScriptType="application/ecmascript" contentStyleType="text/css" height="1823px" preserveAspectRatio="none" style="width:3568px;height:1823px;background:#FFFFFF;" version="1.1" viewBox="0 0 3568 1823" width="3568px" zoomAndPan="magnify"><defs><filter height="300%" id="fujoep6dbpit" width="300%" x="-1" y="-1"><feGaussianBlur result="blurOut" stdDeviation="2.0"/><feColorMatrix in="blurOut" result="blurOut2" type="matrix" values="0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 .4 0"/><feOffset dx="4.0" dy="4.0" in="blurOut2" result="blurOut3"/><feBlend in="SourceGraphic" in2="blurOut3" mode="normal"/></filter></defs><g><!--MD5=[d414847e5e8717ca0c3531bdd138c8ba]
+class IntrusivePtr--><rect codeLine="4" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IntrusivePtr" style="stroke:#A80036;stroke-width:1.5;" width="103" x="632" y="7"/><ellipse cx="647" cy="23" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M649.9688,28.6406 Q649.3906,28.9375 648.75,29.0781 Q648.1094,29.2344 647.4063,29.2344 Q644.9063,29.2344 643.5781,27.5938 Q642.2656,25.9375 642.2656,22.8125 Q642.2656,19.6875 643.5781,18.0313 Q644.9063,16.375 647.4063,16.375 Q648.1094,16.375 648.75,16.5313 Q649.4063,16.6875 649.9688,16.9844 L649.9688,19.7031 Q649.3438,19.125 648.75,18.8594 Q648.1563,18.5781 647.5313,18.5781 Q646.1875,18.5781 645.5,19.6563 Q644.8125,20.7188 644.8125,22.8125 Q644.8125,24.9063 645.5,25.9844 Q646.1875,27.0469 647.5313,27.0469 Q648.1563,27.0469 648.75,26.7813 Q649.3438,26.5 649.9688,25.9219 L649.9688,28.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="71" x="661" y="27.1543">IntrusivePtr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="633" x2="734" y1="39" y2="39"/><ellipse cx="643" cy="50" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;fill:none;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="33" x="652" y="53.2104">T *ptr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="633" x2="734" y1="59.8047" y2="59.8047"/><!--MD5=[9fb058d7a838b7ba6ed26398a5e03f68]
+class Uncopyable--><rect codeLine="7" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Uncopyable" style="stroke:#A80036;stroke-width:1.5;" width="105" x="786" y="244"/><ellipse cx="801" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M803.9688,265.6406 Q803.3906,265.9375 802.75,266.0781 Q802.1094,266.2344 801.4063,266.2344 Q798.9063,266.2344 797.5781,264.5938 Q796.2656,262.9375 796.2656,259.8125 Q796.2656,256.6875 797.5781,255.0313 Q798.9063,253.375 801.4063,253.375 Q802.1094,253.375 802.75,253.5313 Q803.4063,253.6875 803.9688,253.9844 L803.9688,256.7031 Q803.3438,256.125 802.75,255.8594 Q802.1563,255.5781 801.5313,255.5781 Q800.1875,255.5781 799.5,256.6563 Q798.8125,257.7188 798.8125,259.8125 Q798.8125,261.9063 799.5,262.9844 Q800.1875,264.0469 801.5313,264.0469 Q802.1563,264.0469 802.75,263.7813 Q803.3438,263.5 803.9688,262.9219 L803.9688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="73" x="815" y="264.1543">Uncopyable</text><line style="stroke:#A80036;stroke-width:1.5;" x1="787" x2="890" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="787" x2="890" y1="284" y2="284"/><!--MD5=[f38687c19e1720eba4a1ab1343a37015]
+class IRNode--><rect codeLine="9" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="73.6094" id="IRNode" style="stroke:#A80036;stroke-width:1.5;" width="288" x="197.5" y="548"/><ellipse cx="315.25" cy="564" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M318.2188,569.6406 Q317.6406,569.9375 317,570.0781 Q316.3594,570.2344 315.6563,570.2344 Q313.1563,570.2344 311.8281,568.5938 Q310.5156,566.9375 310.5156,563.8125 Q310.5156,560.6875 311.8281,559.0313 Q313.1563,557.375 315.6563,557.375 Q316.3594,557.375 317,557.5313 Q317.6563,557.6875 318.2188,557.9844 L318.2188,560.7031 Q317.5938,560.125 317,559.8594 Q316.4063,559.5781 315.7813,559.5781 Q314.4375,559.5781 313.75,560.6563 Q313.0625,561.7188 313.0625,563.8125 Q313.0625,565.9063 313.75,566.9844 Q314.4375,568.0469 315.7813,568.0469 Q316.4063,568.0469 317,567.7813 Q317.5938,567.5 318.2188,566.9219 L318.2188,569.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="44" x="335.75" y="568.1543">IRNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="198.5" x2="484.5" y1="580" y2="580"/><line style="stroke:#A80036;stroke-width:1.5;" x1="198.5" x2="484.5" y1="588" y2="588"/><ellipse cx="208.5" cy="599" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="262" x="217.5" y="602.2104">virtual void accept(IRVisitorStrict *v) const = 0</text><ellipse cx="208.5" cy="611.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="217.5" y="615.0151">virtual IRNodeType type_info() const = 0;</text><!--MD5=[bc9d8c255d7fbd519a9f6a6cf76a7a1b]
+class BaseStmtNode--><rect codeLine="14" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="BaseStmtNode" style="stroke:#A80036;stroke-width:1.5;" width="125" x="110" y="830"/><ellipse cx="125" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M127.9688,851.6406 Q127.3906,851.9375 126.75,852.0781 Q126.1094,852.2344 125.4063,852.2344 Q122.9063,852.2344 121.5781,850.5938 Q120.2656,848.9375 120.2656,845.8125 Q120.2656,842.6875 121.5781,841.0313 Q122.9063,839.375 125.4063,839.375 Q126.1094,839.375 126.75,839.5313 Q127.4063,839.6875 127.9688,839.9844 L127.9688,842.7031 Q127.3438,842.125 126.75,841.8594 Q126.1563,841.5781 125.5313,841.5781 Q124.1875,841.5781 123.5,842.6563 Q122.8125,843.7188 122.8125,845.8125 Q122.8125,847.9063 123.5,848.9844 Q124.1875,850.0469 125.5313,850.0469 Q126.1563,850.0469 126.75,849.7813 Q127.3438,849.5 127.9688,848.9219 L127.9688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="93" x="139" y="850.1543">BaseStmtNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="111" x2="234" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="111" x2="234" y1="870" y2="870"/><!--MD5=[27b83928eb4ae87e2fc2e82e735e02cd]
+class BaseExprNode--><rect codeLine="15" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="BaseExprNode" style="stroke:#A80036;stroke-width:1.5;" width="123" x="315" y="823.5"/><ellipse cx="330" cy="839.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M332.9688,845.1406 Q332.3906,845.4375 331.75,845.5781 Q331.1094,845.7344 330.4063,845.7344 Q327.9063,845.7344 326.5781,844.0938 Q325.2656,842.4375 325.2656,839.3125 Q325.2656,836.1875 326.5781,834.5313 Q327.9063,832.875 330.4063,832.875 Q331.1094,832.875 331.75,833.0313 Q332.4063,833.1875 332.9688,833.4844 L332.9688,836.2031 Q332.3438,835.625 331.75,835.3594 Q331.1563,835.0781 330.5313,835.0781 Q329.1875,835.0781 328.5,836.1563 Q327.8125,837.2188 327.8125,839.3125 Q327.8125,841.4063 328.5,842.4844 Q329.1875,843.5469 330.5313,843.5469 Q331.1563,843.5469 331.75,843.2813 Q332.3438,843 332.9688,842.4219 L332.9688,845.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="91" x="344" y="843.6543">BaseExprNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="316" x2="437" y1="855.5" y2="855.5"/><ellipse cx="326" cy="866.5" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;fill:none;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="78" x="335" y="869.7104">Datatype type</text><line style="stroke:#A80036;stroke-width:1.5;" x1="316" x2="437" y1="876.3047" y2="876.3047"/><!--MD5=[d94a4fdce57fa90edc62507e0f6859c0]
+class StmtNode--><rect codeLine="19" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="StmtNode" style="stroke:#A80036;stroke-width:1.5;" width="225" x="15" y="1198"/><ellipse cx="92.25" cy="1214" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M95.2188,1219.6406 Q94.6406,1219.9375 94,1220.0781 Q93.3594,1220.2344 92.6563,1220.2344 Q90.1563,1220.2344 88.8281,1218.5938 Q87.5156,1216.9375 87.5156,1213.8125 Q87.5156,1210.6875 88.8281,1209.0313 Q90.1563,1207.375 92.6563,1207.375 Q93.3594,1207.375 94,1207.5313 Q94.6563,1207.6875 95.2188,1207.9844 L95.2188,1210.7031 Q94.5938,1210.125 94,1209.8594 Q93.4063,1209.5781 92.7813,1209.5781 Q91.4375,1209.5781 90.75,1210.6563 Q90.0625,1211.7188 90.0625,1213.8125 Q90.0625,1215.9063 90.75,1216.9844 Q91.4375,1218.0469 92.7813,1218.0469 Q93.4063,1218.0469 94,1217.7813 Q94.5938,1217.5 95.2188,1216.9219 L95.2188,1219.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="62" x="112.75" y="1218.1543">StmtNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="16" x2="239" y1="1230" y2="1230"/><line style="stroke:#A80036;stroke-width:1.5;" x1="16" x2="239" y1="1238" y2="1238"/><ellipse cx="26" cy="1249" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="199" x="35" y="1252.2104">void accept(IRVisitorStrict *v) const</text><!--MD5=[475d6310b0690b98eac8d3436b0f8c3b]
+class ExprNode--><rect codeLine="22" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="ExprNode" style="stroke:#A80036;stroke-width:1.5;" width="225" x="275" y="1198"/><ellipse cx="353.25" cy="1214" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M356.2188,1219.6406 Q355.6406,1219.9375 355,1220.0781 Q354.3594,1220.2344 353.6563,1220.2344 Q351.1563,1220.2344 349.8281,1218.5938 Q348.5156,1216.9375 348.5156,1213.8125 Q348.5156,1210.6875 349.8281,1209.0313 Q351.1563,1207.375 353.6563,1207.375 Q354.3594,1207.375 355,1207.5313 Q355.6563,1207.6875 356.2188,1207.9844 L356.2188,1210.7031 Q355.5938,1210.125 355,1209.8594 Q354.4063,1209.5781 353.7813,1209.5781 Q352.4375,1209.5781 351.75,1210.6563 Q351.0625,1211.7188 351.0625,1213.8125 Q351.0625,1215.9063 351.75,1216.9844 Q352.4375,1218.0469 353.7813,1218.0469 Q354.4063,1218.0469 355,1217.7813 Q355.5938,1217.5 356.2188,1216.9219 L356.2188,1219.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="60" x="373.75" y="1218.1543">ExprNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="276" x2="499" y1="1230" y2="1230"/><line style="stroke:#A80036;stroke-width:1.5;" x1="276" x2="499" y1="1238" y2="1238"/><ellipse cx="286" cy="1249" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="199" x="295" y="1252.2104">void accept(IRVisitorStrict *v) const</text><!--MD5=[a8ff5e7d622655153c4b3f7a4e4aeffe]
+class IRHandle--><rect codeLine="32" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IRHandle" style="stroke:#A80036;stroke-width:1.5;" width="225" x="72" y="237.5"/><ellipse cx="152.75" cy="253.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M155.7188,259.1406 Q155.1406,259.4375 154.5,259.5781 Q153.8594,259.7344 153.1563,259.7344 Q150.6563,259.7344 149.3281,258.0938 Q148.0156,256.4375 148.0156,253.3125 Q148.0156,250.1875 149.3281,248.5313 Q150.6563,246.875 153.1563,246.875 Q153.8594,246.875 154.5,247.0313 Q155.1563,247.1875 155.7188,247.4844 L155.7188,250.2031 Q155.0938,249.625 154.5,249.3594 Q153.9063,249.0781 153.2813,249.0781 Q151.9375,249.0781 151.25,250.1563 Q150.5625,251.2188 150.5625,253.3125 Q150.5625,255.4063 151.25,256.4844 Q151.9375,257.5469 153.2813,257.5469 Q153.9063,257.5469 154.5,257.2813 Q155.0938,257 155.7188,256.4219 L155.7188,259.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="55" x="173.25" y="257.6543">IRHandle</text><line style="stroke:#A80036;stroke-width:1.5;" x1="73" x2="296" y1="269.5" y2="269.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="73" x2="296" y1="277.5" y2="277.5"/><ellipse cx="83" cy="288.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="199" x="92" y="291.7104">void accept(IRVisitorStrict *v) const</text><!--MD5=[45d7a04dc863bc0ed8f0c57430a02d4a]
+class Expr--><rect codeLine="35" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Expr" style="stroke:#A80036;stroke-width:1.5;" width="59" x="7" y="561"/><ellipse cx="22" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M24.9688,582.6406 Q24.3906,582.9375 23.75,583.0781 Q23.1094,583.2344 22.4063,583.2344 Q19.9063,583.2344 18.5781,581.5938 Q17.2656,579.9375 17.2656,576.8125 Q17.2656,573.6875 18.5781,572.0313 Q19.9063,570.375 22.4063,570.375 Q23.1094,570.375 23.75,570.5313 Q24.4063,570.6875 24.9688,570.9844 L24.9688,573.7031 Q24.3438,573.125 23.75,572.8594 Q23.1563,572.5781 22.5313,572.5781 Q21.1875,572.5781 20.5,573.6563 Q19.8125,574.7188 19.8125,576.8125 Q19.8125,578.9063 20.5,579.9844 Q21.1875,581.0469 22.5313,581.0469 Q23.1563,581.0469 23.75,580.7813 Q24.3438,580.5 24.9688,579.9219 L24.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="27" x="36" y="581.1543">Expr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="8" x2="65" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="8" x2="65" y1="601" y2="601"/><!--MD5=[add513dd89cf3f02144ebc6704fab9f7]
+class Stmt--><rect codeLine="36" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Stmt" style="stroke:#A80036;stroke-width:1.5;" width="61" x="101" y="561"/><ellipse cx="116" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M118.9688,582.6406 Q118.3906,582.9375 117.75,583.0781 Q117.1094,583.2344 116.4063,583.2344 Q113.9063,583.2344 112.5781,581.5938 Q111.2656,579.9375 111.2656,576.8125 Q111.2656,573.6875 112.5781,572.0313 Q113.9063,570.375 116.4063,570.375 Q117.1094,570.375 117.75,570.5313 Q118.4063,570.6875 118.9688,570.9844 L118.9688,573.7031 Q118.3438,573.125 117.75,572.8594 Q117.1563,572.5781 116.5313,572.5781 Q115.1875,572.5781 114.5,573.6563 Q113.8125,574.7188 113.8125,576.8125 Q113.8125,578.9063 114.5,579.9844 Q115.1875,581.0469 116.5313,581.0469 Q117.1563,581.0469 117.75,580.7813 Q118.3438,580.5 118.9688,579.9219 L118.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="29" x="130" y="581.1543">Stmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="102" x2="161" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="102" x2="161" y1="601" y2="601"/><!--MD5=[927685d34b77cdaffb6bcd7c2ecdcc1a]
+class IRVisitorStrict--><rect codeLine="47" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IRVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="262" x="2676.5" y="7"/><ellipse cx="2761.75" cy="23" fill="#B4A7E5" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2757.6719,18.7656 L2757.6719,16.6094 L2765.0625,16.6094 L2765.0625,18.7656 L2762.5938,18.7656 L2762.5938,26.8438 L2765.0625,26.8438 L2765.0625,29 L2757.6719,29 L2757.6719,26.8438 L2760.1406,26.8438 L2760.1406,18.7656 L2757.6719,18.7656 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="83" x="2782.25" y="27.1543">IRVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2677.5" x2="2937.5" y1="39" y2="39"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2677.5" x2="2937.5" y1="47" y2="47"/><ellipse cx="2687.5" cy="58" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="236" x="2696.5" y="61.2104">virtual void visit(const IRNode*) const = 0</text><!--MD5=[b78282c203133343885c01c420157c8a]
+class IRVisitor--><rect codeLine="55" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IRVisitor" style="stroke:#A80036;stroke-width:1.5;" width="203" x="2387" y="237.5"/><ellipse cx="2459.25" cy="253.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2462.2188,259.1406 Q2461.6406,259.4375 2461,259.5781 Q2460.3594,259.7344 2459.6563,259.7344 Q2457.1563,259.7344 2455.8281,258.0938 Q2454.5156,256.4375 2454.5156,253.3125 Q2454.5156,250.1875 2455.8281,248.5313 Q2457.1563,246.875 2459.6563,246.875 Q2460.3594,246.875 2461,247.0313 Q2461.6563,247.1875 2462.2188,247.4844 L2462.2188,250.2031 Q2461.5938,249.625 2461,249.3594 Q2460.4063,249.0781 2459.7813,249.0781 Q2458.4375,249.0781 2457.75,250.1563 Q2457.0625,251.2188 2457.0625,253.3125 Q2457.0625,255.4063 2457.75,256.4844 Q2458.4375,257.5469 2459.7813,257.5469 Q2460.4063,257.5469 2461,257.2813 Q2461.5938,257 2462.2188,256.4219 L2462.2188,259.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="50" x="2479.75" y="257.6543">IRVisitor</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2388" x2="2589" y1="269.5" y2="269.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2388" x2="2589" y1="277.5" y2="277.5"/><ellipse cx="2398" cy="288.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="177" x="2407" y="291.7104">virtual void visit(const IRNode*)</text><!--MD5=[e7ea7c5d2ec9672a3f65e9628a854185]
+class IRRewriter--><rect codeLine="59" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="137.6328" id="IRRewriter" style="stroke:#A80036;stroke-width:1.5;" width="238" x="2688.5" y="199"/><ellipse cx="2772.25" cy="215" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2775.2188,220.6406 Q2774.6406,220.9375 2774,221.0781 Q2773.3594,221.2344 2772.6563,221.2344 Q2770.1563,221.2344 2768.8281,219.5938 Q2767.5156,217.9375 2767.5156,214.8125 Q2767.5156,211.6875 2768.8281,210.0313 Q2770.1563,208.375 2772.6563,208.375 Q2773.3594,208.375 2774,208.5313 Q2774.6563,208.6875 2775.2188,208.9844 L2775.2188,211.7031 Q2774.5938,211.125 2774,210.8594 Q2773.4063,210.5781 2772.7813,210.5781 Q2771.4375,210.5781 2770.75,211.6563 Q2770.0625,212.7188 2770.0625,214.8125 Q2770.0625,216.9063 2770.75,217.9844 Q2771.4375,219.0469 2772.7813,219.0469 Q2773.4063,219.0469 2774,218.7813 Q2774.5938,218.5 2775.2188,217.9219 L2775.2188,220.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="62" x="2792.75" y="219.1543">IRRewriter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2689.5" x2="2925.5" y1="231" y2="231"/><polygon fill="none" points="2699.5,237,2703.5,241,2699.5,245,2695.5,241" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="54" x="2708.5" y="245.2104">Expr expr</text><polygon fill="none" points="2699.5,249.8047,2703.5,253.8047,2699.5,257.8047,2695.5,253.8047" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="57" x="2708.5" y="258.0151">Stmt stmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2689.5" x2="2925.5" y1="264.6094" y2="264.6094"/><polygon fill="#FFFF44" points="2699.5,270.6094,2703.5,274.6094,2699.5,278.6094,2695.5,274.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" 
font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="210" x="2708.5" y="278.8198">virtual void visit(const ExprNode* op)</text><polygon fill="#FFFF44" points="2699.5,283.4141,2703.5,287.4141,2699.5,291.4141,2695.5,287.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="212" x="2708.5" y="291.6245">virtual void visit(const StmtNode* op)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="2712.5" y="304.4292"/><ellipse cx="2699.5" cy="314.0234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="102" x="2708.5" y="317.2339">Expr rewrite(Expr)</text><ellipse cx="2699.5" cy="326.8281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="106" x="2708.5" y="330.0386">Stmt rewrite(Stmt)</text><!--MD5=[fc5b2d51c8ad612433d8a39d4bdd37c4]
+class IRPrinter--><rect codeLine="71" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="278.4844" id="IRPrinter" style="stroke:#A80036;stroke-width:1.5;" width="430" x="3008.5" y="129"/><ellipse cx="3192.75" cy="145" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3195.7188,150.6406 Q3195.1406,150.9375 3194.5,151.0781 Q3193.8594,151.2344 3193.1563,151.2344 Q3190.6563,151.2344 3189.3281,149.5938 Q3188.0156,147.9375 3188.0156,144.8125 Q3188.0156,141.6875 3189.3281,140.0313 Q3190.6563,138.375 3193.1563,138.375 Q3193.8594,138.375 3194.5,138.5313 Q3195.1563,138.6875 3195.7188,138.9844 L3195.7188,141.7031 Q3195.0938,141.125 3194.5,140.8594 Q3193.9063,140.5781 3193.2813,140.5781 Q3191.9375,140.5781 3191.25,141.6563 Q3190.5625,142.7188 3190.5625,144.8125 Q3190.5625,146.9063 3191.25,147.9844 Q3191.9375,149.0469 3193.2813,149.0469 Q3193.9063,149.0469 3194.5,148.7813 Q3195.0938,148.5 3195.7188,147.9219 L3195.7188,150.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="53" x="3213.25" y="149.1543">IRPrinter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3009.5" x2="3437.5" y1="161" y2="161"/><polygon fill="none" points="3019.5,167,3023.5,171,3019.5,175,3015.5,171" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="129" x="3028.5" y="175.2104">std::ostream &amp;stream</text><polygon fill="none" points="3019.5,179.8047,3023.5,183.8047,3019.5,187.8047,3015.5,183.8047" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="136" x="3028.5" y="188.0151">std::ostream &amp;stream2</text><polygon fill="none" points="3019.5,192.6094,3023.5,196.6094,3019.5,200.6094,3015.5,196.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" 
textLength="53" x="3028.5" y="200.8198">int indent</text><polygon fill="none" points="3019.5,205.4141,3023.5,209.4141,3019.5,213.4141,3015.5,209.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="56" x="3028.5" y="213.6245">bool color</text><polygon fill="none" points="3019.5,218.2188,3023.5,222.2188,3019.5,226.2188,3015.5,222.2188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="72" x="3028.5" y="226.4292">bool simplify</text><polygon fill="none" points="3019.5,231.0234,3023.5,235.0234,3019.5,239.0234,3015.5,235.0234" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="102" x="3028.5" y="239.2339">enum Precedence</text><polygon fill="none" points="3019.5,243.8281,3023.5,247.8281,3019.5,251.8281,3015.5,247.8281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="239" x="3028.5" y="252.0386">Precedence parentPrecedence = BOTTOM</text><polygon fill="none" points="3019.5,256.6328,3023.5,260.6328,3019.5,264.6328,3015.5,260.6328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="204" x="3028.5" y="264.8433">NameGenerator varNameGenerator</text><polygon fill="none" points="3019.5,269.4375,3023.5,273.4375,3019.5,277.4375,3015.5,273.4375" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="237" x="3028.5" y="277.6479">scopedMap&lt;Expr, std::String&gt; varNames</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3009.5" x2="3437.5" y1="284.2422" y2="284.2422"/><polygon fill="#FFFF44" 
points="3019.5,290.2422,3023.5,294.2422,3019.5,298.2422,3015.5,294.2422" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="84" x="3028.5" y="298.4526">void doIndent()</text><polygon fill="#FFFF44" points="3019.5,303.0469,3023.5,307.0469,3019.5,311.0469,3015.5,307.0469" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="404" x="3028.5" y="311.2573">void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)</text><polygon fill="#FFFF44" points="3019.5,315.8516,3023.5,319.8516,3019.5,323.8516,3015.5,319.8516" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="131" x="3028.5" y="324.062">void fewMoreMethods()</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="3032.5" y="336.8667"/><polygon fill="#FFFF44" points="3019.5,341.4609,3023.5,345.4609,3019.5,349.4609,3015.5,345.4609" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="192" x="3028.5" y="349.6714">virtual void visit(const ExprNode*)</text><polygon fill="#FFFF44" points="3019.5,354.2656,3023.5,358.2656,3019.5,362.2656,3015.5,358.2656" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="194" x="3028.5" y="362.4761">virtual void visit(const StmtNode*)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="3032.5" y="375.2808"/><ellipse cx="3019.5" cy="384.875" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="112" x="3028.5" y="388.0854">setColor(bool 
color)</text><ellipse cx="3019.5" cy="397.6797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="61" x="3028.5" y="400.8901">print(Stmt)</text><!--MD5=[5f8d54360f7c21960948de60fa30257d]
+class IRVerifier--><rect codeLine="92" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IRVerifier" style="stroke:#A80036;stroke-width:1.5;" width="87" x="2288" y="561"/><ellipse cx="2303" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2305.9688,582.6406 Q2305.3906,582.9375 2304.75,583.0781 Q2304.1094,583.2344 2303.4063,583.2344 Q2300.9063,583.2344 2299.5781,581.5938 Q2298.2656,579.9375 2298.2656,576.8125 Q2298.2656,573.6875 2299.5781,572.0313 Q2300.9063,570.375 2303.4063,570.375 Q2304.1094,570.375 2304.75,570.5313 Q2305.4063,570.6875 2305.9688,570.9844 L2305.9688,573.7031 Q2305.3438,573.125 2304.75,572.8594 Q2304.1563,572.5781 2303.5313,572.5781 Q2302.1875,572.5781 2301.5,573.6563 Q2300.8125,574.7188 2300.8125,576.8125 Q2300.8125,578.9063 2301.5,579.9844 Q2302.1875,581.0469 2303.5313,581.0469 Q2304.1563,581.0469 2304.75,580.7813 Q2305.3438,580.5 2305.9688,579.9219 L2305.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="55" x="2317" y="581.1543">IRVerifier</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2289" x2="2374" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2289" x2="2374" y1="601" y2="601"/><!--MD5=[1e59d9c8d5cb32d21caddc96a281f60c]
+class ExpressionSimplifier--><rect codeLine="101" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="ExpressionSimplifier" style="stroke:#A80036;stroke-width:1.5;" width="156" x="2410.5" y="561"/><ellipse cx="2425.5" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2428.4688,582.6406 Q2427.8906,582.9375 2427.25,583.0781 Q2426.6094,583.2344 2425.9063,583.2344 Q2423.4063,583.2344 2422.0781,581.5938 Q2420.7656,579.9375 2420.7656,576.8125 Q2420.7656,573.6875 2422.0781,572.0313 Q2423.4063,570.375 2425.9063,570.375 Q2426.6094,570.375 2427.25,570.5313 Q2427.9063,570.6875 2428.4688,570.9844 L2428.4688,573.7031 Q2427.8438,573.125 2427.25,572.8594 Q2426.6563,572.5781 2426.0313,572.5781 Q2424.6875,572.5781 2424,573.6563 Q2423.3125,574.7188 2423.3125,576.8125 Q2423.3125,578.9063 2424,579.9844 Q2424.6875,581.0469 2426.0313,581.0469 Q2426.6563,581.0469 2427.25,580.7813 Q2427.8438,580.5 2428.4688,579.9219 L2428.4688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="124" x="2439.5" y="581.1543">ExpressionSimplifier</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2411.5" x2="2565.5" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2411.5" x2="2565.5" y1="601" y2="601"/><!--MD5=[09d0ace23740abc72ce7e8b4f8ae65c7]
+class RemoveRedundantStatements--><rect codeLine="105" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="RemoveRedundantStatements" style="stroke:#A80036;stroke-width:1.5;" width="223" x="2602" y="561"/><ellipse cx="2617" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2619.9688,582.6406 Q2619.3906,582.9375 2618.75,583.0781 Q2618.1094,583.2344 2617.4063,583.2344 Q2614.9063,583.2344 2613.5781,581.5938 Q2612.2656,579.9375 2612.2656,576.8125 Q2612.2656,573.6875 2613.5781,572.0313 Q2614.9063,570.375 2617.4063,570.375 Q2618.1094,570.375 2618.75,570.5313 Q2619.4063,570.6875 2619.9688,570.9844 L2619.9688,573.7031 Q2619.3438,573.125 2618.75,572.8594 Q2618.1563,572.5781 2617.5313,572.5781 Q2616.1875,572.5781 2615.5,573.6563 Q2614.8125,574.7188 2614.8125,576.8125 Q2614.8125,578.9063 2615.5,579.9844 Q2616.1875,581.0469 2617.5313,581.0469 Q2618.1563,581.0469 2618.75,580.7813 Q2619.3438,580.5 2619.9688,579.9219 L2619.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="191" x="2631" y="581.1543">RemoveRedundantStatements</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2603" x2="2824" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2603" x2="2824" y1="601" y2="601"/><!--MD5=[8dd11208bc782b9bc4fe9a727775ac71]
+class RemoveRedundantLoops--><rect codeLine="106" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="RemoveRedundantLoops" style="stroke:#A80036;stroke-width:1.5;" width="187" x="2860" y="561"/><ellipse cx="2875" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2877.9688,582.6406 Q2877.3906,582.9375 2876.75,583.0781 Q2876.1094,583.2344 2875.4063,583.2344 Q2872.9063,583.2344 2871.5781,581.5938 Q2870.2656,579.9375 2870.2656,576.8125 Q2870.2656,573.6875 2871.5781,572.0313 Q2872.9063,570.375 2875.4063,570.375 Q2876.1094,570.375 2876.75,570.5313 Q2877.4063,570.6875 2877.9688,570.9844 L2877.9688,573.7031 Q2877.3438,573.125 2876.75,572.8594 Q2876.1563,572.5781 2875.5313,572.5781 Q2874.1875,572.5781 2873.5,573.6563 Q2872.8125,574.7188 2872.8125,576.8125 Q2872.8125,578.9063 2873.5,579.9844 Q2874.1875,581.0469 2875.5313,581.0469 Q2876.1563,581.0469 2876.75,580.7813 Q2877.3438,580.5 2877.9688,579.9219 L2877.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="155" x="2889" y="581.1543">RemoveRedundantLoops</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2861" x2="3046" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2861" x2="3046" y1="601" y2="601"/><!--MD5=[85eaa2c6ee966b219cfed7e8ed27a206]
+class RemoveDuplicateBody--><rect codeLine="107" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="RemoveDuplicateBody" style="stroke:#A80036;stroke-width:1.5;" width="170" x="3082.5" y="561"/><ellipse cx="3097.5" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3100.4688,582.6406 Q3099.8906,582.9375 3099.25,583.0781 Q3098.6094,583.2344 3097.9063,583.2344 Q3095.4063,583.2344 3094.0781,581.5938 Q3092.7656,579.9375 3092.7656,576.8125 Q3092.7656,573.6875 3094.0781,572.0313 Q3095.4063,570.375 3097.9063,570.375 Q3098.6094,570.375 3099.25,570.5313 Q3099.9063,570.6875 3100.4688,570.9844 L3100.4688,573.7031 Q3099.8438,573.125 3099.25,572.8594 Q3098.6563,572.5781 3098.0313,572.5781 Q3096.6875,572.5781 3096,573.6563 Q3095.3125,574.7188 3095.3125,576.8125 Q3095.3125,578.9063 3096,579.9844 Q3096.6875,581.0469 3098.0313,581.0469 Q3098.6563,581.0469 3099.25,580.7813 Q3099.8438,580.5 3100.4688,579.9219 L3100.4688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="138" x="3111.5" y="581.1543">RemoveDuplicateBody</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3083.5" x2="3251.5" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3083.5" x2="3251.5" y1="601" y2="601"/><!--MD5=[781eb37a56bb69dce1ac0e85789010ac]
+class CodeGen--><rect codeLine="115" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen" style="stroke:#A80036;stroke-width:1.5;" width="89" x="3288" y="561"/><ellipse cx="3303" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3305.9688,582.6406 Q3305.3906,582.9375 3304.75,583.0781 Q3304.1094,583.2344 3303.4063,583.2344 Q3300.9063,583.2344 3299.5781,581.5938 Q3298.2656,579.9375 3298.2656,576.8125 Q3298.2656,573.6875 3299.5781,572.0313 Q3300.9063,570.375 3303.4063,570.375 Q3304.1094,570.375 3304.75,570.5313 Q3305.4063,570.6875 3305.9688,570.9844 L3305.9688,573.7031 Q3305.3438,573.125 3304.75,572.8594 Q3304.1563,572.5781 3303.5313,572.5781 Q3302.1875,572.5781 3301.5,573.6563 Q3300.8125,574.7188 3300.8125,576.8125 Q3300.8125,578.9063 3301.5,579.9844 Q3302.1875,581.0469 3303.5313,581.0469 Q3304.1563,581.0469 3304.75,580.7813 Q3305.3438,580.5 3305.9688,579.9219 L3305.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="57" x="3317" y="581.1543">CodeGen</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3289" x2="3376" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3289" x2="3376" y1="601" y2="601"/><!--MD5=[1c66665a05557eaba0ef54dbe8329f75]
+class CodeGen_C--><rect codeLine="116" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen_C" style="stroke:#A80036;stroke-width:1.5;" width="103" x="3130" y="830"/><ellipse cx="3145" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3147.9688,851.6406 Q3147.3906,851.9375 3146.75,852.0781 Q3146.1094,852.2344 3145.4063,852.2344 Q3142.9063,852.2344 3141.5781,850.5938 Q3140.2656,848.9375 3140.2656,845.8125 Q3140.2656,842.6875 3141.5781,841.0313 Q3142.9063,839.375 3145.4063,839.375 Q3146.1094,839.375 3146.75,839.5313 Q3147.4063,839.6875 3147.9688,839.9844 L3147.9688,842.7031 Q3147.3438,842.125 3146.75,841.8594 Q3146.1563,841.5781 3145.5313,841.5781 Q3144.1875,841.5781 3143.5,842.6563 Q3142.8125,843.7188 3142.8125,845.8125 Q3142.8125,847.9063 3143.5,848.9844 Q3144.1875,850.0469 3145.5313,850.0469 Q3146.1563,850.0469 3146.75,849.7813 Q3147.3438,849.5 3147.9688,848.9219 L3147.9688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="71" x="3159" y="850.1543">CodeGen_C</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3131" x2="3232" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3131" x2="3232" y1="870" y2="870"/><!--MD5=[b05ffbf1810bcc29bd244a8644dcab5e]
+class CodeGen_CUDA--><rect codeLine="117" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen_CUDA" style="stroke:#A80036;stroke-width:1.5;" width="129" x="3268" y="830"/><ellipse cx="3283" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3285.9688,851.6406 Q3285.3906,851.9375 3284.75,852.0781 Q3284.1094,852.2344 3283.4063,852.2344 Q3280.9063,852.2344 3279.5781,850.5938 Q3278.2656,848.9375 3278.2656,845.8125 Q3278.2656,842.6875 3279.5781,841.0313 Q3280.9063,839.375 3283.4063,839.375 Q3284.1094,839.375 3284.75,839.5313 Q3285.4063,839.6875 3285.9688,839.9844 L3285.9688,842.7031 Q3285.3438,842.125 3284.75,841.8594 Q3284.1563,841.5781 3283.5313,841.5781 Q3282.1875,841.5781 3281.5,842.6563 Q3280.8125,843.7188 3280.8125,845.8125 Q3280.8125,847.9063 3281.5,848.9844 Q3282.1875,850.0469 3283.5313,850.0469 Q3284.1563,850.0469 3284.75,849.7813 Q3285.3438,849.5 3285.9688,848.9219 L3285.9688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="97" x="3297" y="850.1543">CodeGen_CUDA</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3269" x2="3396" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3269" x2="3396" y1="870" y2="870"/><!--MD5=[e6fabe1c34e0f779d9281ebc64edf122]
+class CodeGen_ISPC--><rect codeLine="118" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen_ISPC" style="stroke:#A80036;stroke-width:1.5;" width="122" x="3432.5" y="830"/><ellipse cx="3447.5" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3450.4688,851.6406 Q3449.8906,851.9375 3449.25,852.0781 Q3448.6094,852.2344 3447.9063,852.2344 Q3445.4063,852.2344 3444.0781,850.5938 Q3442.7656,848.9375 3442.7656,845.8125 Q3442.7656,842.6875 3444.0781,841.0313 Q3445.4063,839.375 3447.9063,839.375 Q3448.6094,839.375 3449.25,839.5313 Q3449.9063,839.6875 3450.4688,839.9844 L3450.4688,842.7031 Q3449.8438,842.125 3449.25,841.8594 Q3448.6563,841.5781 3448.0313,841.5781 Q3446.6875,841.5781 3446,842.6563 Q3445.3125,843.7188 3445.3125,845.8125 Q3445.3125,847.9063 3446,848.9844 Q3446.6875,850.0469 3448.0313,850.0469 Q3448.6563,850.0469 3449.25,849.7813 Q3449.8438,849.5 3450.4688,848.9219 L3450.4688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="90" x="3461.5" y="850.1543">CodeGen_ISPC</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3433.5" x2="3553.5" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3433.5" x2="3553.5" y1="870" y2="870"/><!--MD5=[a8e9f8a103380e23aa8687dbc5a94fb7]
+class Manageable--><rect codeLine="126" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Manageable" style="stroke:#A80036;stroke-width:1.5;" width="109" x="1221" y="244"/><ellipse cx="1236" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1238.9688,265.6406 Q1238.3906,265.9375 1237.75,266.0781 Q1237.1094,266.2344 1236.4063,266.2344 Q1233.9063,266.2344 1232.5781,264.5938 Q1231.2656,262.9375 1231.2656,259.8125 Q1231.2656,256.6875 1232.5781,255.0313 Q1233.9063,253.375 1236.4063,253.375 Q1237.1094,253.375 1237.75,253.5313 Q1238.4063,253.6875 1238.9688,253.9844 L1238.9688,256.7031 Q1238.3438,256.125 1237.75,255.8594 Q1237.1563,255.5781 1236.5313,255.5781 Q1235.1875,255.5781 1234.5,256.6563 Q1233.8125,257.7188 1233.8125,259.8125 Q1233.8125,261.9063 1234.5,262.9844 Q1235.1875,264.0469 1236.5313,264.0469 Q1237.1563,264.0469 1237.75,263.7813 Q1238.3438,263.5 1238.9688,262.9219 L1238.9688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="77" x="1250" y="264.1543">Manageable</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1222" x2="1329" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1222" x2="1329" y1="284" y2="284"/><!--MD5=[b230114a6dc80ef25a3e5e6e95ae886a]
+class IndexStmtNode--><rect codeLine="127" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IndexStmtNode" style="stroke:#A80036;stroke-width:1.5;" width="325" x="521" y="554.5"/><ellipse cx="631.75" cy="570.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M634.7188,576.1406 Q634.1406,576.4375 633.5,576.5781 Q632.8594,576.7344 632.1563,576.7344 Q629.6563,576.7344 628.3281,575.0938 Q627.0156,573.4375 627.0156,570.3125 Q627.0156,567.1875 628.3281,565.5313 Q629.6563,563.875 632.1563,563.875 Q632.8594,563.875 633.5,564.0313 Q634.1563,564.1875 634.7188,564.4844 L634.7188,567.2031 Q634.0938,566.625 633.5,566.3594 Q632.9063,566.0781 632.2813,566.0781 Q630.9375,566.0781 630.25,567.1563 Q629.5625,568.2188 629.5625,570.3125 Q629.5625,572.4063 630.25,573.4844 Q630.9375,574.5469 632.2813,574.5469 Q632.9063,574.5469 633.5,574.2813 Q634.0938,574 634.7188,573.4219 L634.7188,576.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="95" x="652.25" y="574.6543">IndexStmtNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="522" x2="845" y1="586.5" y2="586.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="522" x2="845" y1="594.5" y2="594.5"/><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="529" y="602.5"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="299" x="541" y="608.7104">virtual void accept(IndexStmtVisitorStrict*) const = 0</text><!--MD5=[d94a097bbd14b86b446d6c306c6327b3]
+class IndexExprNode--><rect codeLine="130" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IndexExprNode" style="stroke:#A80036;stroke-width:1.5;" width="325" x="1281" y="554.5"/><ellipse cx="1392.75" cy="570.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1395.7188,576.1406 Q1395.1406,576.4375 1394.5,576.5781 Q1393.8594,576.7344 1393.1563,576.7344 Q1390.6563,576.7344 1389.3281,575.0938 Q1388.0156,573.4375 1388.0156,570.3125 Q1388.0156,567.1875 1389.3281,565.5313 Q1390.6563,563.875 1393.1563,563.875 Q1393.8594,563.875 1394.5,564.0313 Q1395.1563,564.1875 1395.7188,564.4844 L1395.7188,567.2031 Q1395.0938,566.625 1394.5,566.3594 Q1393.9063,566.0781 1393.2813,566.0781 Q1391.9375,566.0781 1391.25,567.1563 Q1390.5625,568.2188 1390.5625,570.3125 Q1390.5625,572.4063 1391.25,573.4844 Q1391.9375,574.5469 1393.2813,574.5469 Q1393.9063,574.5469 1394.5,574.2813 Q1395.0938,574 1395.7188,573.4219 L1395.7188,576.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="93" x="1413.25" y="574.6543">IndexExprNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1282" x2="1605" y1="586.5" y2="586.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1282" x2="1605" y1="594.5" y2="594.5"/><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="1289" y="602.5"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="299" x="1301" y="608.7104">virtual void accept(IndexStmtVisitorStrict*) const = 0</text><!--MD5=[2ae3d0d839308205eb4a3976239628b6]
+class IndexStmt--><rect codeLine="140" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexStmt" style="stroke:#A80036;stroke-width:1.5;" width="94" x="636.5" y="244"/><ellipse cx="651.5" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M654.4688,265.6406 Q653.8906,265.9375 653.25,266.0781 Q652.6094,266.2344 651.9063,266.2344 Q649.4063,266.2344 648.0781,264.5938 Q646.7656,262.9375 646.7656,259.8125 Q646.7656,256.6875 648.0781,255.0313 Q649.4063,253.375 651.9063,253.375 Q652.6094,253.375 653.25,253.5313 Q653.9063,253.6875 654.4688,253.9844 L654.4688,256.7031 Q653.8438,256.125 653.25,255.8594 Q652.6563,255.5781 652.0313,255.5781 Q650.6875,255.5781 650,256.6563 Q649.3125,257.7188 649.3125,259.8125 Q649.3125,261.9063 650,262.9844 Q650.6875,264.0469 652.0313,264.0469 Q652.6563,264.0469 653.25,263.7813 Q653.8438,263.5 654.4688,262.9219 L654.4688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="62" x="665.5" y="264.1543">IndexStmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="637.5" x2="729.5" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="637.5" x2="729.5" y1="284" y2="284"/><!--MD5=[97c64a8910e96953a95fad8b92c83bb0]
+class IndexExpr--><rect codeLine="141" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexExpr" style="stroke:#A80036;stroke-width:1.5;" width="92" x="1374.5" y="244"/><ellipse cx="1389.5" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1392.4688,265.6406 Q1391.8906,265.9375 1391.25,266.0781 Q1390.6094,266.2344 1389.9063,266.2344 Q1387.4063,266.2344 1386.0781,264.5938 Q1384.7656,262.9375 1384.7656,259.8125 Q1384.7656,256.6875 1386.0781,255.0313 Q1387.4063,253.375 1389.9063,253.375 Q1390.6094,253.375 1391.25,253.5313 Q1391.9063,253.6875 1392.4688,253.9844 L1392.4688,256.7031 Q1391.8438,256.125 1391.25,255.8594 Q1390.6563,255.5781 1390.0313,255.5781 Q1388.6875,255.5781 1388,256.6563 Q1387.3125,257.7188 1387.3125,259.8125 Q1387.3125,261.9063 1388,262.9844 Q1388.6875,264.0469 1390.0313,264.0469 Q1390.6563,264.0469 1391.25,263.7813 Q1391.8438,263.5 1392.4688,262.9219 L1392.4688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="60" x="1403.5" y="264.1543">IndexExpr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1375.5" x2="1465.5" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1375.5" x2="1465.5" y1="284" y2="284"/><!--MD5=[28b0f4e593c8487512a9debc1bac1917]
+class IndexExprVisitorStrict--><rect codeLine="149" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="201.6563" id="IndexExprVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="283" x="1641" y="484"/><ellipse cx="1711.25" cy="500" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1711.3594,495.3438 L1710.2031,500.4219 L1712.5313,500.4219 L1711.3594,495.3438 Z M1709.875,493.1094 L1712.8594,493.1094 L1716.2188,505.5 L1713.7656,505.5 L1713,502.4375 L1709.7188,502.4375 L1708.9688,505.5 L1706.5313,505.5 L1709.875,493.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="134" x="1731.75" y="504.1543">IndexExprVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1642" x2="1923" y1="516" y2="516"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1642" x2="1923" y1="524" y2="524"/><ellipse cx="1652" cy="535" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="160" x="1661" y="538.2104">void visit(const IndexStmt&amp;)</text><ellipse cx="1652" cy="547.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="1661" y="551.0151">virtual void visit(const AccessNode*) = 0</text><ellipse cx="1652" cy="560.6094" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1661" y="563.8198">virtual void visit(const LiteralNode*) = 0</text><ellipse cx="1652" cy="573.4141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1661" y="576.6245">virtual void visit(const 
NegNode*) = 0</text><ellipse cx="1652" cy="586.2188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="212" x="1661" y="589.4292">virtual void visit(const AddNode*) = 0</text><ellipse cx="1652" cy="599.0234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1661" y="602.2339">virtual void visit(const SubNode*) = 0</text><ellipse cx="1652" cy="611.8281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="210" x="1661" y="615.0386">virtual void visit(const MulNode*) = 0</text><ellipse cx="1652" cy="624.6328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="208" x="1661" y="627.8433">virtual void visit(const DivNode*) = 0</text><ellipse cx="1652" cy="637.4375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="215" x="1661" y="640.6479">virtual void visit(const SqrtNode*) = 0</text><ellipse cx="1652" cy="650.2422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="217" x="1661" y="653.4526">virtual void visit(const CastNode*) = 0</text><ellipse cx="1652" cy="663.0469" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="257" x="1661" y="666.2573">virtual void visit(const CallIntrinsicNode*) = 0</text><ellipse cx="1652" cy="675.8516" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="246" x="1661" y="679.062">virtual void visit(const ReductionNode*) = 0</text><!--MD5=[a89aadb6ea0d27c41410991969988628]
+class IndexStmtVisitorStrict--><rect codeLine="163" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="163.2422" id="IndexStmtVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="284" x="1968.5" y="503.5"/><ellipse cx="2038.75" cy="519.5" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2038.8594,514.8438 L2037.7031,519.9219 L2040.0313,519.9219 L2038.8594,514.8438 Z M2037.375,512.6094 L2040.3594,512.6094 L2043.7188,525 L2041.2656,525 L2040.5,521.9375 L2037.2188,521.9375 L2036.4688,525 L2034.0313,525 L2037.375,512.6094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="135" x="2059.25" y="523.6543">IndexStmtVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2251.5" y1="535.5" y2="535.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2251.5" y1="543.5" y2="543.5"/><ellipse cx="1979.5" cy="554.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="160" x="1988.5" y="557.7104">void visit(const IndexStmt&amp;)</text><ellipse cx="1979.5" cy="567.3047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="258" x="1988.5" y="570.5151">virtual void visit(const AssignmentNode*) = 0</text><ellipse cx="1979.5" cy="580.1094" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1988.5" y="583.3198">virtual void visit(const YieldNode*) = 0</text><ellipse cx="1979.5" cy="592.9141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="222" x="1988.5" 
y="596.1245">virtual void visit(const ForallNode*) = 0</text><ellipse cx="1979.5" cy="605.7188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1988.5" y="608.9292">virtual void visit(const WhereNode*) = 0</text><ellipse cx="1979.5" cy="618.5234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="247" x="1988.5" y="621.7339">virtual void visit(const SequenceNode*) = 0</text><ellipse cx="1979.5" cy="631.3281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="247" x="1988.5" y="634.5386">virtual void visit(const AssembleNode*) = 0</text><ellipse cx="1979.5" cy="644.1328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="217" x="1988.5" y="647.3433">virtual void visit(const MultiNode*) = 0</text><ellipse cx="1979.5" cy="656.9375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="244" x="1988.5" y="660.1479">virtual void visit(const SuchThatNode*) = 0</text><!--MD5=[b74718248e125c8ad329889fd2a32c16]
+class IndexNotationVisitorStrict--><rect codeLine="175" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexNotationVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="192" x="1404.5" y="830"/><ellipse cx="1419.5" cy="846" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1419.6094,841.3438 L1418.4531,846.4219 L1420.7813,846.4219 L1419.6094,841.3438 Z M1418.125,839.1094 L1421.1094,839.1094 L1424.4688,851.5 L1422.0156,851.5 L1421.25,848.4375 L1417.9688,848.4375 L1417.2188,851.5 L1414.7813,851.5 L1418.125,839.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="160" x="1433.5" y="850.1543">IndexNotationVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1405.5" x2="1595.5" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1405.5" x2="1595.5" y1="870" y2="870"/><!--MD5=[cb464207dbcea0ece296242645495747]
+class IndexNotationPrinter--><rect codeLine="176" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="368.1172" id="IndexNotationPrinter" style="stroke:#A80036;stroke-width:1.5;" width="253" x="1301" y="1044.5"/><ellipse cx="1358.75" cy="1060.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1361.7188,1066.1406 Q1361.1406,1066.4375 1360.5,1066.5781 Q1359.8594,1066.7344 1359.1563,1066.7344 Q1356.6563,1066.7344 1355.3281,1065.0938 Q1354.0156,1063.4375 1354.0156,1060.3125 Q1354.0156,1057.1875 1355.3281,1055.5313 Q1356.6563,1053.875 1359.1563,1053.875 Q1359.8594,1053.875 1360.5,1054.0313 Q1361.1563,1054.1875 1361.7188,1054.4844 L1361.7188,1057.2031 Q1361.0938,1056.625 1360.5,1056.3594 Q1359.9063,1056.0781 1359.2813,1056.0781 Q1357.9375,1056.0781 1357.25,1057.1563 Q1356.5625,1058.2188 1356.5625,1060.3125 Q1356.5625,1062.4063 1357.25,1063.4844 Q1357.9375,1064.5469 1359.2813,1064.5469 Q1359.9063,1064.5469 1360.5,1064.2813 Q1361.0938,1064 1361.7188,1063.4219 L1361.7188,1066.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="129" x="1379.25" y="1064.6543">IndexNotationPrinter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1302" x2="1553" y1="1076.5" y2="1076.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1302" x2="1553" y1="1084.5" y2="1084.5"/><ellipse cx="1312" cy="1095.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="190" x="1321" y="1098.7104">void print(const IndexExpr&amp; expr)</text><ellipse cx="1312" cy="1108.3047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="192" x="1321" y="1111.5151">void print(const IndexStmt&amp; expr)</text><text fill="#000000" font-family="sans-serif" font-size="11" 
lengthAdjust="spacing" textLength="0" x="1325" y="1124.3198"/><ellipse cx="1312" cy="1133.9141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="200" x="1321" y="1137.1245">void visit(const AccessNode* node)</text><ellipse cx="1312" cy="1146.7188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1321" y="1149.9292">void visit(const LiteralNode* node)</text><ellipse cx="1312" cy="1159.5234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1321" y="1162.7339">void visit(const NegNode* node)</text><ellipse cx="1312" cy="1172.3281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="181" x="1321" y="1175.5386">void visit(const AddNode* node)</text><ellipse cx="1312" cy="1185.1328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1321" y="1188.3433">void visit(const SubNode* node)</text><ellipse cx="1312" cy="1197.9375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="179" x="1321" y="1201.1479">void visit(const MulNode* node)</text><ellipse cx="1312" cy="1210.7422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="177" x="1321" y="1213.9526">void visit(const DivNode* node)</text><ellipse cx="1312" cy="1223.5469" fill="#84BE84" rx="3" 
ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="184" x="1321" y="1226.7573">void visit(const SqrtNode* node)</text><ellipse cx="1312" cy="1236.3516" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1321" y="1239.562">void visit(const CastNode* node)</text><ellipse cx="1312" cy="1249.1563" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1321" y="1252.3667">void visit(const CallIntrinsicNode* node)</text><ellipse cx="1312" cy="1261.9609" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1321" y="1265.1714">void visit(const UnaryExprNode* node)</text><ellipse cx="1312" cy="1274.7656" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1321" y="1277.9761">void visit(const BinaryExprNode* node)</text><ellipse cx="1312" cy="1287.5703" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="215" x="1321" y="1290.7808">void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1325" y="1303.5854"/><ellipse cx="1312" cy="1313.1797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="227" x="1321" y="1316.3901">void visit(const AssignmentNode* node)</text><ellipse cx="1312" 
cy="1325.9844" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="187" x="1321" y="1329.1948">void visit(const YieldNode* node)</text><ellipse cx="1312" cy="1338.7891" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="191" x="1321" y="1341.9995">void visit(const ForallNode* node)</text><ellipse cx="1312" cy="1351.5938" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1321" y="1354.8042">void visit(const WhereNode* node)</text><ellipse cx="1312" cy="1364.3984" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1321" y="1367.6089">void visit(const SequenceNode* node)</text><ellipse cx="1312" cy="1377.2031" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1321" y="1380.4136">void visit(const AssembleNode* node)</text><ellipse cx="1312" cy="1390.0078" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1321" y="1393.2183">void visit(const MultiNode* node)</text><ellipse cx="1312" cy="1402.8125" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1321" y="1406.0229">void visit(const SuchThatNode* node)</text><!--MD5=[1889949f301ae6d76cb20e56f2d1d951]
+class IndexNotationVisitor--><rect codeLine="205" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="329.7031" id="IndexNotationVisitor" style="stroke:#A80036;stroke-width:1.5;" width="292" x="1589.5" y="1063.5"/><ellipse cx="1668.25" cy="1079.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1671.2188,1085.1406 Q1670.6406,1085.4375 1670,1085.5781 Q1669.3594,1085.7344 1668.6563,1085.7344 Q1666.1563,1085.7344 1664.8281,1084.0938 Q1663.5156,1082.4375 1663.5156,1079.3125 Q1663.5156,1076.1875 1664.8281,1074.5313 Q1666.1563,1072.875 1668.6563,1072.875 Q1669.3594,1072.875 1670,1073.0313 Q1670.6563,1073.1875 1671.2188,1073.4844 L1671.2188,1076.2031 Q1670.5938,1075.625 1670,1075.3594 Q1669.4063,1075.0781 1668.7813,1075.0781 Q1667.4375,1075.0781 1666.75,1076.1563 Q1666.0625,1077.2188 1666.0625,1079.3125 Q1666.0625,1081.4063 1666.75,1082.4844 Q1667.4375,1083.5469 1668.7813,1083.5469 Q1669.4063,1083.5469 1670,1083.2813 Q1670.5938,1083 1671.2188,1082.4219 L1671.2188,1085.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="126" x="1688.75" y="1083.6543">IndexNotationVisitor</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1590.5" x2="1880.5" y1="1095.5" y2="1095.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1590.5" x2="1880.5" y1="1103.5" y2="1103.5"/><ellipse cx="1600.5" cy="1114.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="239" x="1609.5" y="1117.7104">virtual void visit(const AccessNode* node)</text><ellipse cx="1600.5" cy="1127.3047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1609.5" y="1130.5151">virtual void visit(const LiteralNode* node)</text><ellipse cx="1600.5" cy="1140.1094" fill="#84BE84" 
rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1609.5" y="1143.3198">virtual void visit(const NegNode* node)</text><ellipse cx="1600.5" cy="1152.9141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="220" x="1609.5" y="1156.1245">virtual void visit(const AddNode* node)</text><ellipse cx="1600.5" cy="1165.7188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1609.5" y="1168.9292">virtual void visit(const SubNode* node)</text><ellipse cx="1600.5" cy="1178.5234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1609.5" y="1181.7339">virtual void visit(const MulNode* node)</text><ellipse cx="1600.5" cy="1191.3281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1609.5" y="1194.5386">virtual void visit(const DivNode* node)</text><ellipse cx="1600.5" cy="1204.1328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="223" x="1609.5" y="1207.3433">virtual void visit(const SqrtNode* node)</text><ellipse cx="1600.5" cy="1216.9375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1609.5" y="1220.1479">virtual void visit(const CastNode* node)</text><ellipse cx="1600.5" cy="1229.7422" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1609.5" y="1232.9526">virtual void visit(const CallIntrinsicNode* node)</text><ellipse cx="1600.5" cy="1242.5469" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="257" x="1609.5" y="1245.7573">virtual void visit(const UnaryExprNode* node)</text><ellipse cx="1600.5" cy="1255.3516" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="260" x="1609.5" y="1258.562">virtual void visit(const BinaryExprNode* node)</text><ellipse cx="1600.5" cy="1268.1563" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="254" x="1609.5" y="1271.3667">virtual void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1613.5" y="1284.1714"/><ellipse cx="1600.5" cy="1293.7656" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="266" x="1609.5" y="1296.9761">virtual void visit(const AssignmentNode* node)</text><ellipse cx="1600.5" cy="1306.5703" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1609.5" y="1309.7808">virtual void visit(const YieldNode* node)</text><ellipse cx="1600.5" cy="1319.375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="230" x="1609.5" 
y="1322.5854">virtual void visit(const ForallNode* node)</text><ellipse cx="1600.5" cy="1332.1797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1609.5" y="1335.3901">virtual void visit(const WhereNode* node)</text><ellipse cx="1600.5" cy="1344.9844" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1609.5" y="1348.1948">virtual void visit(const SequenceNode* node)</text><ellipse cx="1600.5" cy="1357.7891" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1609.5" y="1360.9995">virtual void visit(const AssembleNode* node)</text><ellipse cx="1600.5" cy="1370.5938" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1609.5" y="1373.8042">virtual void visit(const MultiNode* node)</text><ellipse cx="1600.5" cy="1383.3984" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="252" x="1609.5" y="1386.6089">virtual void visit(const SuchThatNode* node)</text><!--MD5=[c249847c086044a14a4ecd1d09905030]
+class Matcher--><rect codeLine="231" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Matcher" style="stroke:#A80036;stroke-width:1.5;" width="83" x="1694" y="1621"/><ellipse cx="1709" cy="1637" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1711.9688,1642.6406 Q1711.3906,1642.9375 1710.75,1643.0781 Q1710.1094,1643.2344 1709.4063,1643.2344 Q1706.9063,1643.2344 1705.5781,1641.5938 Q1704.2656,1639.9375 1704.2656,1636.8125 Q1704.2656,1633.6875 1705.5781,1632.0313 Q1706.9063,1630.375 1709.4063,1630.375 Q1710.1094,1630.375 1710.75,1630.5313 Q1711.4063,1630.6875 1711.9688,1630.9844 L1711.9688,1633.7031 Q1711.3438,1633.125 1710.75,1632.8594 Q1710.1563,1632.5781 1709.5313,1632.5781 Q1708.1875,1632.5781 1707.5,1633.6563 Q1706.8125,1634.7188 1706.8125,1636.8125 Q1706.8125,1638.9063 1707.5,1639.9844 Q1708.1875,1641.0469 1709.5313,1641.0469 Q1710.1563,1641.0469 1710.75,1640.7813 Q1711.3438,1640.5 1711.9688,1639.9219 L1711.9688,1642.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="51" x="1723" y="1641.1543">Matcher</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1695" x2="1776" y1="1653" y2="1653"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1695" x2="1776" y1="1661" y2="1661"/><!--MD5=[ea8f53988b378f12e96f95ad2b8e8e7e]
+class IndexExprRewriterStrict--><rect codeLine="235" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="214.4609" id="IndexExprRewriterStrict" style="stroke:#A80036;stroke-width:1.5;" width="301" x="1632" y="747"/><ellipse cx="1704.25" cy="763" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1704.3594,758.3438 L1703.2031,763.4219 L1705.5313,763.4219 L1704.3594,758.3438 Z M1702.875,756.1094 L1705.8594,756.1094 L1709.2188,768.5 L1706.7656,768.5 L1706,765.4375 L1702.7188,765.4375 L1701.9688,768.5 L1699.5313,768.5 L1702.875,756.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="148" x="1724.75" y="767.1543">IndexExprRewriterStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1633" x2="1932" y1="779" y2="779"/><polygon fill="none" points="1643,785,1647,789,1643,793,1639,789" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="84" x="1652" y="793.2104">IndexExpr expr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1633" x2="1932" y1="799.8047" y2="799.8047"/><ellipse cx="1643" cy="810.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="162" x="1652" y="814.0151">IndexExpr rewrite(IndexExpr)</text><polygon fill="#FFFF44" points="1643,818.6094,1647,822.6094,1643,826.6094,1639,822.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="249" x="1652" y="826.8198">virtual void visit(const AccessNode* op) = 0</text><polygon fill="#FFFF44" points="1643,831.4141,1647,835.4141,1643,839.4141,1639,835.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="244" 
x="1652" y="839.6245">virtual void visit(const LiteralNode* op) = 0</text><polygon fill="#FFFF44" points="1643,844.2188,1647,848.2188,1643,852.2188,1639,848.2188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="1652" y="852.4292">virtual void visit(const NegNode* op) = 0</text><polygon fill="#FFFF44" points="1643,857.0234,1647,861.0234,1643,865.0234,1639,861.0234" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="233" x="1652" y="865.2339">virtual void visit(const SqrtNode* op) = 0</text><polygon fill="#FFFF44" points="1643,869.8281,1647,873.8281,1643,877.8281,1639,873.8281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="230" x="1652" y="878.0386">virtual void visit(const AddNode* op) = 0</text><polygon fill="#FFFF44" points="1643,882.6328,1647,886.6328,1643,890.6328,1639,886.6328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="1652" y="890.8433">virtual void visit(const SubNode* op) = 0</text><polygon fill="#FFFF44" points="1643,895.4375,1647,899.4375,1643,903.4375,1639,899.4375" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="228" x="1652" y="903.6479">virtual void visit(const MulNode* op) = 0</text><polygon fill="#FFFF44" points="1643,908.2422,1647,912.2422,1643,916.2422,1639,912.2422" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1652" y="916.4526">virtual void visit(const DivNode* op) = 0</text><polygon fill="#FFFF44" points="1643,921.0469,1647,925.0469,1643,929.0469,1639,925.0469" 
style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="235" x="1652" y="929.2573">virtual void visit(const CastNode* op) = 0</text><polygon fill="#FFFF44" points="1643,933.8516,1647,937.8516,1643,941.8516,1639,937.8516" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="275" x="1652" y="942.062">virtual void visit(const CallIntrinsicNode* op) = 0</text><polygon fill="#FFFF44" points="1643,946.6563,1647,950.6563,1643,954.6563,1639,950.6563" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="264" x="1652" y="954.8667">virtual void visit(const ReductionNode* op) = 0</text><!--MD5=[fce5a5c177cad31ce6c931f148bb8f55]
+class IndexStmtRewriterStrict--><rect codeLine="252" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="176.0469" id="IndexStmtRewriterStrict" style="stroke:#A80036;stroke-width:1.5;" width="302" x="1968.5" y="766"/><ellipse cx="2040.75" cy="782" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2040.8594,777.3438 L2039.7031,782.4219 L2042.0313,782.4219 L2040.8594,777.3438 Z M2039.375,775.1094 L2042.3594,775.1094 L2045.7188,787.5 L2043.2656,787.5 L2042.5,784.4375 L2039.2188,784.4375 L2038.4688,787.5 L2036.0313,787.5 L2039.375,775.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="149" x="2061.25" y="786.1543">IndexStmtRewriterStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2269.5" y1="798" y2="798"/><polygon fill="none" points="1979.5,804,1983.5,808,1979.5,812,1975.5,808" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="87" x="1988.5" y="812.2104">IndexStmt stmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2269.5" y1="818.8047" y2="818.8047"/><ellipse cx="1979.5" cy="829.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="166" x="1988.5" y="833.0151">IndexStmt rewrite(IndexStmt)</text><polygon fill="#FFFF44" points="1979.5,837.6094,1983.5,841.6094,1979.5,845.6094,1975.5,841.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="276" x="1988.5" y="845.8198">virtual void visit(const AssignmentNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,850.4141,1983.5,854.4141,1979.5,858.4141,1975.5,854.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" 
font-size="11" lengthAdjust="spacing" textLength="236" x="1988.5" y="858.6245">virtual void visit(const YieldNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,863.2188,1983.5,867.2188,1979.5,871.2188,1975.5,867.2188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="240" x="1988.5" y="871.4292">virtual void visit(const ForallNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,876.0234,1983.5,880.0234,1979.5,884.0234,1975.5,880.0234" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="244" x="1988.5" y="884.2339">virtual void visit(const WhereNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,888.8281,1983.5,892.8281,1979.5,896.8281,1975.5,892.8281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1988.5" y="897.0386">virtual void visit(const SequenceNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,901.6328,1983.5,905.6328,1979.5,909.6328,1975.5,905.6328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1988.5" y="909.8433">virtual void visit(const AssembleNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,914.4375,1983.5,918.4375,1979.5,922.4375,1975.5,918.4375" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="235" x="1988.5" y="922.6479">virtual void visit(const MultiNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,927.2422,1983.5,931.2422,1979.5,935.2422,1975.5,931.2422" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="262" x="1988.5" y="935.4526">virtual void visit(const 
SuchThatNode* op) = 0</text><!--MD5=[c34474f968cd689ed26c36a6e449f9a5]
+class IndexNotationRewriterStrict--><rect codeLine="266" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexNotationRewriterStrict" style="stroke:#A80036;stroke-width:1.5;" width="206" x="1966.5" y="1204.5"/><ellipse cx="1981.5" cy="1220.5" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1981.6094,1215.8438 L1980.4531,1220.9219 L1982.7813,1220.9219 L1981.6094,1215.8438 Z M1980.125,1213.6094 L1983.1094,1213.6094 L1986.4688,1226 L1984.0156,1226 L1983.25,1222.9375 L1979.9688,1222.9375 L1979.2188,1226 L1976.7813,1226 L1980.125,1213.6094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="174" x="1995.5" y="1224.6543">IndexNotationRewriterStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1967.5" x2="2171.5" y1="1236.5" y2="1236.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1967.5" x2="2171.5" y1="1244.5" y2="1244.5"/><!--MD5=[f43b50a501af9b122d481161df5564ac]
+class IndexNotationRewriter--><rect codeLine="267" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="329.7031" id="IndexNotationRewriter" style="stroke:#A80036;stroke-width:1.5;" width="292" x="1923.5" y="1480"/><ellipse cx="1996.25" cy="1496" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1999.2188,1501.6406 Q1998.6406,1501.9375 1998,1502.0781 Q1997.3594,1502.2344 1996.6563,1502.2344 Q1994.1563,1502.2344 1992.8281,1500.5938 Q1991.5156,1498.9375 1991.5156,1495.8125 Q1991.5156,1492.6875 1992.8281,1491.0313 Q1994.1563,1489.375 1996.6563,1489.375 Q1997.3594,1489.375 1998,1489.5313 Q1998.6563,1489.6875 1999.2188,1489.9844 L1999.2188,1492.7031 Q1998.5938,1492.125 1998,1491.8594 Q1997.4063,1491.5781 1996.7813,1491.5781 Q1995.4375,1491.5781 1994.75,1492.6563 Q1994.0625,1493.7188 1994.0625,1495.8125 Q1994.0625,1497.9063 1994.75,1498.9844 Q1995.4375,1500.0469 1996.7813,1500.0469 Q1997.4063,1500.0469 1998,1499.7813 Q1998.5938,1499.5 1999.2188,1498.9219 L1999.2188,1501.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="138" x="2016.75" y="1500.1543">IndexNotationRewriter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1924.5" x2="2214.5" y1="1512" y2="1512"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1924.5" x2="2214.5" y1="1520" y2="1520"/><ellipse cx="1934.5" cy="1531" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="239" x="1943.5" y="1534.2104">virtual void visit(const AccessNode* node)</text><ellipse cx="1934.5" cy="1543.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1943.5" y="1547.0151">virtual void visit(const LiteralNode* node)</text><ellipse cx="1934.5" cy="1556.6094" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1943.5" y="1559.8198">virtual void visit(const NegNode* node)</text><ellipse cx="1934.5" cy="1569.4141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="220" x="1943.5" y="1572.6245">virtual void visit(const AddNode* node)</text><ellipse cx="1934.5" cy="1582.2188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1943.5" y="1585.4292">virtual void visit(const SubNode* node)</text><ellipse cx="1934.5" cy="1595.0234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1943.5" y="1598.2339">virtual void visit(const MulNode* node)</text><ellipse cx="1934.5" cy="1607.8281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1943.5" y="1611.0386">virtual void visit(const DivNode* node)</text><ellipse cx="1934.5" cy="1620.6328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="223" x="1943.5" y="1623.8433">virtual void visit(const SqrtNode* node)</text><ellipse cx="1934.5" cy="1633.4375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1943.5" y="1636.6479">virtual void visit(const CastNode* node)</text><ellipse cx="1934.5" cy="1646.2422" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1943.5" y="1649.4526">virtual void visit(const CallIntrinsicNode* node)</text><ellipse cx="1934.5" cy="1659.0469" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="257" x="1943.5" y="1662.2573">virtual void visit(const UnaryExprNode* node)</text><ellipse cx="1934.5" cy="1671.8516" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="260" x="1943.5" y="1675.062">virtual void visit(const BinaryExprNode* node)</text><ellipse cx="1934.5" cy="1684.6563" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="254" x="1943.5" y="1687.8667">virtual void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1947.5" y="1700.6714"/><ellipse cx="1934.5" cy="1710.2656" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="266" x="1943.5" y="1713.4761">virtual void visit(const AssignmentNode* node)</text><ellipse cx="1934.5" cy="1723.0703" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1943.5" y="1726.2808">virtual void visit(const YieldNode* node)</text><ellipse cx="1934.5" cy="1735.875" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="230" x="1943.5" 
y="1739.0854">virtual void visit(const ForallNode* node)</text><ellipse cx="1934.5" cy="1748.6797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1943.5" y="1751.8901">virtual void visit(const WhereNode* node)</text><ellipse cx="1934.5" cy="1761.4844" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1943.5" y="1764.6948">virtual void visit(const SequenceNode* node)</text><ellipse cx="1934.5" cy="1774.2891" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1943.5" y="1777.4995">virtual void visit(const AssembleNode* node)</text><ellipse cx="1934.5" cy="1787.0938" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1943.5" y="1790.3042">virtual void visit(const MultiNode* node)</text><ellipse cx="1934.5" cy="1799.8984" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="252" x="1943.5" y="1803.1089">virtual void visit(const SuchThatNode* node)</text><!--MD5=[2bd6b9bd378d282739bad95694e0395c]
+class Lowerer--><rect codeLine="317" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="Lowerer" style="stroke:#A80036;stroke-width:1.5;" width="234" x="946.5" y="237.5"/><ellipse cx="1034.75" cy="253.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1037.7188,259.1406 Q1037.1406,259.4375 1036.5,259.5781 Q1035.8594,259.7344 1035.1563,259.7344 Q1032.6563,259.7344 1031.3281,258.0938 Q1030.0156,256.4375 1030.0156,253.3125 Q1030.0156,250.1875 1031.3281,248.5313 Q1032.6563,246.875 1035.1563,246.875 Q1035.8594,246.875 1036.5,247.0313 Q1037.1563,247.1875 1037.7188,247.4844 L1037.7188,250.2031 Q1037.0938,249.625 1036.5,249.3594 Q1035.9063,249.0781 1035.2813,249.0781 Q1033.9375,249.0781 1033.25,250.1563 Q1032.5625,251.2188 1032.5625,253.3125 Q1032.5625,255.4063 1033.25,256.4844 Q1033.9375,257.5469 1035.2813,257.5469 Q1035.9063,257.5469 1036.5,257.2813 Q1037.0938,257 1037.7188,256.4219 L1037.7188,259.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="49" x="1055.25" y="257.6543">Lowerer</text><line style="stroke:#A80036;stroke-width:1.5;" x1="947.5" x2="1179.5" y1="269.5" y2="269.5"/><ellipse cx="957.5" cy="280.5" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;fill:none;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="208" x="966.5" y="283.7104">std::shared_ptr&lt;LowererImpl&gt; impl;</text><line style="stroke:#A80036;stroke-width:1.5;" x1="947.5" x2="1179.5" y1="290.3047" y2="290.3047"/><!--MD5=[b7b8bc7e8eb8ee18eadc3b8fd556bfb2]
+class LowererImpl--><rect codeLine="320" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="188.8516" id="LowererImpl" style="stroke:#A80036;stroke-width:1.5;" width="365" x="881" y="490.5"/><ellipse cx="1020.75" cy="506.5" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1020.8594,501.8438 L1019.7031,506.9219 L1022.0313,506.9219 L1020.8594,501.8438 Z M1019.375,499.6094 L1022.3594,499.6094 L1025.7188,512 L1023.2656,512 L1022.5,508.9375 L1019.2188,508.9375 L1018.4688,512 L1016.0313,512 L1019.375,499.6094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="77" x="1041.25" y="510.6543">LowererImpl</text><line style="stroke:#A80036;stroke-width:1.5;" x1="882" x2="1245" y1="522.5" y2="522.5"/><polygon fill="none" points="892,528.5,896,532.5,892,536.5,888,532.5" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="74" x="901" y="536.7104">class Visitor;</text><polygon fill="none" points="892,541.3047,896,545.3047,892,549.3047,888,545.3047" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="111" x="901" y="549.5151">friend class Visitor;</text><polygon fill="none" points="892,554.1094,896,558.1094,892,562.1094,888,558.1094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="185" x="901" y="562.3198">std::shared_ptr&lt;Visitor&gt; visitor;</text><line style="stroke:#A80036;stroke-width:1.5;" x1="882" x2="1245" y1="568.9141" y2="568.9141"/><polygon fill="#FFFF44" points="892,574.9141,896,578.9141,892,582.9141,888,578.9141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="901" y="583.1245">virtual ir::Stmt 
lower(IndexStmt stmt);</text><polygon fill="#FFFF44" points="892,587.7188,896,591.7188,892,595.7188,888,591.7188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="211" x="901" y="595.9292">virtual ir::Expr lower(IndexExpr expr);</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="905" y="608.7339"/><polygon fill="#FFFF44" points="892,613.3281,896,617.3281,892,621.3281,888,617.3281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="260" x="901" y="621.5386">virtual ir::Expr lowerExpr(IndexExpr expr) = 0;</text><polygon fill="#FFFF44" points="892,626.1328,896,630.1328,892,634.1328,888,630.1328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="267" x="901" y="634.3433">virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="905" y="647.1479"/><ellipse cx="892" cy="656.7422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="315" x="901" y="659.9526">virtual ir::Stmt lower(IndexStmt stmt, std::string name,</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="339" x="901" y="672.7573">bool assemble, bool compute, bool pack, bool unpack) = 0;</text><!--MD5=[cf3b4bcfbe7bc4015089b336f3e5ed76]
+class LowererImplImperative--><rect codeLine="337" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="188.8516" id="LowererImplImperative" style="stroke:#A80036;stroke-width:1.5;" width="337" x="691" y="759.5"/><ellipse cx="785.75" cy="775.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M788.7188,781.1406 Q788.1406,781.4375 787.5,781.5781 Q786.8594,781.7344 786.1563,781.7344 Q783.6563,781.7344 782.3281,780.0938 Q781.0156,778.4375 781.0156,775.3125 Q781.0156,772.1875 782.3281,770.5313 Q783.6563,768.875 786.1563,768.875 Q786.8594,768.875 787.5,769.0313 Q788.1563,769.1875 788.7188,769.4844 L788.7188,772.2031 Q788.0938,771.625 787.5,771.3594 Q786.9063,771.0781 786.2813,771.0781 Q784.9375,771.0781 784.25,772.1563 Q783.5625,773.2188 783.5625,775.3125 Q783.5625,777.4063 784.25,778.4844 Q784.9375,779.5469 786.2813,779.5469 Q786.9063,779.5469 787.5,779.2813 Q788.0938,779 788.7188,778.4219 L788.7188,781.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="139" x="806.25" y="779.6543">LowererImplImperative</text><line style="stroke:#A80036;stroke-width:1.5;" x1="692" x2="1027" y1="791.5" y2="791.5"/><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="799.5"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="70" x="711" y="805.7104">class Visitor</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="812.3047"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="102" x="711" y="818.5151">fiend class Visitor</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="825.1094"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="181" x="711" y="831.3198">std::shared_ptr&lt;Visitor&gt; visitor</text><rect fill="none" height="6" 
style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="837.9141"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="84" x="711" y="844.1245">bool assemble</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="850.7188"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="77" x="711" y="856.9292">bool compute</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="863.5234"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="166" x="711" y="869.7339">vars a_bunch_of_other_fields</text><line style="stroke:#A80036;stroke-width:1.5;" x1="692" x2="1027" y1="876.3281" y2="876.3281"/><polygon fill="#FFFF44" points="702,882.3281,706,886.3281,702,890.3281,698,886.3281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="238" x="711" y="890.5386">virtual ir::Stmt lowerExpr(IndexExpr expr);</text><polygon fill="#FFFF44" points="702,895.1328,706,899.1328,702,903.1328,698,899.1328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="243" x="711" y="903.3433">virtual ir::Stmt lowerStmt(IndexStmt stmt);</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="715" y="916.1479"/><ellipse cx="702" cy="925.7422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="276" x="711" y="928.9526">ir::Stmt lower(IndexStmt stmt, std::string name,</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="311" x="711" y="941.7573">bool assemble, bool compute, bool pack, bool unpack)</text><path 
d="M581,1201 L581,1256.3984 A0,0 0 0 0 581,1256.3984 L946,1256.3984 A0,0 0 0 0 946,1256.3984 L946,1211 L936,1201 L774.6103,1201 L835.2751,948.5022 L766.6103,1201 L581,1201 A0,0 0 0 0 581,1201 " fill="#FBFB77" filter="url(#fujoep6dbpit)" style="stroke:#A80036;stroke-width:1.0;"/><path d="M936,1201 L936,1211 L946,1211 L936,1201 " fill="#FBFB77" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="344" x="587" y="1218.0669">Stmt LowererImplImperative::lower(IndexStmt stmt) {</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="174" x="595" y="1233.1997">return visitor-&gt;lower(stmt);</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="587" y="1248.3325">}</text><!--MD5=[53bf68ed638bcf4718423098b3d480ea]
+class Visitor--><rect codeLine="362" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="380.9219" id="Visitor" style="stroke:#A80036;stroke-width:1.5;" width="253" x="981" y="1038"/><ellipse cx="1083.75" cy="1054" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1086.7188,1059.6406 Q1086.1406,1059.9375 1085.5,1060.0781 Q1084.8594,1060.2344 1084.1563,1060.2344 Q1081.6563,1060.2344 1080.3281,1058.5938 Q1079.0156,1056.9375 1079.0156,1053.8125 Q1079.0156,1050.6875 1080.3281,1049.0313 Q1081.6563,1047.375 1084.1563,1047.375 Q1084.8594,1047.375 1085.5,1047.5313 Q1086.1563,1047.6875 1086.7188,1047.9844 L1086.7188,1050.7031 Q1086.0938,1050.125 1085.5,1049.8594 Q1084.9063,1049.5781 1084.2813,1049.5781 Q1082.9375,1049.5781 1082.25,1050.6563 Q1081.5625,1051.7188 1081.5625,1053.8125 Q1081.5625,1055.9063 1082.25,1056.9844 Q1082.9375,1058.0469 1084.2813,1058.0469 Q1084.9063,1058.0469 1085.5,1057.7813 Q1086.0938,1057.5 1086.7188,1056.9219 L1086.7188,1059.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="39" x="1104.25" y="1058.1543">Visitor</text><line style="stroke:#A80036;stroke-width:1.5;" x1="982" x2="1233" y1="1070" y2="1070"/><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1078"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="104" x="1001" y="1084.2104">LowererImpl* impl</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1090.8047"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="54" x="1001" y="1097.0151">Expr expr</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1103.6094"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="57" x="1001" y="1109.8198">Stmt stmt</text><line 
style="stroke:#A80036;stroke-width:1.5;" x1="982" x2="1233" y1="1116.4141" y2="1116.4141"/><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1124.4141"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="227" x="1001" y="1130.6245">void visit(const AssignmentNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1137.2188"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="187" x="1001" y="1143.4292">void visit(const YieldNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1150.0234"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="191" x="1001" y="1156.2339">void visit(const ForallNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1162.8281"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1001" y="1169.0386">void visit(const WhereNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1175.6328"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1001" y="1181.8433">void visit(const MultiNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1188.4375"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1001" y="1194.6479">void visit(const SuchThatNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1201.2422"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1001" y="1207.4526">void visit(const SequenceNode* node)</text><rect 
fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1214.0469"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1001" y="1220.2573">void visit(const AssembleNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1226.8516"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="200" x="1001" y="1233.062">void visit(const AccessNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1239.6563"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1001" y="1245.8667">void visit(const LiteralNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1252.4609"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1001" y="1258.6714">void visit(const NegNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1265.2656"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="181" x="1001" y="1271.4761">void visit(const AddNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1278.0703"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1001" y="1284.2808">void visit(const SubNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1290.875"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="179" x="1001" y="1297.0854">void visit(const MulNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1303.6797"/><text 
fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="177" x="1001" y="1309.8901">void visit(const DivNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1316.4844"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="184" x="1001" y="1322.6948">void visit(const SqrtNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1329.2891"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1001" y="1335.4995">void visit(const CastNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1342.0938"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1001" y="1348.3042">void visit(const CallIntrinsicNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1354.8984"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="215" x="1001" y="1361.1089">void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1005" y="1373.9136"/><ellipse cx="992" cy="1383.5078" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="208" x="1001" y="1386.7183">Visitor(LowererImplImperative* impl)</text><ellipse cx="992" cy="1396.3125" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="157" x="1001" y="1399.5229">Stmt lower(IndexStmt stmt)</text><ellipse cx="992" cy="1409.1172" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="152" x="1001" y="1412.3276">Expr lower(IndexExpr expr)</text><path d="M975.5,1587 L975.5,1702.9297 A0,0 0 0 0 975.5,1702.9297 L1239.5,1702.9297 A0,0 0 0 0 1239.5,1702.9297 L1239.5,1597 L1229.5,1587 L1111.5,1587 L1107.5,1419.0758 L1103.5,1587 L975.5,1587 A0,0 0 0 0 975.5,1587 " fill="#FBFB77" filter="url(#fujoep6dbpit)" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1229.5,1587 L1229.5,1597 L1239.5,1597 L1229.5,1587 " fill="#FBFB77" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="191" x="981.5" y="1604.0669">Stmt lower(IndexStmt stmt) {</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="134" x="989.5" y="1619.1997">this-&gt;stmt = Stmt();</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="219" x="989.5" y="1634.3325">impl-&gt;accessibleIterators.scope();</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="217" x="989.5" y="1649.4653">IndexStmtVisitorStrict::visit(stmt);</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="235" x="989.5" y="1664.5981">impl-&gt;accessibleIterators.unscope();</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="116" x="989.5" y="1679.731">return this-&gt;stmt;</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="981.5" y="1694.8638">}</text><!--MD5=[ae51cb2269b8d9d23f4eb16ba4c021c2]
+reverse link Uncopyable to IRNode--><path codeLine="26" d="M820.4411,310.2452 C805.2002,341.1699 780.565,381.9466 747.5,407 C656.8632,475.6756 606.7888,436.4312 503.5,484 C464.9666,501.7462 424.4666,526.9962 393.5999,547.8097 " fill="none" id="Uncopyable-backto-IRNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="814.1208,307.2354,828.901,292.0517,826.8155,313.1384,814.1208,307.2354" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[910d35d02fd37b100f27db676215561a]
+reverse link IRNode to BaseStmtNode--><path codeLine="27" d="M307.5211,639.0848 C270.9373,697.3157 214.3898,787.3234 187.5844,829.9899 " fill="none" id="IRNode-backto-BaseStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="301.6118,635.3322,318.1787,622.1209,313.4664,642.7799,301.6118,635.3322" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d15399140a2ba01317e3af6505b2f237]
+reverse link IRNode to BaseExprNode--><path codeLine="28" d="M348.9536,642.2864 C356.1642,697.705 366.8158,779.5704 372.5311,823.496 " fill="none" id="IRNode-backto-BaseExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="341.9689,642.8569,346.3299,622.1209,355.8519,641.0505,341.9689,642.8569" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[6fc58d354eb039aa71a812145e71cc51]
+reverse link BaseStmtNode to StmtNode--><path codeLine="29" d="M167.1367,898.6343 C157.8757,975.7069 139.117,1131.8206 131.1797,1197.8764 " fill="none" id="BaseStmtNode-backto-StmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="160.246,897.3055,169.5821,878.2834,174.146,898.9757,160.246,897.3055" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[b34aab7e28a3dad0196efbcf2402ad4b]
+reverse link BaseExprNode to ExprNode--><path codeLine="30" d="M377.9886,904.6786 C380.3078,983.6383 384.7133,1133.6261 386.6028,1197.9549 " fill="none" id="BaseExprNode-backto-ExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="370.9905,904.8438,377.4002,884.6469,384.9845,904.4327,370.9905,904.8438" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[6982bf9bb66925c6ba3afaae707aa75e]
+reverse link IntrusivePtr to IRHandle--><path codeLine="38" d="M613.7823,69.7043 C517.4991,114.1798 345.0472,193.8394 250.6748,237.4323 " fill="none" id="IntrusivePtr-backto-IRHandle" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="610.8857,63.3316,631.9777,61.2994,616.7566,76.0412,610.8857,63.3316" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[59037bcbfab734c29d5f29f29345990a]
+reverse link IRHandle to Expr--><path codeLine="39" d="M161.5974,316.9128 C141.1096,360.6844 110.2697,426.612 83.5,484 C71.2362,510.2908 57.2078,540.4477 47.6932,560.9138 " fill="none" id="IRHandle-backto-Expr" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="155.2791,313.8992,170.0982,298.7534,167.9586,319.8347,155.2791,313.8992" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[2e3a655b6bb23e034db67107221f412a]
+reverse link IRHandle to Stmt--><path codeLine="40" d="M176.0078,318.793 C164.419,388.1073 144.1107,509.5737 135.5197,560.9576 " fill="none" id="IRHandle-backto-Stmt" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="169.1677,317.2552,179.37,298.6833,182.976,319.5639,169.1677,317.2552" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[2a0bead0c725d8d45864c262e97ce783]
+reverse link IRHandle to IRNode--><path codeLine="42" d="M205.5204,310.4425 C236.4803,372.9538 293.8347,488.7585 323.1741,547.998 " fill="none" id="IRHandle-backto-IRNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="199.6965,298.6833,198.7749,305.8353,205.0223,309.4367,205.9439,302.2848,199.6965,298.6833" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="273.5" y="450.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="193.9008" y="318.2998">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="311.0578" y="536.9081">1</text><!--MD5=[58cf07e15c029e621d8edfba03fa64a2]
+reverse link IRVisitorStrict to IRVisitor--><path codeLine="94" d="M2743.085,79.3089 C2719.9753,94.626 2693.8639,112.3013 2670.5,129 C2620.0401,165.0649 2563.4947,208.7917 2527.0182,237.4501 " fill="none" id="IRVisitorStrict-backto-IRVisitor" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2739.5252,73.2715,2760.0768,68.1107,2747.2291,84.9613,2739.5252,73.2715" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[75bfe22e38cc091da0fbb6f74406d06e]
+reverse link IRVisitorStrict to IRPrinter--><path codeLine="95" d="M2880.6802,78.0481 C2916.73,98.0228 2962.2732,123.2577 3008.2876,148.7537 " fill="none" id="IRVisitorStrict-backto-IRPrinter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2876.843,83.9248,2862.7416,68.1086,2883.6283,71.679,2876.843,83.9248" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[85c48ab67ed60544567d89d58f94870d]
+reverse link IRVisitorStrict to IRRewriter--><path codeLine="96" d="M2807.5,88.3035 C2807.5,120.7899 2807.5,163.3447 2807.5,198.8875 " fill="none" id="IRVisitorStrict-backto-IRRewriter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2800.5001,88.1087,2807.5,68.1086,2814.5001,88.1086,2800.5001,88.1087" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[4028845cddf1255230d8af57eef922a1]
+reverse link IRVisitor to IRVerifier--><path codeLine="97" d="M2464.2278,317.0083 C2430.0567,386.0033 2369.0761,509.1298 2343.4074,560.9576 " fill="none" id="IRVisitor-backto-IRVerifier" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2458.1543,313.4989,2473.3035,298.6833,2470.6999,319.7124,2458.1543,313.4989" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[4a94ddfc410010d5e6723affae8cc10d]
+reverse link IRRewriter to ExpressionSimplifier--><path codeLine="102" d="M2720.1218,351.0244 C2678.6358,390.8516 2628.5995,439.4826 2584.5,484 C2558.962,509.78 2530.3923,540.0247 2511.1115,560.6511 " fill="none" id="IRRewriter-backto-ExpressionSimplifier" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2715.4205,345.8343,2734.7035,337.0499,2725.1073,355.9421,2715.4205,345.8343" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[949c6b7bfd4235ac14da91ff0f1abad4]
+reverse link IRRewriter to RemoveRedundantStatements--><path codeLine="109" d="M2781.238,356.5644 C2760.6434,426.0163 2733.4041,517.8765 2720.6551,560.8705 " fill="none" id="IRRewriter-backto-RemoveRedundantStatements" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2774.5903,354.3599,2786.9875,337.1753,2788.0127,358.3401,2774.5903,354.3599" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[429f4895fdac785b3cbc97ff72ee188d]
+reverse link IRRewriter to RemoveRedundantLoops--><path codeLine="110" d="M2847.8795,355.6733 C2879.9145,425.2286 2922.5003,517.6924 2942.3867,560.8705 " fill="none" id="IRRewriter-backto-RemoveRedundantLoops" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2841.3685,358.2695,2839.3599,337.1753,2854.0847,352.4128,2841.3685,358.2695" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[68110a92c5ea6b91d77066a3850c99de]
+reverse link IRRewriter to RemoveDuplicateBody--><path codeLine="111" d="M2908.6453,349.9792 C2956.6158,389.6232 3014.286,438.4014 3064.5,484 C3092.4965,509.4232 3123.3012,539.9945 3143.8609,560.7956 " fill="none" id="IRRewriter-backto-RemoveDuplicateBody" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2904.088,355.2943,2893.1039,337.1739,2912.9906,344.4895,2904.088,355.2943" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ae50f65b4cbe15dbcd3dbdd752b04bad]
+reverse link IRPrinter to CodeGen--><path codeLine="120" d="M3277.8266,425.9959 C3295.9202,478.6165 3314.0392,531.3112 3324.2066,560.8807 " fill="none" id="IRPrinter-backto-CodeGen" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3271.1927,428.2301,3271.309,407.0408,3284.4319,423.6778,3271.1927,428.2301" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[319cfb8dc735ef16cddbcdc70ff637f3]
+reverse link CodeGen to CodeGen_C--><path codeLine="121" d="M3308.9871,626.8873 C3276.9808,683.9051 3220.7457,784.0854 3195.0269,829.9024 " fill="none" id="CodeGen-backto-CodeGen_C" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3302.9395,623.3601,3318.8334,609.3464,3315.1476,630.213,3302.9395,623.3601" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[aa96aa54faae34d6ca9ace804ab720b5]
+reverse link CodeGen to CodeGen_ISPC--><path codeLine="122" d="M3357.3479,626.516 C3391.4351,683.4693 3451.5956,783.9858 3479.0773,829.9024 " fill="none" id="CodeGen-backto-CodeGen_ISPC" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3351.3364,630.1024,3347.0716,609.3464,3363.3492,622.9126,3351.3364,630.1024" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[442ed43531516b32839cb3faf9b2f28c]
+reverse link CodeGen to CodeGen_CUDA--><path codeLine="123" d="M3332.5,629.5199 C3332.5,686.9415 3332.5,784.7827 3332.5,829.9024 " fill="none" id="CodeGen-backto-CodeGen_CUDA" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3325.5001,629.3464,3332.5,609.3464,3339.5001,629.3464,3325.5001,629.3464" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[e163d187ef5eaf663efea2335f5ab426]
+reverse link Manageable to IndexStmtNode--><path codeLine="135" d="M1257.9638,310.6351 C1244.4457,339.9959 1223.6542,378.6672 1197.5,407 C1172.0822,434.5351 1162.6727,441.0723 1127.5,454 C1016.6611,494.7386 975.8786,447.724 863.5,484 C813.372,500.1814 761.462,531.0819 726.345,554.4621 " fill="none" id="Manageable-backto-IndexStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1251.6817,307.531,1266.1369,292.0374,1264.4987,313.1636,1251.6817,307.531" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d7ac6dc268d898e1c32d87af860f66f6]
+reverse link Uncopyable to IndexStmtNode--><path codeLine="136" d="M817.879,310.1732 C786.2624,374.8344 726.5439,496.9683 698.6147,554.088 " fill="none" id="Uncopyable-backto-IndexStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="811.6488,306.9791,826.7226,292.0867,824.2258,313.1288,811.6488,306.9791" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[648f85c83359671aa27d8bdab5afe684]
+reverse link Manageable to IndexExprNode--><path codeLine="137" d="M1297.8505,310.1732 C1332.1189,374.8344 1396.846,496.9683 1427.1176,554.088 " fill="none" id="Manageable-backto-IndexExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1291.4456,313.0363,1288.2652,292.0867,1303.8158,306.4805,1291.4456,313.0363" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[81b6c5025c45f10b02aba636448b0629]
+reverse link Uncopyable to IndexExprNode--><path codeLine="138" d="M854.8121,310.9496 C869.2284,342.7749 893.4553,384.3933 928.5,407 C1003.6621,455.4857 1040.645,415.6362 1127.5,437 C1189.6011,452.2751 1205.1592,457.804 1263.5,484 C1309.4283,504.6226 1359.1614,533.0311 1394.6887,554.4675 " fill="none" id="Uncopyable-backto-IndexExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="848.2286,313.3522,846.9174,292.2032,861.1311,307.9185,848.2286,313.3522" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[4a1ee9a433488db7e06ed4f91810d452]
+reverse link IntrusivePtr to IndexStmt--><path codeLine="143" d="M683.5,88.2338 C683.5,136.5801 683.5,207.2721 683.5,243.9383 " fill="none" id="IntrusivePtr-backto-IndexStmt" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="676.5001,88.1087,683.5,68.1086,690.5001,88.1086,676.5001,88.1087" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[be16824624a74e7da7bb67b6f377f820]
+reverse link IndexStmt to IndexStmtNode--><path codeLine="144" d="M683.5,305.2739 C683.5,368.4736 683.5,495.4911 683.5,554.088 " fill="none" id="IndexStmt-backto-IndexStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="683.5,292.0867,679.5,298.0867,683.5,304.0867,687.5,298.0867,683.5,292.0867" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="674.475" y="312.2647">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="674.675" y="543.2663">1</text><!--MD5=[8120730d6b32269f0970ddfe15f91d14]
+reverse link IntrusivePtr to IndexExpr--><path codeLine="145" d="M755.3761,39.2555 C906.9425,44.2375 1253.1256,62.7483 1347.5,129 C1386.7641,156.5638 1406.5439,211.9266 1415.072,243.8366 " fill="none" id="IntrusivePtr-backto-IndexExpr" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="755.1028,46.2505,735.3276,38.6387,755.5334,32.2571,755.1028,46.2505" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[1972b08ae664b2d2310d03537cd7a5e1]
+reverse link IndexExpr to IndexExprNode--><path codeLine="146" d="M1423.2044,305.2739 C1427.7899,368.4736 1437.0057,495.4911 1441.2572,554.088 " fill="none" id="IndexExpr-backto-IndexExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="1422.2476,292.0867,1418.6923,298.3605,1423.1161,304.0552,1426.6714,297.7815,1422.2476,292.0867" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1413.6704" y="312.2647">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1431.9696" y="543.2663">1</text><!--MD5=[b26c9c8d3a5b29d8271f45d68507eadd]
+reverse link IndexExprVisitorStrict to IndexNotationVisitorStrict--><path codeLine="295" d="M1632.9995,699.6227 C1626.9972,705.0539 1621.1388,710.5255 1615.5,716 C1577.5728,752.8222 1539.4548,801.3814 1518.0255,829.9928 " fill="none" id="IndexExprVisitorStrict-backto-IndexNotationVisitorStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1628.5467,694.2161,1648.1726,686.2269,1637.8124,704.7112,1628.5467,694.2161" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[7234130c38761532599f2f7b36911e2f]
+reverse link IndexStmtVisitorStrict to IndexNotationVisitorStrict--><path codeLine="296" d="M1966.5197,675.467 C1958.1897,679.3054 1949.8174,682.8546 1941.5,686 C1803.2176,738.2949 1746.3714,680.1665 1614.5,747 C1575.0345,767.0014 1539.609,804.9113 1519.0786,829.8154 " fill="none" id="IndexStmtVisitorStrict-backto-IndexNotationVisitorStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1963.6036,669.1012,1984.6506,666.6464,1969.7282,681.6905,1963.6036,669.1012" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d405f4886b031ffa84d6c62850f61924]
+reverse link IndexNotationVisitorStrict to IndexNotationVisitor--><path codeLine="297" d="M1526.3846,895.2501 C1552.042,936.1383 1593.0382,1001.4705 1631.9613,1063.499 " fill="none" id="IndexNotationVisitorStrict-backto-IndexNotationVisitor" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1520.4391,898.945,1515.7379,878.2834,1532.2978,891.5037,1520.4391,898.945" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[9298265fb9c2b46a51654cec95663d88]
+reverse link IndexNotationVisitorStrict to IndexNotationPrinter--><path codeLine="298" d="M1491.8998,898.1203 C1484.7524,934.7873 1474.0689,989.595 1463.4478,1044.0828 " fill="none" id="IndexNotationVisitorStrict-backto-IndexNotationPrinter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1485.0692,896.5747,1495.7665,878.2834,1498.8106,899.2533,1485.0692,896.5747" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[2164c356666f1a365c7584220eeab5ce]
+reverse link IndexNotationVisitor to Matcher--><path codeLine="299" d="M1735.5,1413.8401 C1735.5,1494.6098 1735.5,1580.0329 1735.5,1620.7139 " fill="none" id="IndexNotationVisitor-backto-Matcher" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1728.5001,1413.7101,1735.5,1393.71,1742.5001,1413.71,1728.5001,1413.7101" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[81658007f5a451634c394e4129ce2328]
+reverse link IndexExprVisitorStrict to IndexExprRewriterStrict--><path codeLine="301" d="M1782.5,706.5527 C1782.5,720.0234 1782.5,733.669 1782.5,746.9421 " fill="none" id="IndexExprVisitorStrict-backto-IndexExprRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1775.5001,706.3141,1782.5,686.3141,1789.5001,706.314,1775.5001,706.3141" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ad82d38a65963623a4dbc072e2395c0a]
+reverse link IndexStmtVisitorStrict to IndexStmtRewriterStrict--><path codeLine="302" d="M2113.9155,687.085 C2114.78,712.9253 2115.7014,740.463 2116.5412,765.5657 " fill="none" id="IndexStmtVisitorStrict-backto-IndexStmtRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2106.9097,687.0274,2113.237,666.8045,2120.9019,686.5592,2106.9097,687.0274" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[17cef803f955afc58233a06ff8ed6ced]
+reverse link IndexExprRewriterStrict to IndexNotationRewriterStrict--><path codeLine="303" d="M1876.8558,977.1228 C1939.4095,1058.7478 2016.1337,1158.8635 2050.856,1204.1719 " fill="none" id="IndexExprRewriterStrict-backto-IndexNotationRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1871.16,981.1985,1864.5506,961.066,1882.2722,972.6826,1871.16,981.1985" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[f1a4c69017cc3acf68d02aa5998e72c7]
+reverse link IndexStmtRewriterStrict to IndexNotationRewriterStrict--><path codeLine="304" d="M2105.0534,962.205 C2093.838,1046.2084 2079.1518,1156.2078 2072.7352,1204.2683 " fill="none" id="IndexStmtRewriterStrict-backto-IndexNotationRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2098.1288,961.1748,2107.714,942.277,2112.0056,963.0275,2098.1288,961.1748" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ea60607216d1741e9a004dc3b2ad9bc4]
+reverse link IndexNotationRewriterStrict to IndexNotationRewriter--><path codeLine="306" d="M2069.5,1272.8869 C2069.5,1322.2639 2069.5,1404.9692 2069.5,1479.8852 " fill="none" id="IndexNotationRewriterStrict-backto-IndexNotationRewriter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2062.5001,1272.6931,2069.5,1252.6931,2076.5001,1272.693,2062.5001,1272.6931" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ce0e28a833df6d388c2232cca949e33a]
+reverse link Uncopyable to LowererImpl--><path codeLine="357" d="M864.1964,309.4349 C881.7851,337.3534 905.9227,374.8161 928.5,407 C947.767,434.4652 969.4267,463.6921 989.6625,490.3547 " fill="none" id="Uncopyable-backto-LowererImpl" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="858.0963,312.8828,853.4161,292.2165,869.9625,305.4535,858.0963,312.8828" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[595f18298affe1361dad6c88d07b3ae8]
+reverse link Lowerer to LowererImpl--><path codeLine="358" d="M1063.5,311.7072 C1063.5,357.6007 1063.5,431.1895 1063.5,490.4492 " fill="none" id="Lowerer-backto-LowererImpl" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="1063.5,298.6833,1059.5,304.6833,1063.5,310.6833,1067.5,304.6833,1063.5,298.6833" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="1064.5" y="450.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1054.7125" y="318.2998">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1055.1813" y="479.3951">1</text><!--MD5=[76c844881f8770258bad5028aba6ca47]
+reverse link IndexNotationVisitorStrict to Visitor--><path codeLine="396" d="M1453.9953,890.688 C1409.0705,926.7585 1339.7531,984.0918 1283.5,1038 C1267.164,1053.655 1250.4599,1070.5041 1234.1618,1087.4753 " fill="none" id="IndexNotationVisitorStrict-backto-Visitor" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1449.6605,885.1915,1469.657,878.1815,1458.3965,896.1315,1449.6605,885.1915" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[f3857c0b64c12f6416059a5dcd8ca3ae]
+reverse link LowererImpl to Visitor--><path codeLine="397" d="M1070.9927,694.5811 C1077.4133,788.4824 1086.8084,925.8861 1094.4598,1037.7864 " fill="none" id="LowererImpl-backto-Visitor" style="stroke:#A80036;stroke-width:1.0;"/><ellipse cx="1070.5145" cy="687.5863" fill="#FFFFFF" rx="8" ry="8" style="stroke:#A80036;stroke-width:1.0;"/><line style="stroke:#A80036;stroke-width:1.0;" x1="1071.0602" x2="1069.9687" y1="695.5676" y2="679.6049"/><line style="stroke:#A80036;stroke-width:1.0;" x1="1062.5331" x2="1078.4958" y1="688.132" y2="687.0405"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="1090.5" y="858.5669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1062.1577" y="699.5">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1085.7942" y="1027.1147">1</text><!--MD5=[44db9126e684c102525c4f7b853b119b]
+reverse link Visitor to LowererImpl--><path codeLine="398" d="M1157.8948,1024.7712 C1171.4272,938.063 1176.6901,836.4934 1153.5,747 C1147.5682,724.1083 1137.6236,701.0075 1126.4596,679.7046 " fill="none" id="Visitor-backto-LowererImpl" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="1155.7945,1037.728,1160.703,1032.4454,1157.7146,1025.8826,1152.8061,1031.1653,1155.7945,1037.728" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="1170.5" y="858.5669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1149.5248" y="1027.054">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1122.7894" y="699.2875">1</text><!--MD5=[7cec337d4232ea69c4a4e115b7f1c391]
+reverse link LowererImpl to LowererImplImperative--><path codeLine="400" d="M979.2866,696.0461 C963.4034,716.9902 946.9263,738.7172 931.4609,759.1104 " fill="none" id="LowererImpl-backto-LowererImplImperative" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="973.9145,691.5453,991.5772,679.8393,985.0696,700.005,973.9145,691.5453" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d416585c3fdacb879af8752baa2327bb]
+reverse link LowererImplImperative to Visitor--><path codeLine="401" d="M879.1422,963.0891 C885.3942,979.2794 893.6358,994.8993 904.5,1008 C923.2784,1030.6441 940.9429,1019.1172 963.5,1038 C969.3812,1042.9232 975.1644,1048.1297 980.833,1053.5529 " fill="none" id="LowererImplImperative-backto-Visitor" style="stroke:#A80036;stroke-width:1.0;"/><ellipse cx="876.7795" cy="956.3169" fill="#FFFFFF" rx="8" ry="8" style="stroke:#A80036;stroke-width:1.0;"/><line style="stroke:#A80036;stroke-width:1.0;" x1="879.4147" x2="874.1442" y1="963.8704" y2="948.7634"/><line style="stroke:#A80036;stroke-width:1.0;" x1="869.226" x2="884.333" y1="958.9522" y2="953.6816"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="905.5" y="1004.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="866.9259" y="968.3506">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="964.8705" y="1039.5084">1</text><!--MD5=[cd8dd7ca9f18b6220f591f64794d3d39]
+reverse link Visitor to LowererImplImperative--><path codeLine="402" d="M988.0443,1026.5409 C980.2164,1014.4253 972.331,1002.497 964.5,991 C955.0529,977.1302 944.7145,962.7728 934.3268,948.8048 " fill="none" id="Visitor-backto-LowererImplImperative" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="995.2137,1037.7132,995.3397,1030.5032,988.7328,1027.6138,988.6068,1034.8238,995.2137,1037.7132" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="974.5" y="1004.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="982.6713" y="1027.0386">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="931.5281" y="968.395">1</text><!--MD5=[59ff6f047f3ce21caa7eb37a22acd23c]
+@startuml taco
+scale 1
+
+
+class IntrusivePtr {
+    +T *ptr
+}
+class Uncopyable {}
+
+class IRNode {
+    +virtual void accept(IRVisitorStrict *v) const = 0
+    +virtual IRNodeType type_info() const = 0;
+}
+
+class BaseStmtNode {}
+class BaseExprNode {
+    +Datatype type
+}
+
+class StmtNode {
+    +void accept(IRVisitorStrict *v) const
+}
+class ExprNode {
+    +void accept(IRVisitorStrict *v) const
+}
+
+Uncopyable <|- - IRNode
+IRNode <|- - BaseStmtNode
+IRNode <|- - BaseExprNode
+BaseStmtNode <|- - StmtNode
+BaseExprNode <|- - ExprNode
+
+class IRHandle {
+    +void accept(IRVisitorStrict *v) const
+}
+class Expr {}
+class Stmt {}
+
+IntrusivePtr <|- - IRHandle
+IRHandle <|- - Expr
+IRHandle <|- - Stmt
+
+IRHandle "1" *- - "1" IRNode : contains
+
+
+
+' this class is abstract but plantuml version does not support interface keyword
+interface IRVisitorStrict {
+    +virtual void visit(const IRNode*) const = 0
+}
+
+/' 
+IRVisitor is not an interface or abstract because it 
+has no pure virtual methods
+'/
+class IRVisitor {
+    +virtual void visit(const IRNode*)
+}
+
+class IRRewriter {
+    ' protected fields and methods
+    #Expr expr 
+    #Stmt stmt
+
+    #virtual void visit(const ExprNode* op)
+    #virtual void visit(const StmtNode* op)
+
+    ' public fields and methods
+    +Expr rewrite(Expr)
+    +Stmt rewrite(Stmt)
+}
+class IRPrinter {
+    #std::ostream &stream
+    #std::ostream &stream2
+    #int indent
+    #bool color
+    #bool simplify
+    #enum Precedence
+    #Precedence parentPrecedence = BOTTOM
+    #NameGenerator varNameGenerator
+    #scopedMap<Expr, std::String> varNames
+
+    #void doIndent()
+    #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)
+    #void fewMoreMethods()
+    
+    #virtual void visit(const ExprNode*)
+    #virtual void visit(const StmtNode*)
+
+    +setColor(bool color)
+    +print(Stmt)
+}
+class IRVerifier {}
+
+IRVisitorStrict <|- - IRVisitor
+IRVisitorStrict <|- - IRPrinter
+IRVisitorStrict <|- - IRRewriter
+IRVisitor <|- - IRVerifier
+
+' Inheritance from IRRewriter
+' simplifier for ir::Expr
+class ExpressionSimplifier {}
+IRRewriter <|- - ExpressionSimplifier
+
+' simplifiers for ir::Stmt
+class RemoveRedundantStatements {}
+class RemoveRedundantLoops {}
+class RemoveDuplicateBody {}
+
+IRRewriter <|- - RemoveRedundantStatements
+IRRewriter <|- - RemoveRedundantLoops
+IRRewriter <|- - RemoveDuplicateBody
+
+
+' Inheritance from IRPrinter
+class CodeGen {}
+class CodeGen_C {}
+class CodeGen_CUDA {}
+class CodeGen_ISPC {}
+
+IRPrinter <|- - CodeGen
+CodeGen <|- - CodeGen_C
+CodeGen <|- - CodeGen_ISPC
+CodeGen <|- - CodeGen_CUDA
+
+
+class Manageable {}
+class IndexStmtNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+class IndexExprNode {
+    -virtual void accept(IndexExprVisitorStrict*) const = 0
+}
+
+
+Manageable <|- - IndexStmtNode
+Uncopyable <|- - IndexStmtNode
+Manageable <|- - IndexExprNode
+Uncopyable <|- - IndexExprNode
+
+class IndexStmt {}
+class IndexExpr {}
+
+IntrusivePtr <|- - IndexStmt
+IndexStmt "1" *- - "1" IndexStmtNode
+IntrusivePtr <|- - IndexExpr
+IndexExpr "1" *- - "1" IndexExprNode
+
+
+abstract class IndexExprVisitorStrict {
+    +void visit(const IndexExpr&)
+    +virtual void visit(const AccessNode*) = 0
+    +virtual void visit(const LiteralNode*) = 0
+    +virtual void visit(const NegNode*) = 0
+    +virtual void visit(const AddNode*) = 0
+    +virtual void visit(const SubNode*) = 0
+    +virtual void visit(const MulNode*) = 0
+    +virtual void visit(const DivNode*) = 0
+    +virtual void visit(const SqrtNode*) = 0
+    +virtual void visit(const CastNode*) = 0
+    +virtual void visit(const CallIntrinsicNode*) = 0
+    +virtual void visit(const ReductionNode*) = 0
+}
+abstract class IndexStmtVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AssignmentNode*) = 0
+    +virtual void visit(const YieldNode*) = 0
+    +virtual void visit(const ForallNode*) = 0
+    +virtual void visit(const WhereNode*) = 0
+    +virtual void visit(const SequenceNode*) = 0
+    +virtual void visit(const AssembleNode*) = 0
+    +virtual void visit(const MultiNode*) = 0
+    +virtual void visit(const SuchThatNode*) = 0
+}
+
+abstract class IndexNotationVisitorStrict {}
+class IndexNotationPrinter {
+    +void print(const IndexExpr& expr)
+    +void print(const IndexStmt& expr)
+
+    ' Index Expressions visit()
+    +void visit(const AccessNode* node)
+    +void visit(const LiteralNode* node)
+    + void visit(const NegNode* node)
+    + void visit(const AddNode* node)
+    + void visit(const SubNode* node)
+    + void visit(const MulNode* node)
+    + void visit(const DivNode* node)
+    + void visit(const SqrtNode* node)
+    + void visit(const CastNode* node)
+    + void visit(const CallIntrinsicNode* node)
+    + void visit(const UnaryExprNode* node)
+    + void visit(const BinaryExprNode* node)
+    + void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    + void visit(const AssignmentNode* node)
+    + void visit(const YieldNode* node)
+    + void visit(const ForallNode* node)
+    + void visit(const WhereNode* node)
+    + void visit(const SequenceNode* node)
+    + void visit(const AssembleNode* node)
+    + void visit(const MultiNode* node)
+    + void visit(const SuchThatNode* node)
+}
+class IndexNotationVisitor {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+class Matcher {
+
+}
+
+abstract class IndexExprRewriterStrict {
+    +IndexExpr rewrite(IndexExpr)
+
+    #IndexExpr expr
+
+    #virtual void visit(const AccessNode* op) = 0
+    #virtual void visit(const LiteralNode* op) = 0
+    #virtual void visit(const NegNode* op) = 0
+    #virtual void visit(const SqrtNode* op) = 0
+    #virtual void visit(const AddNode* op) = 0
+    #virtual void visit(const SubNode* op) = 0
+    #virtual void visit(const MulNode* op) = 0
+    #virtual void visit(const DivNode* op) = 0
+    #virtual void visit(const CastNode* op) = 0
+    #virtual void visit(const CallIntrinsicNode* op) = 0
+    #virtual void visit(const ReductionNode* op) = 0
+}
+abstract class IndexStmtRewriterStrict {
+    +IndexStmt rewrite(IndexStmt)
+
+    #IndexStmt stmt
+
+    #virtual void visit(const AssignmentNode* op) = 0
+    #virtual void visit(const YieldNode* op) = 0
+    #virtual void visit(const ForallNode* op) = 0
+    #virtual void visit(const WhereNode* op) = 0
+    #virtual void visit(const SequenceNode* op) = 0
+    #virtual void visit(const AssembleNode* op) = 0
+    #virtual void visit(const MultiNode* op) = 0
+    #virtual void visit(const SuchThatNode* op) = 0
+}
+abstract class IndexNotationRewriterStrict {}
+class IndexNotationRewriter {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+
+
+IndexExprVisitorStrict <|- - IndexNotationVisitorStrict
+IndexStmtVisitorStrict <|- - IndexNotationVisitorStrict
+IndexNotationVisitorStrict <|- - IndexNotationVisitor
+IndexNotationVisitorStrict <|- - IndexNotationPrinter
+IndexNotationVisitor <|- - Matcher
+
+IndexExprVisitorStrict <|- - IndexExprRewriterStrict
+IndexStmtVisitorStrict <|- - IndexStmtRewriterStrict
+IndexExprRewriterStrict <|- - IndexNotationRewriterStrict
+IndexStmtRewriterStrict <|- - IndexNotationRewriterStrict
+
+IndexNotationRewriterStrict <|- - IndexNotationRewriter
+
+' - private
+' # protected
+' ~ package private
+' + public
+
+' {static}
+' {abstract} virtual methods
+
+' lowering part - - conversion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt
+class Lowerer {
+    +std::shared_ptr<LowererImpl> impl;
+}
+abstract class LowererImpl {
+    ' protected fields and methods
+    #class Visitor;
+    #friend class Visitor;
+    #std::shared_ptr<Visitor> visitor;
+
+    #virtual ir::Stmt lower(IndexStmt stmt);
+    #virtual ir::Expr lower(IndexExpr expr);
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr) = 0;
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;
+
+    ' public fields and methods
+    +virtual ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack) = 0;
+}
+
+class LowererImplImperative {
+    ' private fields and methods
+    -class Visitor
+    -friend class Visitor
+    -std::shared_ptr<Visitor> visitor
+    -bool assemble
+    -bool compute
+    -vars a_bunch_of_other_fields
+
+    ' protected fields and methods
+    #virtual ir::Expr lowerExpr(IndexExpr expr);
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt);
+
+    ' public fields and methods
+    +ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack)
+
+}
+note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n  return visitor->lower(stmt);\n}
+
+Uncopyable <|- - LowererImpl
+Lowerer "1" *- - "1" LowererImpl : contains
+
+
+' visitor that does the lowering
+class Visitor {
+    ' private fields and methods
+    -LowererImpl* impl
+    -Expr expr
+    -Stmt stmt
+
+    -void visit(const AssignmentNode* node)
+    -void visit(const YieldNode* node)
+    -void visit(const ForallNode* node) 
+    -void visit(const WhereNode* node) 
+    -void visit(const MultiNode* node) 
+    -void visit(const SuchThatNode* node) 
+    -void visit(const SequenceNode* node) 
+    -void visit(const AssembleNode* node) 
+    -void visit(const AccessNode* node) 
+    -void visit(const LiteralNode* node) 
+    -void visit(const NegNode* node) 
+    -void visit(const AddNode* node) 
+    -void visit(const SubNode* node) 
+    -void visit(const MulNode* node) 
+    -void visit(const DivNode* node) 
+    -void visit(const SqrtNode* node) 
+    -void visit(const CastNode* node) 
+    -void visit(const CallIntrinsicNode* node) 
+    -void visit(const ReductionNode* node) 
+
+    ' public fields and methods
+    +Visitor(LowererImplImperative* impl)
+    +Stmt lower(IndexStmt stmt)
+    +Expr lower(IndexExpr expr)
+}
+
+note bottom of Visitor:   Stmt lower(IndexStmt stmt) {\n  this->stmt = Stmt();\n  impl->accessibleIterators.scope();\n  IndexStmtVisitorStrict::visit(stmt);\n  impl->accessibleIterators.unscope();\n  return this->stmt;\n}
+
+IndexNotationVisitorStrict <|- - Visitor
+LowererImpl "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImpl : contains
+
+LowererImpl <|- - LowererImplImperative
+LowererImplImperative "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImplImperative : contains
+
+@enduml
+
+@startuml taco
+scale 1
+
+
+class IntrusivePtr {
+    +T *ptr
+}
+class Uncopyable {}
+
+class IRNode {
+    +virtual void accept(IRVisitorStrict *v) const = 0
+    +virtual IRNodeType type_info() const = 0;
+}
+
+class BaseStmtNode {}
+class BaseExprNode {
+    +Datatype type
+}
+
+class StmtNode {
+    +void accept(IRVisitorStrict *v) const
+}
+class ExprNode {
+    +void accept(IRVisitorStrict *v) const
+}
+
+Uncopyable <|- - IRNode
+IRNode <|- - BaseStmtNode
+IRNode <|- - BaseExprNode
+BaseStmtNode <|- - StmtNode
+BaseExprNode <|- - ExprNode
+
+class IRHandle {
+    +void accept(IRVisitorStrict *v) const
+}
+class Expr {}
+class Stmt {}
+
+IntrusivePtr <|- - IRHandle
+IRHandle <|- - Expr
+IRHandle <|- - Stmt
+
+IRHandle "1" *- - "1" IRNode : contains
+
+
+
+interface IRVisitorStrict {
+    +virtual void visit(const IRNode*) const = 0
+}
+
+class IRVisitor {
+    +virtual void visit(const IRNode*)
+}
+
+class IRRewriter {
+    #Expr expr 
+    #Stmt stmt
+
+    #virtual void visit(const ExprNode* op)
+    #virtual void visit(const StmtNode* op)
+
+    +Expr rewrite(Expr)
+    +Stmt rewrite(Stmt)
+}
+class IRPrinter {
+    #std::ostream &stream
+    #std::ostream &stream2
+    #int indent
+    #bool color
+    #bool simplify
+    #enum Precedence
+    #Precedence parentPrecedence = BOTTOM
+    #NameGenerator varNameGenerator
+    #scopedMap<Expr, std::String> varNames
+
+    #void doIndent()
+    #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)
+    #void fewMoreMethods()
+    
+    #virtual void visit(const ExprNode*)
+    #virtual void visit(const StmtNode*)
+
+    +setColor(bool color)
+    +print(Stmt)
+}
+class IRVerifier {}
+
+IRVisitorStrict <|- - IRVisitor
+IRVisitorStrict <|- - IRPrinter
+IRVisitorStrict <|- - IRRewriter
+IRVisitor <|- - IRVerifier
+
+class ExpressionSimplifier {}
+IRRewriter <|- - ExpressionSimplifier
+
+class RemoveRedundantStatements {}
+class RemoveRedundantLoops {}
+class RemoveDuplicateBody {}
+
+IRRewriter <|- - RemoveRedundantStatements
+IRRewriter <|- - RemoveRedundantLoops
+IRRewriter <|- - RemoveDuplicateBody
+
+
+class CodeGen {}
+class CodeGen_C {}
+class CodeGen_CUDA {}
+class CodeGen_ISPC {}
+
+IRPrinter <|- - CodeGen
+CodeGen <|- - CodeGen_C
+CodeGen <|- - CodeGen_ISPC
+CodeGen <|- - CodeGen_CUDA
+
+
+class Manageable {}
+class IndexStmtNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+class IndexExprNode {
+    -virtual void accept(IndexExprVisitorStrict*) const = 0
+}
+
+
+Manageable <|- - IndexStmtNode
+Uncopyable <|- - IndexStmtNode
+Manageable <|- - IndexExprNode
+Uncopyable <|- - IndexExprNode
+
+class IndexStmt {}
+class IndexExpr {}
+
+IntrusivePtr <|- - IndexStmt
+IndexStmt "1" *- - "1" IndexStmtNode
+IntrusivePtr <|- - IndexExpr
+IndexExpr "1" *- - "1" IndexExprNode
+
+
+abstract class IndexExprVisitorStrict {
+    +void visit(const IndexExpr&)
+    +virtual void visit(const AccessNode*) = 0
+    +virtual void visit(const LiteralNode*) = 0
+    +virtual void visit(const NegNode*) = 0
+    +virtual void visit(const AddNode*) = 0
+    +virtual void visit(const SubNode*) = 0
+    +virtual void visit(const MulNode*) = 0
+    +virtual void visit(const DivNode*) = 0
+    +virtual void visit(const SqrtNode*) = 0
+    +virtual void visit(const CastNode*) = 0
+    +virtual void visit(const CallIntrinsicNode*) = 0
+    +virtual void visit(const ReductionNode*) = 0
+}
+abstract class IndexStmtVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AssignmentNode*) = 0
+    +virtual void visit(const YieldNode*) = 0
+    +virtual void visit(const ForallNode*) = 0
+    +virtual void visit(const WhereNode*) = 0
+    +virtual void visit(const SequenceNode*) = 0
+    +virtual void visit(const AssembleNode*) = 0
+    +virtual void visit(const MultiNode*) = 0
+    +virtual void visit(const SuchThatNode*) = 0
+}
+
+abstract class IndexNotationVisitorStrict {}
+class IndexNotationPrinter {
+    +void print(const IndexExpr& expr)
+    +void print(const IndexStmt& expr)
+
+    +void visit(const AccessNode* node)
+    +void visit(const LiteralNode* node)
+    + void visit(const NegNode* node)
+    + void visit(const AddNode* node)
+    + void visit(const SubNode* node)
+    + void visit(const MulNode* node)
+    + void visit(const DivNode* node)
+    + void visit(const SqrtNode* node)
+    + void visit(const CastNode* node)
+    + void visit(const CallIntrinsicNode* node)
+    + void visit(const UnaryExprNode* node)
+    + void visit(const BinaryExprNode* node)
+    + void visit(const ReductionNode* node)
+
+    + void visit(const AssignmentNode* node)
+    + void visit(const YieldNode* node)
+    + void visit(const ForallNode* node)
+    + void visit(const WhereNode* node)
+    + void visit(const SequenceNode* node)
+    + void visit(const AssembleNode* node)
+    + void visit(const MultiNode* node)
+    + void visit(const SuchThatNode* node)
+}
+class IndexNotationVisitor {
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+class Matcher {
+
+}
+
+abstract class IndexExprRewriterStrict {
+    +IndexExpr rewrite(IndexExpr)
+
+    #IndexExpr expr
+
+    #virtual void visit(const AccessNode* op) = 0
+    #virtual void visit(const LiteralNode* op) = 0
+    #virtual void visit(const NegNode* op) = 0
+    #virtual void visit(const SqrtNode* op) = 0
+    #virtual void visit(const AddNode* op) = 0
+    #virtual void visit(const SubNode* op) = 0
+    #virtual void visit(const MulNode* op) = 0
+    #virtual void visit(const DivNode* op) = 0
+    #virtual void visit(const CastNode* op) = 0
+    #virtual void visit(const CallIntrinsicNode* op) = 0
+    #virtual void visit(const ReductionNode* op) = 0
+}
+abstract class IndexStmtRewriterStrict {
+    +IndexStmt rewrite(IndexStmt)
+
+    #IndexStmt stmt
+
+    #virtual void visit(const AssignmentNode* op) = 0
+    #virtual void visit(const YieldNode* op) = 0
+    #virtual void visit(const ForallNode* op) = 0
+    #virtual void visit(const WhereNode* op) = 0
+    #virtual void visit(const SequenceNode* op) = 0
+    #virtual void visit(const AssembleNode* op) = 0
+    #virtual void visit(const MultiNode* op) = 0
+    #virtual void visit(const SuchThatNode* op) = 0
+}
+abstract class IndexNotationRewriterStrict {}
+class IndexNotationRewriter {
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+
+
+IndexExprVisitorStrict <|- - IndexNotationVisitorStrict
+IndexStmtVisitorStrict <|- - IndexNotationVisitorStrict
+IndexNotationVisitorStrict <|- - IndexNotationVisitor
+IndexNotationVisitorStrict <|- - IndexNotationPrinter
+IndexNotationVisitor <|- - Matcher
+
+IndexExprVisitorStrict <|- - IndexExprRewriterStrict
+IndexStmtVisitorStrict <|- - IndexStmtRewriterStrict
+IndexExprRewriterStrict <|- - IndexNotationRewriterStrict
+IndexStmtRewriterStrict <|- - IndexNotationRewriterStrict
+
+IndexNotationRewriterStrict <|- - IndexNotationRewriter
+
+
+
+class Lowerer {
+    +std::shared_ptr<LowererImpl> impl;
+}
+abstract class LowererImpl {
+    #class Visitor;
+    #friend class Visitor;
+    #std::shared_ptr<Visitor> visitor;
+
+    #virtual ir::Stmt lower(IndexStmt stmt);
+    #virtual ir::Expr lower(IndexExpr expr);
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr) = 0;
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;
+
+    +virtual ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack) = 0;
+}
+
+class LowererImplImperative {
+    -class Visitor
+    -friend class Visitor
+    -std::shared_ptr<Visitor> visitor
+    -bool assemble
+    -bool compute
+    -vars a_bunch_of_other_fields
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr);
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt);
+
+    +ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack)
+
+}
+note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n  return visitor->lower(stmt);\n}
+
+Uncopyable <|- - LowererImpl
+Lowerer "1" *- - "1" LowererImpl : contains
+
+
+class Visitor {
+    -LowererImpl* impl
+    -Expr expr
+    -Stmt stmt
+
+    -void visit(const AssignmentNode* node)
+    -void visit(const YieldNode* node)
+    -void visit(const ForallNode* node) 
+    -void visit(const WhereNode* node) 
+    -void visit(const MultiNode* node) 
+    -void visit(const SuchThatNode* node) 
+    -void visit(const SequenceNode* node) 
+    -void visit(const AssembleNode* node) 
+    -void visit(const AccessNode* node) 
+    -void visit(const LiteralNode* node) 
+    -void visit(const NegNode* node) 
+    -void visit(const AddNode* node) 
+    -void visit(const SubNode* node) 
+    -void visit(const MulNode* node) 
+    -void visit(const DivNode* node) 
+    -void visit(const SqrtNode* node) 
+    -void visit(const CastNode* node) 
+    -void visit(const CallIntrinsicNode* node) 
+    -void visit(const ReductionNode* node) 
+
+    +Visitor(LowererImplImperative* impl)
+    +Stmt lower(IndexStmt stmt)
+    +Expr lower(IndexExpr expr)
+}
+
+note bottom of Visitor:   Stmt lower(IndexStmt stmt) {\n  this->stmt = Stmt();\n  impl->accessibleIterators.scope();\n  IndexStmtVisitorStrict::visit(stmt);\n  impl->accessibleIterators.unscope();\n  return this->stmt;\n}
+
+IndexNotationVisitorStrict <|- - Visitor
+LowererImpl "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImpl : contains
+
+LowererImpl <|- - LowererImplImperative
+LowererImplImperative "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImplImperative : contains
+
+@enduml
+
+PlantUML version 1.2021.7(Sun May 23 08:40:07 EDT 2021)
+(GPL source distribution)
+Java Runtime: OpenJDK Runtime Environment
+JVM: OpenJDK 64-Bit Server VM
+Default Encoding: ANSI_X3.4-1968
+Language: en
+Country: US
+--></g></svg>
\ No newline at end of file
diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp
index f0c09d98a..64c8b3f02 100644
--- a/src/codegen/codegen.cpp
+++ b/src/codegen/codegen.cpp
@@ -229,6 +229,49 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool
   return ret.str();
 }
 
+// Builds the parameter declaration text for one unpacked tensor property:
+// values and indices are emitted as `[]` arrays, sizes and dimensions as
+// scalars. `is_output_prop` is part of the signature but is not read here.
+string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op,
+                            bool is_output_prop) {
+  const auto tensor = op->tensor.as<Var>();
+  stringstream decl;
+  switch (op->property) {
+    case TensorProperty::Values:
+      // element type comes from the tensor variable itself
+      decl << "uniform " << printType(tensor->type, false) << " " << varname << "[]";
+      return decl.str();
+    case TensorProperty::ValuesSize:
+      decl << "int32 " << varname;
+      return decl.str();
+    case TensorProperty::Dimension:
+      // scalar; only Int32/Int64 receive the `uniform` qualifier
+      if (op->type == Int32) {
+        decl << "uniform int32 ";
+      } else if (op->type == Int64) {
+        decl << "uniform int64 ";
+      } else {
+        decl << "int ";
+      }
+      decl << varname;
+      return decl.str();
+    default:
+      break;
+  }
+
+  // every remaining property is expected to be an index array
+  taco_iassert(op->property == TensorProperty::Indices);
+  if (op->type == Int32) {
+    decl << "uniform int32 ";
+  } else if (op->type == Int64) {
+    decl << "uniform int64 ";
+  } else {
+    decl << "uniform int ";
+  }
+  decl << varname << "[]";
+  return decl.str();
+}
+
 string CodeGen::unpackTensorProperty(string varname, const GetProperty* op,
                             bool is_output_prop) {
   stringstream ret;
@@ -310,13 +353,9 @@ string CodeGen::pointTensorProperty(std::string varname) {
   return ret.str();
 }
 
-// helper to print declarations
-string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
-                           vector<Expr> inputs, vector<Expr> outputs) {
-  stringstream ret;
-  unordered_set<string> propsAlreadyGenerated;
-
-  vector<const GetProperty*> sortedProps;
+void CodeGen::getSortedProps(map<Expr, string, ExprCompare> &varMap,
+              vector<const GetProperty*> &sortedProps, vector<Expr> &inputs,
+              vector<Expr> &outputs) {
 
   for (auto const& p: varMap) {
     if (p.first.as<GetProperty>())
@@ -355,6 +394,17 @@ string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
          return a->index < b->index;
        });
 
+}
+
+// helper to print declarations
+string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
+                           vector<Expr> inputs, vector<Expr> outputs) {
+  stringstream ret;
+  unordered_set<string> propsAlreadyGenerated;
+
+  vector<const GetProperty*> sortedProps;
+  getSortedProps(varMap, sortedProps, inputs, outputs);
+
   for (auto prop: sortedProps) {
     bool isOutputProp = (find(outputs.begin(), outputs.end(),
                               prop->tensor) != outputs.end());
@@ -375,7 +425,6 @@ string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
   return ret.str();
 }
 
-
 string CodeGen::printPack(map<tuple<Expr, TensorProperty, int, int>,
         string> outputProperties, vector<Expr> outputs) {
   stringstream ret;
diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h
index cc25c80d6..48540904e 100644
--- a/src/codegen/codegen.h
+++ b/src/codegen/codegen.h
@@ -16,7 +16,8 @@ class CodeGen : public IRPrinter {
   enum CodeGenType { C, CUDA };
 
   CodeGen(std::ostream& stream, CodeGenType type) : IRPrinter(stream), codeGenType(type) {};
-  CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) : IRPrinter(stream, color, simplify), codeGenType(type) {};
+  CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) 
+    : IRPrinter(stream, color, simplify), codeGenType(type) {};
   /// Initialize the default code generator
   static std::shared_ptr<CodeGen> init_default(std::ostream &dest, OutputKind outputKind);
 
@@ -26,6 +27,9 @@ class CodeGen : public IRPrinter {
 protected:
   static bool checkForAlloc(const Function *func);
   static int countYields(const Function *func);
+  void getSortedProps(std::map<Expr, std::string, ExprCompare> &varMap,
+              std::vector<const GetProperty*> &sortedProps, std::vector<Expr> &inputs,
+              std::vector<Expr> &outputs);
 
   static std::string printCType(Datatype type, bool is_ptr);
   static std::string printCUDAType(Datatype type, bool is_ptr);
@@ -52,6 +56,10 @@ class CodeGen : public IRPrinter {
   std::string printFuncName(const Function *func, 
           std::map<Expr, std::string, ExprCompare> inputMap={}, 
           std::map<Expr, std::string, ExprCompare> outputMap={});
+  
+  std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr);
+  std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op,
+                              bool is_output_prop); 
 
   void resetUniqueNameCounters();
   std::string genUniqueName(std::string name);
@@ -61,9 +69,8 @@ class CodeGen : public IRPrinter {
 private:
   virtual std::string restrictKeyword() const { return ""; }
 
-  std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr);
   std::string unpackTensorProperty(std::string varname, const GetProperty* op,
-                              bool is_output_prop);
+                              bool is_output_prop); 
   std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property,
                             int mode, int index);
   std::string pointTensorProperty(std::string varname);
diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp
index 2ade9d7f6..d55adbe58 100644
--- a/src/codegen/codegen_c.cpp
+++ b/src/codegen/codegen_c.cpp
@@ -34,6 +34,7 @@ const string cHeaders =
   "#include <math.h>\n"
   "#include <complex.h>\n"
   "#include <string.h>\n"
+  "#include <omp.h>\n"
   "#if _OPENMP\n"
   "#include <omp.h>\n"
   "#endif\n"
@@ -308,6 +309,7 @@ void CodeGen_C::visit(const Function* func) {
   // output body
   print(func->body);
 
+
   // output repack only if we allocated memory
   if (checkForAlloc(func))
     out << endl << printPack(varFinder.outputProperties, func->outputs);
@@ -403,6 +405,7 @@ static string getAtomicPragma() {
 // Docs for vectorization pragmas:
 // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations
 void CodeGen_C::visit(const For* op) {
+
   switch (op->kind) {
     case LoopKind::Vectorized:
       doIndent();
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 55c9d01a8..37bda6046 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -28,23 +28,24 @@ class CodeGen_C : public CodeGen {
 protected:
   using IRPrinter::visit;
 
-  void visit(const Function*);
-  void visit(const VarDecl*);
-  void visit(const Yield*);
-  void visit(const Var*);
-  void visit(const For*);
-  void visit(const While*);
-  void visit(const GetProperty*);
-  void visit(const Min*);
-  void visit(const Max*);
-  void visit(const Allocate*);
-  void visit(const Sqrt*);
-  void visit(const Store*);
-  void visit(const Assign*);
+  virtual void visit(const Function*);
+  virtual void visit(const VarDecl*);
+  virtual void visit(const Yield*);
+  virtual void visit(const Var*);
+  virtual void visit(const For*);
+  virtual void visit(const While*);
+  virtual void visit(const GetProperty*);
+  virtual void visit(const Min*);
+  virtual void visit(const Max*);
+  virtual void visit(const Allocate*);
+  virtual void visit(const Sqrt*);
+  virtual void visit(const Store*);
+  virtual void visit(const Assign*);
 
   std::map<Expr, std::string, ExprCompare> varMap;
   std::vector<Expr> localVars;
   std::ostream &out;
+  int count = 0;
   
   OutputKind outputKind;
 
diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp
index bd0f487b1..c0192f243 100644
--- a/src/codegen/module.cpp
+++ b/src/codegen/module.cpp
@@ -42,6 +42,7 @@ void Module::addFunction(Stmt func) {
 
 void Module::compileToSource(string path, string prefix) {
   if (!moduleFromUserSource) {
+    std::cout << "module not from user source\n";
   
     // create a codegen instance and add all the funcs
     bool didGenRuntime = false;
@@ -109,6 +110,7 @@ void writeShims(vector<Stmt> funcs, string path, string prefix) {
 } // anonymous namespace
 
 string Module::compile() {
+  std::cout << "Module::compile\n";
   string prefix = tmpdir+libname;
   string fullpath = prefix + ".so";
   
@@ -137,12 +139,24 @@ string Module::compile() {
   string cmd = cc + " " + cflags + " " +
     prefix + file_ending + " " + shims_file + " " + 
     "-o " + fullpath + " -lm";
+  std::cout << "--------------------------------------------------------------------------------tmpdir: " << tmpdir << std::endl;
+  std::cout << "--------------------------------------------------------------------------------libname: " << libname << std::endl;
+  std::cout << "--------------------------------------------------------------------------------prefix: " << prefix << std::endl;
+  std::cout << "--------------------------------------------------------------------------------fullpath: " << fullpath << std::endl;
+  std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl;
 
   // open the output file & write out the source
   compileToSource(tmpdir, libname);
+
   
   // write out the shims
   writeShims(funcs, tmpdir, libname);
+  for (auto &statement : funcs) {
+    std::cout << "----- statement --------" << std::endl;
+    // std::cout << statement;
+    std::cout << std::endl;
+  }
+  std::cout << tmpdir << std::endl << libname << std::endl;
   
   // now compile it
   int err = system(cmd.data());
@@ -168,10 +182,61 @@ string Module::getSource() {
   return source.str();
 }
 
+void* Module::getFuncPtr(std::string& sofile, std::string name) {
+  // Unload the previously opened external library before loading `sofile`.
+  if (so_lib_handle) {
+    dlclose(so_lib_handle);
+  }
+  so_lib_handle = dlopen(sofile.data(), RTLD_NOW | RTLD_LOCAL);
+  // Never hand a null handle to dlsym(); report why the load failed instead.
+  taco_uassert(so_lib_handle) << "Failed to load " << sofile << ": " << dlerror();
+  return dlsym(so_lib_handle, name.data());
+}
+
 void* Module::getFuncPtr(std::string name) {
   return dlsym(lib_handle, name.data());
 }
 
+/// Call `name` from the shared object `sofile` with packed args; returns its result.
+int Module::callFuncPackedRaw(std::string name, std::string& sofile, void** args) {
+  typedef int (*fnptr_t)(void**);
+  static_assert(sizeof(void*) == sizeof(fnptr_t),
+    "Unable to cast dlsym() returned void pointer to function pointer");
+  void* v_func_ptr = getFuncPtr(sofile, name);
+  // dlsym() yields nullptr for an unknown symbol; fail before changing OpenMP state.
+  taco_uassert(v_func_ptr != nullptr) << "Function " << name << " not found in " << sofile;
+  fnptr_t func_ptr;
+  *reinterpret_cast<void**>(&func_ptr) = v_func_ptr;
+
+#if USE_OPENMP
+  omp_sched_t existingSched;
+  ParallelSchedule tacoSched;
+  int existingChunkSize, tacoChunkSize;
+  int existingNumThreads = omp_get_max_threads();
+  omp_get_schedule(&existingSched, &existingChunkSize);
+  taco_get_parallel_schedule(&tacoSched, &tacoChunkSize);
+  switch (tacoSched) {
+    case ParallelSchedule::Static:
+      omp_set_schedule(omp_sched_static, tacoChunkSize);
+      break;
+    case ParallelSchedule::Dynamic:
+      omp_set_schedule(omp_sched_dynamic, tacoChunkSize);
+      break;
+    default:
+      break;
+  }
+  omp_set_num_threads(taco_get_num_threads());
+#endif
+
+  int ret = func_ptr(args);
+
+#if USE_OPENMP
+  omp_set_schedule(existingSched, existingChunkSize);
+  omp_set_num_threads(existingNumThreads);
+#endif
+  return ret;
+}
+
 int Module::callFuncPackedRaw(std::string name, void** args) {
   typedef int (*fnptr_t)(void**);
   static_assert(sizeof(void*) == sizeof(fnptr_t),
@@ -200,7 +265,9 @@ int Module::callFuncPackedRaw(std::string name, void** args) {
   omp_set_num_threads(taco_get_num_threads());
 #endif
 
+  std::cout << "calling the function\n";
   int ret = func_ptr(args);
+  std::cout << "function call completed\n";
 
 #if USE_OPENMP
   omp_set_schedule(existingSched, existingChunkSize);
diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp
index 51fb8770c..2e26460c7 100644
--- a/src/index_notation/index_notation.cpp
+++ b/src/index_notation/index_notation.cpp
@@ -2438,6 +2438,7 @@ bool isConcreteNotation(IndexStmt stmt, std::string* reason) {
   return isConcrete;
 }
 
+// make reduction notation
 Assignment makeReductionNotation(Assignment assignment) {
   IndexExpr expr = assignment.getRhs();
   std::vector<IndexVar> free = assignment.getLhs().getIndexVars();
@@ -2513,7 +2514,10 @@ IndexStmt makeReductionNotation(IndexStmt stmt) {
   return makeReductionNotation(to<Assignment>(stmt));
 }
 
+// make concrete notation
 IndexStmt makeConcreteNotation(IndexStmt stmt) {
+  // std::cout << "concrete notation original assignment: " << stmt << std::endl;
+
   std::string reason;
   taco_iassert(isReductionNotation(stmt, &reason))
       << "Not reduction notation: " << stmt << std::endl << reason;
@@ -2521,6 +2525,7 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) {
 
   // Free variables and reductions covering the whole rhs become top level loops
   vector<IndexVar> freeVars = to<Assignment>(stmt).getFreeVars();
+  std::cout << "free vars: " << freeVars << std::endl;
 
   struct RemoveTopLevelReductions : IndexNotationRewriter {
     using IndexNotationRewriter::visit;
@@ -2535,12 +2540,17 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) {
         topLevelReductions.push_back(reduction.getVar());
         rhs = reduction.getExpr();
       }
+      // std::cout << "top level reductions: " << topLevelReductions << std::endl;
 
       if (rhs != node->rhs) {
-        stmt = Assignment(node->lhs, rhs, Add());
+        stmt = Assignment(node->lhs, rhs, Add()); // write with add
+        int idx = 0;
         for (auto& i : util::reverse(topLevelReductions)) {
+          std::cout << idx << ": " << stmt << std::endl;
+          idx++;
           stmt = forall(i, stmt);
         }
+        std::cout << idx << ": " << stmt << std::endl;
       }
       else {
         stmt = node;
@@ -2548,11 +2558,18 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) {
     }
   };
   stmt = RemoveTopLevelReductions().rewrite(stmt);
+  // std::cout << "after remove top level reductions: " << stmt << std::endl;
 
+  // now we form the stmt in reverse order of freeVars
+  int idx = 0;
   for (auto& i : util::reverse(freeVars)) {
+    std::cout << idx << ": " << stmt << std::endl;
     stmt = forall(i, stmt);
+    idx++;
   }
+  std::cout << idx << ": " << stmt << std::endl;
 
+  std::cout << "replacing reductions with whereas statements\n";
   // Replace other reductions with where and forall statements
   struct ReplaceReductionsWithWheres : IndexNotationRewriter {
     using IndexNotationRewriter::visit;
diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp
index 0b41615ad..d7ee998ae 100644
--- a/src/index_notation/index_notation_printer.cpp
+++ b/src/index_notation/index_notation_printer.cpp
@@ -224,9 +224,9 @@ void IndexNotationPrinter::visit(const YieldNode* op) {
 void IndexNotationPrinter::visit(const ForallNode* op) {
   os << "forall(" << op->indexVar << ", ";
   op->stmt.accept(this);
-  if (op->parallel_unit != ParallelUnit::NotParallel) {
+  // if (op->parallel_unit != ParallelUnit::NotParallel) {
     os << ", " << ParallelUnit_NAMES[(int) op->parallel_unit] << ", " << OutputRaceStrategy_NAMES[(int) op->output_race_strategy];
-  }
+  // }
   os << ")";
 }
 
diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp
index 47fc1dd55..3846da6a8 100644
--- a/src/index_notation/transformations.cpp
+++ b/src/index_notation/transformations.cpp
@@ -1,9 +1,16 @@
 #include "taco/index_notation/transformations.h"
 
+#include "lower/iteration_graph.h"
+#include "lower/tensor_path.h"
+#include "taco/cuda.h"
 #include "taco/index_notation/index_notation.h"
+#include "taco/index_notation/index_notation_nodes_abstract.h"
 #include "taco/index_notation/index_notation_rewriter.h"
 #include "taco/index_notation/index_notation_nodes.h"
+#include "taco/index_notation/index_notation_printer.h"
 #include "taco/error/error_messages.h"
+#include "taco/index_notation/intrinsic.h"
+#include "taco/type.h"
 #include "taco/util/collections.h"
 #include "taco/lower/iterator.h"
 #include "taco/lower/merge_lattice.h"
@@ -305,6 +312,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const {
         IndexExpr e = precompute.getExpr();
         IndexVar iw = precompute.getiw();
 
+        // these lines of code look interesting when creating the producer-consumer relationship
         IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}}));
         IndexStmt producer = forall(iw, Assignment(ws(iw), replace(e, {{i,iw}}), 
                                                    assign.getOperator()));
@@ -592,7 +600,10 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
     std::string reason = "";
 
     IndexStmt rewriteParallel(IndexStmt stmt) {
+      std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
+      // std::cout << stmt << std::endl;
       provGraph = ProvenanceGraph(stmt);
+      std::cout << "2 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
 
       const auto reductionVars = getReductionVars(stmt);
       reductionIndexVars.clear();
@@ -607,15 +618,22 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
       tensorVars = createIRTensorVars(stmt);
 
       assembledByUngroupedInsert.clear();
+      std::cout << "3 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
       for (const auto& result : getAssembledByUngroupedInsertion(stmt)) {
         assembledByUngroupedInsert.push_back(tensorVars[result]);
       }
 
+      std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
+      // std::cout << stmt << std::endl;
       return rewrite(stmt);
     }
 
     void visit(const ForallNode* node) {
+      std::cout << "transformations.cpp void visit(const ForallNode* node)\n";
+      std::cout << "node: \n" << node << std::endl;
       Forall foralli(node);
+      std::cout << "foralli: \n" << foralli << std::endl;
+      std::cout << "before stmt update stmt: \n" << stmt << std::endl;
       IndexVar i = parallelize.geti();
 
       definedIndexVars.insert(foralli.getIndexVar());
@@ -632,6 +650,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
         Iterators iterators(foralli, tensorVars);
         MergeLattice lattice = MergeLattice::make(foralli, iterators, provGraph, 
                                                   definedIndexVars);
+        std::cout << "iter: " << i << ", lattice: \n" << lattice << std::endl;
 
         // Precondition 2: No coiteration of modes (i.e., merge lattice has 
         //                 only one iterator)
@@ -660,6 +679,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
         MergeLattice underivedLattice = MergeLattice::make(underivedForall, 
                                                            iterators, provGraph, 
                                                            definedIndexVars);
+        std::cout << "iter: " << i << ", underivedLattice: \n" << lattice << std::endl;
 
         // Precondition 3: Every result iterator must have insert capability
         for (Iterator iterator : underivedLattice.results()) {
@@ -721,6 +741,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
             // build consumer that writes from temporary to output, mark consumer as parallel reduction
             ParallelUnit reductionUnit = ParallelUnit::CPUThreadGroupReduction;
             if (should_use_CUDA_codegen()) {
+              std::cout << "should_use_CUDA_codegen() true\n";
               if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
                 reductionUnit = ParallelUnit::GPUWarpReduction;
               }
@@ -728,6 +749,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
                 reductionUnit = ParallelUnit::GPUBlockReduction;
               }
             }
+            else {
+              std::cout << "should_use_CUDA_codegen() false\n";
+            }
             IndexStmt consumer = forall(i, Assignment(assignment->lhs, w(i), assignment->op), reductionUnit, OutputRaceStrategy::ParallelReduction);
             precomputed_stmt = where(consumer, producer);
           }
@@ -746,8 +770,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
           return;
         }
 
-
+        std::cout << "updated stmt: \n";
         stmt = forall(i, foralli.getStmt(), parallelize.getParallelUnit(), parallelize.getOutputRaceStrategy(), foralli.getUnrollFactor());
+        std::cout << stmt << std::endl;
         return;
       }
 
@@ -1181,6 +1206,7 @@ std::ostream& operator<<(std::ostream& os,
 
 IndexStmt parallelizeOuterLoop(IndexStmt stmt) {
   // get outer ForAll
+  std::cout << "get outer ForAll ----------------- \n";
   Forall forall;
   bool matched = false;
   match(stmt,
@@ -1216,6 +1242,7 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt) {
     return parallelized256;
   }
   else {
+    std::cout << "outer loop parallelization for CPU codgen index statement\n";
     IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces).apply(stmt, &reason);
     if (parallelized == IndexStmt()) {
       // can't parallelize
@@ -1274,6 +1301,7 @@ static vector<IndexVar>
 topologicallySort(map<IndexVar,set<IndexVar>> hardDeps,
                   map<IndexVar,multiset<IndexVar>> softDeps,
                   vector<IndexVar> originalOrder) {
+  std::cout << "originalOrder: " << std::endl;
   vector<IndexVar> sortedVars;
   unsigned long countVars = originalOrder.size();
   while (sortedVars.size() < countVars) {
@@ -1295,6 +1323,9 @@ topologicallySort(map<IndexVar,set<IndexVar>> hardDeps,
     }
 
     // No free var found there is a cycle
+    std::cout << "this is where the assert fails\n";
+    std::cout << "freeVarPos: " << freeVarPos << std::endl;
+    std::cout << "limit: " << std::numeric_limits<size_t>::max() << std::endl;
     taco_iassert(freeVarPos != std::numeric_limits<size_t>::max())
         << "Cycles in iteration graphs must be resolved, through transpose, "
         << "before the expression is passed to the topological sorting "
@@ -1320,8 +1351,674 @@ topologicallySort(map<IndexVar,set<IndexVar>> hardDeps,
   return sortedVars;
 }
 
+/// Check whether the tensor accessed last can be split off the expression
+/// ("fission from the back") and collect the index variables involved.
+///
+/// Fission is reported only when the last index variable of the result path
+/// occurs in no tensor path other than the final one and is also the last
+/// index variable of that final path. On success (vectors are appended to):
+///  - removedAccessNode: name of the tensor variable of the final path
+///  - producerVars: sortedAllIndexes without the removed index variable
+///  - modifiedResultIndexesAccessed: the remaining variables that occur in
+///    the result path or the final path, in sortedAllIndexes order
+///  - consumerVars: variables of sortedAllIndexes that occur in
+///    modifiedResultIndexesAccessed or in the final path
+/// Returns false, leaving the outputs untouched, when fission is not possible.
+bool checkFromBack(const TensorPath& resultTensorPath,
+                  const vector<TensorPath>& tensorPaths,
+                  string& removedAccessNode,
+                  vector<IndexVar>& producerVars,
+                  vector<IndexVar>& consumerVars,
+                  vector<IndexVar>& modifiedResultIndexesAccessed,
+                  vector<IndexVar>& sortedAllIndexes) {
+
+  std::cout << "check from back function execution\n";
+
+  const std::vector<IndexVar>& resultIndexesVisited = resultTensorPath.getVariables();
+  // back() on an empty vector is undefined behaviour, and a `size() - 1` loop
+  // bound wraps around for an empty path list: nothing can be split off then.
+  if (resultIndexesVisited.empty() || tensorPaths.empty()) {
+    return false;
+  }
+  IndexVar lastVisitedIndexVar = resultIndexesVisited.back();
+
+  std::cout << "last visited index variable: " << lastVisitedIndexVar << std::endl;
+
+  bool onlyLastTensorContainLastIndexOfOutput = true;
+  bool fissionFromBack = false;
+
+  // check from the back: inspect every tensor path except the final one
+  for (size_t i = 0; i + 1 < tensorPaths.size(); i++) {
+    const TensorPath& otherIndexPaths = tensorPaths.at(i);
+    const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+    cout << "index paths: " << otherIndexPaths << endl;
+
+    // check if other tensors also contain last index of output tensor
+    for (auto index : indexesVisited) {
+      cout << "checking " << index << " " << lastVisitedIndexVar << endl;
+      if (index == lastVisitedIndexVar) {
+        onlyLastTensorContainLastIndexOfOutput = false;
+      }
+    }
+  }
+
+  if (onlyLastTensorContainLastIndexOfOutput) { // last accessed tensorVariable
+    const TensorPath& otherIndexPaths = tensorPaths.back();
+    const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+    cout << "index paths: " << otherIndexPaths << endl;
+
+    cout << "index variable maybe removed from the back\n";
+    // A path without index variables has no last index to compare against.
+    if (indexesVisited.empty()) {
+      return false;
+    }
+    auto lastTensorLastVisited = indexesVisited.back();
+    cout << "last index last visited " << lastTensorLastVisited << endl;
+
+    if (lastTensorLastVisited == lastVisitedIndexVar) {
+      cout << "we can diffuse from the back\n";
+      fissionFromBack = true;
+      removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName();
+      cout << "removed access node " << removedAccessNode << endl;
+
+      // mark producer accessed index variables
+      for (auto indexVar : sortedAllIndexes) {
+        if (indexVar != lastVisitedIndexVar) { // add everything except the last accessed index
+          std::cout << "producer vars: " << indexVar << std::endl;
+          producerVars.push_back(indexVar);
+        }
+      }
+
+      // index variables of the intermediate result, without the removed index
+      for (auto indexVar : sortedAllIndexes) {
+        if (indexVar != lastVisitedIndexVar) {
+          if (
+            find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar)
+              != resultIndexesVisited.end() ||
+            find(indexesVisited.begin(), indexesVisited.end(), indexVar)
+              != indexesVisited.end()
+          ) {
+            modifiedResultIndexesAccessed.push_back(indexVar);
+          }
+        }
+      }
+
+      std::cout << "printing modifiedResultIndexesAccessed\n";
+      for (auto& idx : modifiedResultIndexesAccessed) {
+        std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl;
+      }
+      std::cout << "printed modifiedResultIndexesAccessed\n";
+
+      // mark consumer accessed index variables
+      for (auto indexVar : sortedAllIndexes) {
+        if (
+          find(modifiedResultIndexesAccessed.begin(), modifiedResultIndexesAccessed.end(), indexVar)
+            != modifiedResultIndexesAccessed.end() ||
+          find(indexesVisited.begin(), indexesVisited.end(), indexVar)
+            != indexesVisited.end()
+        ) {
+          std::cout << "consumer var: " << indexVar << std::endl;
+          consumerVars.emplace_back(indexVar);
+        }
+      }
+
+    }
+  }
+
+  return fissionFromBack;
+}
+
+/// Check whether the tensor accessed first can be split off the expression
+/// ("fission from the front") and collect the index variables involved.
+///
+/// Fission is reported only when the first index variable of the result path
+/// occurs in no tensor path other than the first one and is also the first
+/// index variable of that first path. On success (vectors are appended to):
+///  - removedAccessNode: name of the tensor variable of the first path
+///  - producerVars: sortedAllIndexes without the removed index variable
+///  - modifiedResultIndexesAccessed / consumerVars: filled by the loops below
+/// Returns false, leaving the outputs untouched, when fission is not possible.
+bool checkFromFront(const TensorPath& resultTensorPath,
+                  const vector<TensorPath>& tensorPaths,
+                  string& removedAccessNode,
+                  vector<IndexVar>& producerVars,
+                  vector<IndexVar>& consumerVars,
+                  vector<IndexVar>& modifiedResultIndexesAccessed,
+                  vector<IndexVar>& sortedAllIndexes) {
+
+  std::cout << "check from front function execution\n";
+
+  const std::vector<IndexVar>& resultIndexesVisited = resultTensorPath.getVariables();
+  // front() on an empty vector is undefined behaviour: nothing to split off.
+  if (resultIndexesVisited.empty() || tensorPaths.empty()) {
+    return false;
+  }
+  IndexVar firstVisitedIndexVar = resultIndexesVisited.front();
+
+  std::cout << "first fisited index variable: " << firstVisitedIndexVar << std::endl;
+  std::cout << "tensor path size: " << tensorPaths.size() << std::endl;
+
+  bool onlyFirstTensorContainFirstIndexOfOutput = true;
+  bool fissionFromFront = false;
+
+  // check from the front: no path except the first may contain the first index
+  for (size_t i = tensorPaths.size() - 1; i > 0; i--) {
+    std::cout << "i: " << i << std::endl;
+    const TensorPath& otherIndexPaths = tensorPaths.at(i);
+    const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+    cout << "index paths: " << otherIndexPaths << endl;
+
+    for (auto index : indexesVisited) {
+      cout << "checking " << index << " " << firstVisitedIndexVar << endl;
+      if (index == firstVisitedIndexVar) {
+        onlyFirstTensorContainFirstIndexOfOutput = false;
+      }
+    }
+  }
+
+  if (onlyFirstTensorContainFirstIndexOfOutput) { // first accessed tensorVariable
+    const TensorPath& otherIndexPaths = tensorPaths.front();
+    const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+    cout << "index paths: " << otherIndexPaths << endl;
+
+    cout << "index variable maybe removed from the front\n";
+    // A path without index variables has no first index to compare against.
+    if (indexesVisited.empty()) {
+      return false;
+    }
+    auto firstTensorFirstVisited = indexesVisited.front();
+    cout << "first index first visited " << firstTensorFirstVisited << endl;
+
+    if (firstTensorFirstVisited == firstVisitedIndexVar) {
+      cout << "we can diffuse from the front\n";
+      fissionFromFront = true;
+      removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName();
+      cout << "removed access node " << removedAccessNode << endl;
+
+      // mark producer accessed index variables
+      for (auto indexVar : sortedAllIndexes) {
+        if (indexVar != firstVisitedIndexVar) { // add everything except the first accessed index
+          producerVars.emplace_back(indexVar);
+        }
+      }
+
+      // index variables of the intermediate result, without the removed index
+      for (auto indexVar : sortedAllIndexes) {
+        if (indexVar != firstVisitedIndexVar) {
+          if (
+            find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar)
+              != resultIndexesVisited.end() ||
+            find(indexesVisited.begin(), indexesVisited.end(), indexVar)
+              != indexesVisited.end()
+          ) {
+            modifiedResultIndexesAccessed.push_back(indexVar);
+          }
+        }
+      }
+
+      std::cout << "printing modifiedResultIndexesAccessed\n";
+      for (auto& idx : modifiedResultIndexesAccessed) {
+        std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl;
+      }
+      std::cout << "printed modifiedResultIndexesAccessed\n";
+
+      // mark consumer accessed index variables
+      for (auto indexVar : sortedAllIndexes) {
+        if (
+        find(modifiedResultIndexesAccessed.begin(), modifiedResultIndexesAccessed.end(), indexVar)
+          != modifiedResultIndexesAccessed.end() ||
+        find(indexesVisited.begin(), indexesVisited.end(), indexVar) != indexesVisited.end()) {
+          consumerVars.emplace_back(indexVar);
+        }
+      }
+
+    }
+  } else {
+    std::cout << "fission from the front is not possible\n";
+  }
+
+  return fissionFromFront;
+}
+
+
+// let's assume the user gives the removable index node and 
+// the removable expression from front or end
+
+IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment, 
+  std::string side, int iters) {
+  std::cout << "executing travese operation written by me\n";
+
+  if (iters < 1) {
+    return stmt;
+  }
+
+  // IndexVar keeps i, j, k, l, values.
+  // so if we know what index to remove, the rest defines
+  // the order of the producer
+  struct SortedIndexVars : public IndexNotationVisitor {
+    using IndexNotationVisitor::visit;
+    map <IndexVar, ParallelUnit> forallParallelUnit;
+    map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy;
+    vector<IndexVar> sortedIndexes;
+    Assignment innerBody;
+
+    SortedIndexVars() {};
+
+    void visit(const ForallNode* node) {
+      Forall forallNode(node);
+      IndexVar i = forallNode.getIndexVar();
+      std::cout << forallNode << std::endl;
+
+      sortedIndexes.push_back(i);
+      forallParallelUnit[i] = forallNode.getParallelUnit();
+      forallOutputRaceStrategy[i] = forallNode.getOutputRaceStrategy();
+
+      if (isa<Assignment>(forallNode.getStmt())) {
+        cout << "assignment node found: " << forallNode.getStmt() << endl;;
+        innerBody = to<Assignment>(forallNode.getStmt());
+        return; // Only reorder first contiguous section of ForAlls
+      }
+
+      IndexNotationVisitor::visit(node);
+    }
+  };
+
+  std::cout << "traversing through the index statement\n";
+  SortedIndexVars sortedIndexVars;
+  stmt.accept(&sortedIndexVars);
+  std::cout << std::endl;
+
+  struct IndexExprBuilder : public IndexNotationVisitor {
+
+    using IndexNotationVisitor::visit;
+    vector<Access> accessLeftToRight;
+    map<IndexVar, vector<pair<Dimension,Type>>> indexDimensionsMap;
+
+    void visit(const AccessNode* node) {
+      Access accessNode(node);
+      std::cout << "access node: " << accessNode << std::endl;
+      accessLeftToRight.push_back(accessNode);
+
+      TensorVar tensorVar = accessNode.getTensorVar();
+
+      for (unsigned long i=0; i < accessNode.getIndexVars().size(); i++) {
+        auto var = accessNode.getIndexVars()[i];
+
+        if (indexDimensionsMap.find(var) != indexDimensionsMap.end()) {
+          indexDimensionsMap[var].emplace_back(
+            pair<Dimension,Type>(tensorVar.getType().getShape().getDimension(i),
+            tensorVar.getType()));
+        }
+        else {
+          indexDimensionsMap[var] = {
+            pair<Dimension,Type>(
+              tensorVar.getType().getShape().getDimension(i),
+              tensorVar.getType())
+          };
+        }
+      }
+
+    }
+
+  };
+
+  IndexExpr rhsExpr = assignment.getRhs();
+  Access lhsAccess = to<Access>(assignment.getLhs());
+  std::cout << "right hand side expression: " << rhsExpr << std::endl;
+  IndexExprBuilder indexExprBuilder;
+  rhsExpr.accept(&indexExprBuilder);
+  TensorVar resultVar = lhsAccess.getTensorVar();
+
+  for (auto item : indexExprBuilder.indexDimensionsMap) {
+    auto indexVar = item.first;
+    cout << "var: " << indexVar << " ";
+    for (auto elem : item.second) {
+      cout << elem.first << " " << elem.second << " " ;
+    }
+    cout << endl;
+  }
+
+
+  // now I have the iteration graph
+  IterationGraph iterationGraph = IterationGraph::make(assignment);
+  std::cout << "/*******************************************/\n";
+  std::cout << "/********** ITERATION GRAPH ****************/\n";
+  std::cout << "/*******************************************/\n";
+  std::cout << iterationGraph << std::endl;
+
+  const TensorPath& resultTensorPath = iterationGraph.getResultTensorPath();
+  const std::vector<TensorPath>& tensorPaths = iterationGraph.getTensorPaths();
+  
+
+  string removedAccessNode;
+  vector<IndexVar> producerVars; // producer accessed index variables
+  vector<IndexVar> consumerVars; // consumer accessed index variables
+  vector<IndexVar> fusedVars;
+  vector<IndexVar> modifiedResultIndexesAccessed;
+  bool fissionFromBack = false;
+  if (side == "b") {
+    fissionFromBack = true;
+  }
+
+  if (fissionFromBack) {
+    fissionFromBack = checkFromBack(resultTensorPath, tensorPaths, 
+      removedAccessNode, producerVars, consumerVars,
+      modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes
+    );
+  }
+
+  bool fissionFromFront = false;
+  if (side == "f") {
+    fissionFromFront = true;
+  }
+  if (fissionFromBack == false && fissionFromFront) {
+      fissionFromFront = checkFromFront(resultTensorPath, tensorPaths, 
+        removedAccessNode, producerVars, consumerVars,
+        modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes
+      );
+  }  
+
+  if (!fissionFromBack && !fissionFromFront) {
+    cout << "fission operation cannot be performed from the back\n";
+    return stmt;
+  }
+
+  vector<Dimension> newAccessDims{};
+  for (auto var : modifiedResultIndexesAccessed) {
+    auto item = indexExprBuilder.indexDimensionsMap[var];
+    cout << "shared vars: " << var << endl;
+    newAccessDims.emplace_back(item[0].first);
+  }
+  TensorVar newAccessVar(resultVar.getName() + "_inner", 
+              Type(resultVar.getType().getDataType(), newAccessDims));
+  cout << "new inner assignment statement: " << modifiedResultIndexesAccessed << std::endl;
+  Access newResultAccess(newAccessVar, modifiedResultIndexesAccessed);
+  cout << "new access variable for iterative apply: " << newResultAccess << std::endl;
+
+  if (fissionFromBack) {
+    std::cout << "fission from the back is possible\n";
+  }
+  if (fissionFromFront) {
+    std::cout << "fission from the front is possible\n";
+  }
+
+  // // check from the front
+  // struct IndexExprSeparator : public IndexNotationVisitor {
+
+  //   using IndexNotationVisitor::visit;
+  //   vector<Access> accessLeftToRight;
+
+  //   void visit(const MulNode* node) {
+  //     Mul mulNode(node);
+  //     IndexExpr lhs = mulNode.getA();
+  //     IndexExpr rhs = mulNode.getB();
+  //     std::cout << "access node: " << accessNode << std::endl;
+  //     accessLeftToRight.push_back(accessNode);
+  //   }
+
+  // };
+
+
+  cout << "\n\nProducer accessed index variables\n";
+  auto it = producerVars.begin();
+  for (; it != producerVars.end(); it++) {
+    cout << *it << endl;
+  }
+  cout << "\n\nConsumer accessed index variables\n";
+  it = consumerVars.begin();
+  for (; it != consumerVars.end(); it++) {
+    cout << *it << endl;
+  }
+  cout << endl << endl;
+
+  // check common vars that can be fused
+  for (auto var : sortedIndexVars.sortedIndexes) {
+    if (find(producerVars.begin(), producerVars.end(), var) != producerVars.end() &&
+    find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()) {
+      fusedVars.emplace_back(var);
+    }
+    else {
+      break;
+    }
+  }
+
+  for (auto& fv : fusedVars) {
+    std::cout << "fusable vars: " << fv << std::endl;
+  }
+
+  vector<IndexVar> sharedVars;
+  for (auto var : sortedIndexVars.sortedIndexes) {
+    if (find(fusedVars.begin(), fusedVars.end(), var) == fusedVars.end() &&
+      find(producerVars.begin(), producerVars.end(), var) != producerVars.end() &&
+      find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()
+    ) {
+      sharedVars.emplace_back(var);
+    }
+  }
+
+  for (auto& sv : sharedVars) {
+    std::cout << "shared vars: " << sv << std::endl;
+  }
+
+  vector<Dimension> sharedDims{};
+  for (auto var : sharedVars) {
+    auto item = indexExprBuilder.indexDimensionsMap[var];
+    cout << "shared vars: " << var << endl;
+    sharedDims.emplace_back(item[0].first);
+  }
+
+
+  // get removing tensorvars and workspace dimension
+  const Type& type = resultTensorPath.getAccess().getTensorVar().getType();
+  const Format& format = resultTensorPath.getAccess().getTensorVar().getFormat();
+  TensorVar intermediateTensor("ws", type, format);
+  cout << intermediateTensor << endl;
+
+  // TensorVar A("A", Type(), taco::dense);
+  TensorVar tempVar("t" + resultVar.getName(), 
+                Type(resultVar.getType().getDataType(), sharedDims));
+  cout << "tensor order: " << tempVar.getOrder() << endl;
+  cout << "tensor format: " << tempVar.getFormat() << endl;
+  cout << "format order: " << tempVar.getFormat().getOrder() << endl;
+  
+  // TensorVar* a = new TensorVar("A", Type());
+  // TensorVar ws("ws", Type(type<double>(), {jdim}) );
+
+  // get removing indexExpr and the rest of the indexExpr
+  Access workspace(tempVar, sharedVars);
+  std::cout << "workspace access tensor: " << workspace << std::endl;
+
+
+  
+  // construct producer expression right hand side
+  cout << "generating consumer expression\n";
+  IndexExpr producerExpr;
+  int num_muls = 0;
+  for (Access accessNode : indexExprBuilder.accessLeftToRight) {
+    std::cout << "accessNodes: " << accessNode << endl;
+    if (removedAccessNode != accessNode.getTensorVar().getName()) {
+      if (producerExpr == NULL) {
+        std::cout << "index expression is null";
+        producerExpr = accessNode;
+        std::cout << "producerExpr: " << producerExpr << std::endl;
+      } else {
+        num_muls++;
+        producerExpr = producerExpr * accessNode;
+        std::cout << "producerExpr: " << producerExpr << std::endl;
+      }
+    }
+  }
+  std::cout << producerExpr << std::endl;
+  Assignment producerAssignment(newResultAccess,
+    producerExpr);
+  std::cout << "new inner assignment statement: " << producerAssignment << std::endl;
+  Assignment producerInnerBody(workspace,
+    producerExpr,
+    sortedIndexVars.innerBody.getOperator()
+  );
+  std::cout << "producerInnerBody: " << producerInnerBody << std::endl;
+
+  // construct consumer expression right hand side
+  IndexExpr consumerExpr;
+  if (fissionFromBack) {
+    consumerExpr = workspace;
+  }
+  cout << "generating consumer expression: " << consumerExpr << std::endl;
+  for (Access accessNode : indexExprBuilder.accessLeftToRight) {
+    TensorVar tv = accessNode.getTensorVar();
+    std::cout << "accessNodes: " << accessNode << endl;
+    if (removedAccessNode == accessNode.getTensorVar().getName()) {
+      if (consumerExpr == NULL) {
+        std::cout << "index expression is null";
+        consumerExpr = accessNode;
+        std::cout << "consumerExpr: " << consumerExpr << std::endl;
+      } else {
+        consumerExpr = consumerExpr * accessNode;
+        std::cout << "consumerExpr: " << consumerExpr << std::endl;
+      }
+    }
+  }
+  if (fissionFromFront) {
+    consumerExpr = consumerExpr * workspace;
+  }
+  Assignment consumerInnerBody(lhsAccess,
+    consumerExpr,
+    sortedIndexVars.innerBody.getOperator()
+  );
+
+  cout << "Producer inner body: " << producerInnerBody << endl;
+  cout << "Consumer inner body: " << consumerInnerBody << endl;
+
+  // rewrite indexstmt
+  // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall
+  // Rebuilds one side of the fission: every assignment becomes innerBody and
+  // only foralls over non-fused variables in producerConsumerVars are kept.
+  struct ProducerConsumerRewriter : public IndexNotationRewriter {
+    using IndexNotationRewriter::visit;
+
+    const vector<IndexVar>& producerConsumerVars;
+    const vector<IndexVar>& fusedVars;
+    IndexStmt innerBody;
+    const map <IndexVar, ParallelUnit> forallParallelUnit;
+    const map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy;
+
+    ProducerConsumerRewriter(const vector<IndexVar>& producerConsumerVars,
+                    const vector<IndexVar>& fusedVars, IndexStmt innerBody,
+                    const map <IndexVar, ParallelUnit> forallParallelUnit,
+                    const map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy)
+        : producerConsumerVars(producerConsumerVars), fusedVars(fusedVars),
+          innerBody(innerBody), forallParallelUnit(forallParallelUnit),
+          forallOutputRaceStrategy(forallOutputRaceStrategy) {}
+
+    // Rewrites the body, then re-wraps it in this forall only when it is kept.
+    void visit(const ForallNode* node) {
+      Forall loop(node);
+      IndexVar loopVar = loop.getIndexVar();
+      cout << "going through var: " << loopVar << endl;
+
+      stmt = rewrite(loop.getStmt());
+      cout << "after rewrite statement: " << stmt << endl;
+
+      const bool isFused =
+          find(fusedVars.begin(), fusedVars.end(), loopVar) != fusedVars.end();
+      const bool isTracked = find(producerConsumerVars.begin(),
+          producerConsumerVars.end(), loopVar) != producerConsumerVars.end();
+      if (isFused || !isTracked) {
+        return;  // loop dropped: stmt stays the rewritten body
+      }
+      stmt = forall(loopVar, stmt, forallParallelUnit.at(loopVar),
+                    forallOutputRaceStrategy.at(loopVar), loop.getUnrollFactor());
+    }
+
+    // Replaces the visited assignment with innerBody.
+    void visit (const AssignmentNode* node) {
+      cout << "assignment node: " << node << endl;
+      stmt = innerBody;
+      cout << "producerStmt: " << innerBody << endl;
+      cout << "stmt: " << stmt << endl;
+    }
+  };
+  ProducerConsumerRewriter producerRewriter(producerVars, fusedVars, 
+              producerInnerBody, 
+              sortedIndexVars.forallParallelUnit, 
+              sortedIndexVars.forallOutputRaceStrategy);
+  IndexStmt producerStmt = producerRewriter.rewrite(stmt);
+  std::cout << "\nAfter Producer rewriter\n";
+  std::cout << producerStmt << std::endl;
+  if (num_muls > 1) {
+    producerStmt = loopFusionOverFission(producerStmt, producerInnerBody, 
+      side, iters-1);
+  }
+  
+
+  ProducerConsumerRewriter consumerRewriter(consumerVars, fusedVars, 
+              consumerInnerBody, 
+              sortedIndexVars.forallParallelUnit, 
+              sortedIndexVars.forallOutputRaceStrategy);
+  IndexStmt consumerStmt = consumerRewriter.rewrite(stmt);
+  std::cout << "\nAfter Consumer rewriter\n";
+  std::cout << consumerStmt << std::endl;
+
+
+  // Stitches both halves together: foralls over fused variables are kept and
+  // each non-fused forall reached becomes where(consumerStmt, producerStmt).
+  struct CombineProducerConsumerRewriter : public IndexNotationRewriter {
+    using IndexNotationRewriter::visit;
+
+    const vector<IndexVar>& fusedVars;
+    IndexStmt consumerStmt;
+    IndexStmt producerStmt;
+    const map <IndexVar, ParallelUnit> forallParallelUnit;
+    const map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy;
+
+    CombineProducerConsumerRewriter(const vector<IndexVar>& fusedVars,
+      IndexStmt producerStmt, IndexStmt consumerStmt,
+      const map <IndexVar, ParallelUnit> forallParallelUnit,
+      const map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy)
+      : fusedVars(fusedVars), consumerStmt(consumerStmt),
+        producerStmt(producerStmt), forallParallelUnit(forallParallelUnit),
+        forallOutputRaceStrategy(forallOutputRaceStrategy) {}
+
+    // A non-fused forall is not descended into; a fused one is rebuilt.
+    void visit(const ForallNode* node) {
+      Forall loop(node);
+      IndexVar loopVar = loop.getIndexVar();
+      cout << "going through var: " << loopVar << endl;
+      const bool isFused =
+          find(fusedVars.begin(), fusedVars.end(), loopVar) != fusedVars.end();
+      if (!isFused) {
+        cout << "fused var not in  stmt\n";
+        cout << "producerStmt: " << producerStmt << endl;
+        cout << "consumerStmt: " << consumerStmt << endl;
+        stmt = where(consumerStmt, producerStmt);
+        cout << "where stmt: " << stmt << endl;
+      } else {
+        cout << "fused var in stmt\n";
+        stmt = rewrite(loop.getStmt());
+        cout << "rewritten stmt: " << stmt << endl;
+        stmt = forall(loopVar, stmt, forallParallelUnit.at(loopVar),
+                      forallOutputRaceStrategy.at(loopVar), loop.getUnrollFactor());
+      }
+      cout << "after rewrite statement: " << stmt << endl;
+    }
+  };
+
+  CombineProducerConsumerRewriter combineRewriter(fusedVars, 
+              producerStmt, consumerStmt, 
+              sortedIndexVars.forallParallelUnit, 
+              sortedIndexVars.forallOutputRaceStrategy);
+  IndexStmt combinedStmt = combineRewriter.rewrite(stmt);
+  std::cout << "\nAfter Combine rewriter\n";
+  std::cout << combinedStmt << std::endl;
+
+
+  return combinedStmt;
+  
+}
+
 
 IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
+  std::cout << "executing reorderLoopsTopologically\n";
   // Collect tensorLevelVars which stores the pairs of IndexVar and tensor
   // level that each tensor is accessed at
   struct DAGBuilder : public IndexNotationVisitor {
@@ -1382,8 +2079,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
   };
 
   Iterators iterators(stmt);
+  std::cout << "DAG builder with iterators" << std::endl;
   DAGBuilder dagBuilder(iterators);
   stmt.accept(&dagBuilder);
+  std::cout << "After DAGBuilder\n";
+  std::cout << stmt << std::endl;
 
   // Construct tensor dependencies (sorted list of IndexVars) from tensorLevelVars
   map<string, vector<pair<IndexVar, bool>>> tensorVarOrders;
@@ -1391,6 +2091,7 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
     tensorVarOrders[tensorLevelVar.first] = 
         varOrderFromTensorLevels(tensorLevelVar.second);
   }
+  // hard dependencies
   const auto hardDeps = depsFromVarOrders(tensorVarOrders);
 
   struct CollectSoftDependencies : public IndexNotationVisitor {
@@ -1412,12 +2113,17 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
       }
     }
   };
+  // soft dependencies
   CollectSoftDependencies collectSoftDeps;
   stmt.accept(&collectSoftDeps);
+  std::cout << "After CollectSoftDependencies\n";
+  std::cout << stmt << std::endl;
 
+  // topological sort
   const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, 
                                             dagBuilder.indexVarOriginalOrder);
 
+  // rewrite indexstmt
   // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall
   struct TopoReorderRewriter : public IndexNotationRewriter {
     using IndexNotationRewriter::visit;
@@ -1440,7 +2146,9 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
 
       // first forall must be in collected variables
       taco_iassert(util::contains(sortedVars, i));
+      std::cout << "\ninner body of the statement\n" << innerBody;
       stmt = innerBody;
+      // done in reverse order?
       for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) {
         stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor());
       }
@@ -1450,7 +2158,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
   };
   TopoReorderRewriter rewriter(sortedVars, dagBuilder.innerBody, 
                                dagBuilder.forallParallelUnit, dagBuilder.forallOutputRaceStrategy);
-  return rewriter.rewrite(stmt);
+  IndexStmt stmtChanged = rewriter.rewrite(stmt);
+  std::cout << "After TopoReorderRewriter\n";
+  std::cout << stmtChanged << std::endl;
+
+  return stmtChanged;
 }
 
 IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, 
@@ -1478,6 +2190,7 @@ IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph,
 
     void visit(const ForallNode* node) {
       Forall foralli(node);
+      std::cout << "scalar promote: " << foralli << std::endl;
       IndexVar i = foralli.getIndexVar();
 
       // Don't allow hoisting out of forall's for GPU warp and block reduction
diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp
index a1997a9b7..0bc848148 100644
--- a/src/ir/ir_printer.cpp
+++ b/src/ir/ir_printer.cpp
@@ -1,6 +1,7 @@
 #include <sstream>
 #include <iostream>
 
+#include "taco/cuda.h"
 #include "taco/ir/ir.h"
 #include "taco/ir/ir_printer.h"
 #include "taco/ir/simplify.h"
@@ -59,10 +60,13 @@ void IRPrinter::print(Stmt stmt) {
 }
 
 void IRPrinter::visit(const Literal* op) {
+
   if (color) {
     stream << blue ;
   }
 
+  // It seems this is where all the types get printed in the final code generation.
+  // Come up with a way to generate different values if stream2 is used to generate ispc code
   switch (op->type.getKind()) {
     case Datatype::Bool:
       stream << op->getValue<bool>();
@@ -99,11 +103,11 @@ void IRPrinter::visit(const Literal* op) {
     break;
     case Datatype::Float32:
       stream << ((op->getValue<float>() != 0.0)
-                 ? util::toString(op->getValue<float>()) : "0.0");
+                ? util::toString(op->getValue<float>()) : "0.0");
     break;
     case Datatype::Float64:
       stream << ((op->getValue<double>()!=0.0)
-                 ? util::toString(op->getValue<double>()) : "0.0");
+                ? util::toString(op->getValue<double>()) : "0.0");
     break;
     case Datatype::Complex64: {
       std::complex<float> val = op->getValue<std::complex<float>>();
@@ -123,6 +127,10 @@ void IRPrinter::visit(const Literal* op) {
   if (color) {
     stream << nc;
   }
+
+    
+
+  
 }
 
 void IRPrinter::visit(const Var* op) {
@@ -132,6 +140,7 @@ void IRPrinter::visit(const Var* op) {
   else {
     stream << op->name;
   }
+
 }
 
 void IRPrinter::visit(const Neg* op) {
@@ -283,6 +292,7 @@ void IRPrinter::visit(const IfThenElse* op) {
     stream << "}";
   }
   stream << endl;
+
 }
 
 void IRPrinter::visit(const Case* op) {
@@ -377,12 +387,13 @@ void IRPrinter::visit(const Store* op) {
   op->data.accept(this);
   stream << ";";
   stream << endl;
+
 }
 
 void IRPrinter::visit(const For* op) {
   doIndent();
   stream << keywordString("for") << " (" 
-         << keywordString(util::toString(op->var.type())) << " ";
+        << keywordString(util::toString(op->var.type())) << " ";
   op->var.accept(this);
   stream << " = ";
   op->start.accept(this);
@@ -396,7 +407,7 @@ void IRPrinter::visit(const For* op) {
 
   auto lit = op->increment.as<Literal>();
   if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
-                         (lit->type.isUInt() && lit->equalsScalar(1)))) {
+                        (lit->type.isUInt() && lit->equalsScalar(1)))) {
     stream << "++";
   }
   else {
@@ -408,7 +419,8 @@ void IRPrinter::visit(const For* op) {
   op->contents.accept(this);
   doIndent();
   stream << "}";
-  stream << endl;
+  stream << endl;    
+
 }
 
 void IRPrinter::visit(const While* op) {
@@ -452,6 +464,7 @@ void IRPrinter::visit(const Function* op) {
 
   doIndent();
   stream << "}";
+
 }
 
 void IRPrinter::visit(const VarDecl* op) {
@@ -470,6 +483,7 @@ void IRPrinter::visit(const VarDecl* op) {
   op->rhs.accept(this);
   stream << ";";
   stream << endl;
+
 }
 
 void IRPrinter::visit(const Assign* op) {
@@ -483,7 +497,7 @@ void IRPrinter::visit(const Assign* op) {
       if (add->a == op->lhs) {
         const Literal* lit = add->b.as<Literal>();
         if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
-                               (lit->type.isUInt() && lit->equalsScalar(1)))) {
+                              (lit->type.isUInt() && lit->equalsScalar(1)))) {
           stream << "++";
         }
         else {
diff --git a/src/ir/ir_rewriter.cpp b/src/ir/ir_rewriter.cpp
index eed6f2bab..2e4827497 100644
--- a/src/ir/ir_rewriter.cpp
+++ b/src/ir/ir_rewriter.cpp
@@ -292,6 +292,7 @@ void IRRewriter::visit(const Store* op) {
 }
 
 void IRRewriter::visit(const For* op) {
+  // std::cout << "This is IRRewriter::visit(const For* op) method: For: " << op << std::endl;
   Expr var       = rewrite(op->var);
   Expr start     = rewrite(op->start);
   Expr end       = rewrite(op->end);
diff --git a/src/lower/iteration_graph.cpp b/src/lower/iteration_graph.cpp
index 77735a8d2..b25f820c1 100644
--- a/src/lower/iteration_graph.cpp
+++ b/src/lower/iteration_graph.cpp
@@ -64,8 +64,9 @@ IterationGraph IterationGraph::make(Assignment assignment) {
     oldToSplitVar.insert({indexVar, indexVar});
   }
 
+  // access nodes of right hand side
   match(expr,
-    function<void(const AccessNode*)>([&](const AccessNode* op) {
+    function<void(const AccessNode*)>([&](const AccessNode* op) {      
       auto type = op->tensorVar.getType();
       taco_iassert((size_t)type.getShape().getOrder() == op->indexVars.size())
           << "Tensor access " << IndexExpr(op) << " but tensor format only has "
diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp
index b4c9ea710..e8947337d 100644
--- a/src/lower/lowerer_impl_imperative.cpp
+++ b/src/lower/lowerer_impl_imperative.cpp
@@ -421,7 +421,6 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment)
   Expr var = getTensorVar(result);
 
   const bool needComputeAssign = util::contains(needCompute, result);
-
   Expr rhs;
   if (needComputeAssign) {
     rhs = lower(assignment.getRhs());
@@ -817,7 +816,6 @@ Stmt LowererImplImperative::lowerForall(Forall forall)
                               forall.getStmt(), reducedAccesses);
   }
 //  taco_iassert(loops.defined());
-
   if (!generateComputeCode() && !hasStores(loops)) {
     // If assembly loop does not modify output arrays, then it can be safely
     // omitted.
diff --git a/src/lower/tensor_path.h b/src/lower/tensor_path.h
index 4f5dc49af..da52fb782 100644
--- a/src/lower/tensor_path.h
+++ b/src/lower/tensor_path.h
@@ -2,6 +2,7 @@
 #define TACO_TENSOR_PATH_H
 
 #include <memory>
+#include <ostream>
 #include <vector>
 
 #include "taco/util/comparable.h"
@@ -47,14 +48,13 @@ class TensorPath : public util::Comparable<TensorPath> {
 
   friend bool operator==(const TensorPath&, const TensorPath&);
   friend bool operator<(const TensorPath&, const TensorPath&);
+  friend std::ostream& operator<<(std::ostream&, const TensorPath&);
 
 private:
   struct Content;
   std::shared_ptr<Content> content;
 };
 
-std::ostream& operator<<(std::ostream&, const TensorPath&);
-
 
 /// A step along a tensor path.
 class TensorPathStep : public util::Comparable<TensorPathStep> {
diff --git a/src/tensor.cpp b/src/tensor.cpp
index fab437ff1..30a821c9d 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -10,6 +10,7 @@
 #include <utility>
 #include <mutex>
 
+#include "../test/util.h"
 #include "taco/cuda.h"
 #include "taco/format.h"
 #include "taco/taco_tensor_t.h"
@@ -278,6 +279,7 @@ static size_t unpackTensorData(const taco_tensor_t& tensorData,
 
 /// Pack coordinates into a data structure given by the tensor format.
 void TensorBase::pack() {
+  std::cout << "TensorBase::Pack() method\n";
   if (!needsPack()) {
     return;
   }
@@ -346,6 +348,7 @@ void TensorBase::pack() {
   taco_iassert((content->coordinateBufferUsed % content->coordinateSize) == 0);
   const size_t numCoordinates = content->coordinateBufferUsed / content->coordinateSize;
 
+  std::cout << "call helperFuncs\n";
   const auto helperFuncs = getHelperFunctions(getFormat(), getComponentType(),
                                               dimensions);
 
@@ -619,10 +622,12 @@ void TensorBase::compile() {
   IndexStmt stmt = makeConcreteNotation(makeReductionNotation(assignment));
   stmt = reorderLoopsTopologically(stmt);
   stmt = insertTemporaries(stmt);
+  std::cout << "calling parallelizeOuterLoop(stmt)\n";
   stmt = parallelizeOuterLoop(stmt);
   compile(stmt, content->assembleWhileCompute);
 }
 void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) {
+  std::cout << "TensorBase::compile\n";
   if (!needsCompile()) {
     return;
   }
@@ -802,6 +807,63 @@ void TensorBase::assemble() {
   }
 }
 
+void TensorBase::compute(std::ofstream& statfile, std::string& sofile) {
+  taco_uassert(!needsCompile()) << error::compute_without_compile;
+  // Variant of compute() that forwards `sofile` to callFuncPacked. The
+  // needsCompute() early return used by compute() is disabled here, so the
+  // kernel is invoked on every call. NOTE(review): `statfile` is never used.
+  setNeedsCompute(false);
+  // Sync operand tensors if needed.
+  auto operands = getTensors(getAssignment().getRhs());
+  for (auto& operand : operands) {
+    // NOTE(review): presumably brings operand values up to date — confirm.
+    operand.second.syncValues();
+    operand.second.removeDependentTensor(*this);
+  }
+
+  auto arguments = packArguments(*this);
+
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", sofile, arguments.data()), 
+      "\nkernel execution time: ", timevalue);
+  // NOTE(review): `time`/`timevalue` have no other visible use; presumably read by the macro — confirm.
+
+  if (content->assembleWhileCompute) {
+    setNeedsAssemble(false);
+    taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]);
+    content->valuesSize = unpackTensorData(*tensorData, *this);
+  }
+}
+
+void TensorBase::compute(std::ofstream& statfile) {
+  taco_uassert(!needsCompile()) << error::compute_without_compile;
+  // Variant of compute() that runs the kernel through TOOL_BENCHMARK_TIMER2.
+  // The needsCompute() early return used by compute() is disabled here, so
+  // the kernel is invoked on every call. NOTE(review): `statfile` is never used.
+  setNeedsCompute(false);
+  // Sync operand tensors if needed.
+  auto operands = getTensors(getAssignment().getRhs());
+  for (auto& operand : operands) {
+    operand.second.syncValues();
+    operand.second.removeDependentTensor(*this);
+  }
+
+  auto arguments = packArguments(*this);
+
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", arguments.data()), 
+      "\nkernel execution time: ", timevalue);
+  // NOTE(review): `time`/`timevalue` have no other visible use; presumably read by the macro — confirm.
+
+  if (content->assembleWhileCompute) {
+    setNeedsAssemble(false);
+    taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]);
+    content->valuesSize = unpackTensorData(*tensorData, *this);
+  }
+}
+
 void TensorBase::compute() {
   taco_uassert(!needsCompile()) << error::compute_without_compile;
   if (!needsCompute()) {
@@ -816,7 +878,9 @@ void TensorBase::compute() {
   }
 
   auto arguments = packArguments(*this);
+  std::cout << "running the compute function from the shared library\n";
   this->content->module->callFuncPacked("compute", arguments.data());
+  std::cout << "compute function executed\n";
 
   if (content->assembleWhileCompute) {
     setNeedsAssemble(false);
@@ -951,6 +1015,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype,
     }
 
     // Lower packing and iterator code.
+    std::cout << "1 Lower packing and iterator code\n";
     helperModule->addFunction(lower(packStmt, "pack", true, true));
     helperModule->addFunction(lower(iterateStmt, "iterate", false, true));
   } else {
@@ -964,12 +1029,14 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype,
     IndexVar indexVar;
     IndexStmt assignment = (packedScalar() = bufferVector(indexVar));
     IndexStmt packStmt= makeConcreteNotation(makeReductionNotation(assignment));
+    std::cout << "2 Lower packing and iterator code\n";
     helperModule->addFunction(lower(packStmt, "pack", true, true));
 
     // Define and lower iterator code.
     IndexStmt iterateStmt = Yield({}, packedScalar());
     helperModule->addFunction(lower(iterateStmt, "iterate", false, true));
   }
+  std::cout << "Compiling the helperModule\n";
   helperModule->compile();
 
   helperFunctionsMutex.lock();
diff --git a/taco-uml.wsd b/taco-uml.wsd
new file mode 100644
index 000000000..4b8e39802
--- /dev/null
+++ b/taco-uml.wsd
@@ -0,0 +1,411 @@
+@startuml taco
+scale 1
+
+
+class IntrusivePtr {
+    +T *ptr
+}
+class Uncopyable {}
+
+class IRNode {
+    +virtual void accept(IRVisitorStrict *v) const = 0
+    +virtual IRNodeType type_info() const = 0;
+}
+
+class BaseStmtNode {}
+class BaseExprNode {
+    +Datatype type
+}
+
+class StmtNode {
+    +void accept(IRVisitorStrict *v) const
+}
+class ExprNode {
+    +void accept(IRVisitorStrict *v) const
+}
+
+Uncopyable <|-- IRNode
+IRNode <|-- BaseStmtNode
+IRNode <|-- BaseExprNode
+BaseStmtNode <|-- StmtNode
+BaseExprNode <|-- ExprNode
+
+class IRHandle {
+    +void accept(IRVisitorStrict *v) const
+}
+class Expr {}
+class Stmt {}
+
+IntrusivePtr <|-- IRHandle
+IRHandle <|-- Expr
+IRHandle <|-- Stmt
+
+IRHandle "1" *-- "1" IRNode : contains
+
+
+
+' this class is abstract but plantuml version does not support interface keyword
+interface IRVisitorStrict {
+    +virtual void visit(const IRNode*) const = 0
+}
+
+/'
+IRVisitor is neither an interface nor an abstract class because it
+has no pure virtual methods.
+'/
+class IRVisitor {
+    +virtual void visit(const IRNode*)
+}
+
+class IRRewriter {
+    ' protected fields and methods
+    #Expr expr 
+    #Stmt stmt
+
+    #virtual void visit(const ExprNode* op)
+    #virtual void visit(const StmtNode* op)
+
+    ' public fields and methods
+    +Expr rewrite(Expr)
+    +Stmt rewrite(Stmt)
+}
+class IRPrinter {
+    #std::ostream &stream
+    #std::ostream &stream2
+    #int indent
+    #bool color
+    #bool simplify
+    #enum Precedence
+    #Precedence parentPrecedence = BOTTOM
+    #NameGenerator varNameGenerator
+    #scopedMap<Expr, std::string> varNames
+
+    #void doIndent()
+    #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)
+    #void fewMoreMethods()
+    
+    #virtual void visit(const ExprNode*)
+    #virtual void visit(const StmtNode*)
+
+    +setColor(bool color)
+    +print(Stmt)
+}
+class IRVerifier {}
+
+IRVisitorStrict <|-- IRVisitor
+IRVisitorStrict <|-- IRPrinter
+IRVisitorStrict <|-- IRRewriter
+IRVisitor <|-- IRVerifier
+
+' Inheritance from IRRewriter
+' simplifier for ir::Expr
+class ExpressionSimplifier {}
+IRRewriter <|-- ExpressionSimplifier
+
+' simplifiers for ir::Stmt
+class RemoveRedundantStatements {}
+class RemoveRedundantLoops {}
+class RemoveDuplicateBody {}
+
+IRRewriter <|-- RemoveRedundantStatements
+IRRewriter <|-- RemoveRedundantLoops
+IRRewriter <|-- RemoveDuplicateBody
+
+
+' Inheritance from IRPrinter
+class CodeGen {}
+class CodeGen_C {}
+class CodeGen_CUDA {}
+class CodeGen_ISPC {
+    -class FindVars
+}
+
+class FindVars {}
+
+IRPrinter <|-- CodeGen
+CodeGen <|-- CodeGen_C
+CodeGen <|-- CodeGen_ISPC
+CodeGen <|-- CodeGen_CUDA
+
+IRVisitor <|-- FindVars
+CodeGen_ISPC +-- FindVars
+
+class Manageable {}
+class IndexStmtNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+class IndexExprNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+
+
+Manageable <|-- IndexStmtNode
+Uncopyable <|-- IndexStmtNode
+Manageable <|-- IndexExprNode
+Uncopyable <|-- IndexExprNode
+
+class IndexStmt {}
+class IndexExpr {}
+
+IntrusivePtr <|-- IndexStmt
+IndexStmt "1" *-- "1" IndexStmtNode
+IntrusivePtr <|-- IndexExpr
+IndexExpr "1" *-- "1" IndexExprNode
+
+
+abstract class IndexExprVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AccessNode*) = 0
+    +virtual void visit(const LiteralNode*) = 0
+    +virtual void visit(const NegNode*) = 0
+    +virtual void visit(const AddNode*) = 0
+    +virtual void visit(const SubNode*) = 0
+    +virtual void visit(const MulNode*) = 0
+    +virtual void visit(const DivNode*) = 0
+    +virtual void visit(const SqrtNode*) = 0
+    +virtual void visit(const CastNode*) = 0
+    +virtual void visit(const CallIntrinsicNode*) = 0
+    +virtual void visit(const ReductionNode*) = 0
+}
+abstract class IndexStmtVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AssignmentNode*) = 0
+    +virtual void visit(const YieldNode*) = 0
+    +virtual void visit(const ForallNode*) = 0
+    +virtual void visit(const WhereNode*) = 0
+    +virtual void visit(const SequenceNode*) = 0
+    +virtual void visit(const AssembleNode*) = 0
+    +virtual void visit(const MultiNode*) = 0
+    +virtual void visit(const SuchThatNode*) = 0
+}
+
+abstract class IndexNotationVisitorStrict {}
+class IndexNotationPrinter {
+    +void print(const IndexExpr& expr)
+    +void print(const IndexStmt& expr)
+
+    ' Index Expressions visit()
+    +void visit(const AccessNode* node)
+    +void visit(const LiteralNode* node)
+    + void visit(const NegNode* node)
+    + void visit(const AddNode* node)
+    + void visit(const SubNode* node)
+    + void visit(const MulNode* node)
+    + void visit(const DivNode* node)
+    + void visit(const SqrtNode* node)
+    + void visit(const CastNode* node)
+    + void visit(const CallIntrinsicNode* node)
+    + void visit(const UnaryExprNode* node)
+    + void visit(const BinaryExprNode* node)
+    + void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    + void visit(const AssignmentNode* node)
+    + void visit(const YieldNode* node)
+    + void visit(const ForallNode* node)
+    + void visit(const WhereNode* node)
+    + void visit(const SequenceNode* node)
+    + void visit(const AssembleNode* node)
+    + void visit(const MultiNode* node)
+    + void visit(const SuchThatNode* node)
+}
+class IndexNotationVisitor {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+class Matcher {
+
+}
+
+abstract class IndexExprRewriterStrict {
+    +IndexExpr rewrite(IndexExpr)
+
+    #IndexExpr expr
+
+    #virtual void visit(const AccessNode* op) = 0
+    #virtual void visit(const LiteralNode* op) = 0
+    #virtual void visit(const NegNode* op) = 0
+    #virtual void visit(const SqrtNode* op) = 0
+    #virtual void visit(const AddNode* op) = 0
+    #virtual void visit(const SubNode* op) = 0
+    #virtual void visit(const MulNode* op) = 0
+    #virtual void visit(const DivNode* op) = 0
+    #virtual void visit(const CastNode* op) = 0
+    #virtual void visit(const CallIntrinsicNode* op) = 0
+    #virtual void visit(const ReductionNode* op) = 0
+}
+abstract class IndexStmtRewriterStrict {
+    +IndexStmt rewrite(IndexStmt)
+
+    #IndexStmt stmt
+
+    #virtual void visit(const AssignmentNode* op) = 0
+    #virtual void visit(const YieldNode* op) = 0
+    #virtual void visit(const ForallNode* op) = 0
+    #virtual void visit(const WhereNode* op) = 0
+    #virtual void visit(const SequenceNode* op) = 0
+    #virtual void visit(const AssembleNode* op) = 0
+    #virtual void visit(const MultiNode* op) = 0
+    #virtual void visit(const SuchThatNode* op) = 0
+}
+abstract class IndexNotationRewriterStrict {}
+class IndexNotationRewriter {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+
+
+IndexExprVisitorStrict <|-- IndexNotationVisitorStrict
+IndexStmtVisitorStrict <|-- IndexNotationVisitorStrict
+IndexNotationVisitorStrict <|-- IndexNotationVisitor
+IndexNotationVisitorStrict <|-- IndexNotationPrinter
+IndexNotationVisitor <|-- Matcher
+
+IndexExprVisitorStrict <|-- IndexExprRewriterStrict
+IndexStmtVisitorStrict <|-- IndexStmtRewriterStrict
+IndexExprRewriterStrict <|-- IndexNotationRewriterStrict
+IndexStmtRewriterStrict <|-- IndexNotationRewriterStrict
+
+IndexNotationRewriterStrict <|-- IndexNotationRewriter
+
+' - private
+' # protected
+' ~ package private
+' + public
+
+' {static}
+' {abstract} virtual methods
+
+' lowering part -- conversion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt
+class Lowerer {
+    +std::shared_ptr<LowererImpl> impl;
+}
+abstract class LowererImpl {
+    ' protected fields and methods
+    #class Visitor;
+    #friend class Visitor;
+    #std::shared_ptr<Visitor> visitor;
+
+    #virtual ir::Stmt lower(IndexStmt stmt);
+    #virtual ir::Expr lower(IndexExpr expr);
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr) = 0;
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;
+
+    ' public fields and methods
+    +virtual ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack) = 0;
+}
+
+class LowererImplImperative {
+    ' private fields and methods
+    -class Visitor
+    -friend class Visitor
+    -std::shared_ptr<Visitor> visitor
+    -bool assemble
+    -bool compute
+    -vars a_bunch_of_other_fields
+
+    ' protected fields and methods
+    #virtual ir::Expr lowerExpr(IndexExpr expr);
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt);
+
+    ' public fields and methods
+    +ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack)
+
+}
+note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n  return visitor->lower(stmt);\n}
+
+Uncopyable <|-- LowererImpl
+Lowerer "1" *-- "1" LowererImpl : contains
+
+
+' visitor that does the lowering
+class Visitor {
+    ' private fields and methods
+    -LowererImpl* impl
+    -Expr expr
+    -Stmt stmt
+
+    -void visit(const AssignmentNode* node)
+    -void visit(const YieldNode* node)
+    -void visit(const ForallNode* node) 
+    -void visit(const WhereNode* node) 
+    -void visit(const MultiNode* node) 
+    -void visit(const SuchThatNode* node) 
+    -void visit(const SequenceNode* node) 
+    -void visit(const AssembleNode* node) 
+    -void visit(const AccessNode* node) 
+    -void visit(const LiteralNode* node) 
+    -void visit(const NegNode* node) 
+    -void visit(const AddNode* node) 
+    -void visit(const SubNode* node) 
+    -void visit(const MulNode* node) 
+    -void visit(const DivNode* node) 
+    -void visit(const SqrtNode* node) 
+    -void visit(const CastNode* node) 
+    -void visit(const CallIntrinsicNode* node) 
+    -void visit(const ReductionNode* node) 
+
+    ' public fields and methods
+    +Visitor(LowererImplImperative* impl)
+    +Stmt lower(IndexStmt stmt)
+    +Expr lower(IndexExpr expr)
+}
+
+note bottom of Visitor:   Stmt lower(IndexStmt stmt) {\n  this->stmt = Stmt();\n  impl->accessibleIterators.scope();\n  IndexStmtVisitorStrict::visit(stmt);\n  impl->accessibleIterators.unscope();\n  return this->stmt;\n}
+
+IndexNotationVisitorStrict <|-- Visitor
+LowererImpl "1" +-- "1" Visitor : contains
+Visitor "1" *-- "1" LowererImpl : contains
+
+LowererImpl <|-- LowererImplImperative
+LowererImplImperative "1" +-- "1" Visitor : contains
+Visitor "1" *-- "1" LowererImplImperative : contains
+
+@enduml
\ No newline at end of file
diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.c b/test/kernels/mttkrp_gemm/mttkrp_ryan.c
new file mode 100644
index 000000000..9d0536b8c
--- /dev/null
+++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.c
@@ -0,0 +1,177 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way compare of the ints behind a and b: negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // plain subtraction can overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // searches array[arrayStart..arrayEnd] inclusive; assumes ascending order (the bound invariants below rely on it)
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // midpoint without the int overflow of (upperBound + lowerBound)
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact hit
+    }
+  }
+  return upperBound; // no exact hit: index just above the last element seen to be < target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // searches array[arrayStart..arrayEnd] inclusive; assumes ascending order (the bound invariants below rely on it)
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // midpoint without the int overflow of (upperBound + lowerBound)
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact hit
+    }
+  }
+  return lowerBound; // no exact hit: index just below the first element seen to be > target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a tensor descriptor; the three per-mode input arrays are copied, not retained
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals / vals_size start as NULL / 0
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // zeroed: an unhandled mode type leaves NULL, which deinit_taco_tensor_t may free()
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially NULL
+        break;
+    }
+  }
+  return t; // NOTE(review): allocation failures are still not checked
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its per-mode tables; t->vals and the index arrays themselves are not freed here
+  for (int32_t mode = 0; mode < t->order; ++mode) {
+    free(t->indices[mode]);
+  }
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { // allocates A1845's value buffer; the other three tensors are not read; always returns 0
+  int A18451_dimension = (int)(A1845->dimensions[0]);
+  int A18452_dimension = (int)(A1845->dimensions[1]);
+  double* restrict A1845_vals = (double*)(A1845->vals);
+
+  A1845_vals = (double*)malloc(sizeof(double) * (size_t)A18451_dimension * (size_t)A18452_dimension); // widen before multiplying so the element count cannot overflow int
+
+  A1845->vals = (uint8_t*)A1845_vals; // buffer is uninitialized and the malloc result is not checked
+  return 0;
+}
+
+int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { // fills A1845's value buffer from matmul_5_5_5 (three pos/crd levels) and the value buffers of A1475 and A1416; always returns 0
+  int A18451_dimension = (int)(A1845->dimensions[0]);
+  int A18452_dimension = (int)(A1845->dimensions[1]);
+  double* restrict A1845_vals = (double*)(A1845->vals);
+  int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]); // for each level k: indices[k][0] is read as the pos array, indices[k][1] as the crd array
+  int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]);
+  int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]);
+  int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]);
+  int* restrict matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]);
+  int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]);
+  double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals);
+  int A14751_dimension = (int)(A1475->dimensions[0]); // not referenced below
+  int A14752_dimension = (int)(A1475->dimensions[1]);
+  double* restrict A1475_vals = (double*)(A1475->vals);
+  int A14161_dimension = (int)(A1416->dimensions[0]); // not referenced below
+  int A14162_dimension = (int)(A1416->dimensions[1]);
+  double* restrict A1416_vals = (double*)(A1416->vals);
+
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1845 = 0; pA1845 < (A18451_dimension * A18452_dimension); pA1845++) { // zero every output element first
+    A1845_vals[pA1845] = 0.0;
+  }
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) { // walk the stored positions of level 1
+    int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5]; // coordinate at this position; used as the output row
+    for (int32_t i1545 = 0; i1545 < A14162_dimension; i1545++) { // output column, bounded by A1416's second dimension
+      int32_t i1545A1845 = i1542 * A18452_dimension + i1545; // flat offset into A1845_vals
+      double ti1543A1845_val = 0.0; // accumulator for this output element
+      for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) { // level-2 positions under the current level-1 position
+        int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5];
+        int32_t i1545A1416 = i1543 * A14162_dimension + i1545; // flat offset into A1416_vals
+        for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) { // level-3 positions under the current level-2 position
+          int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5];
+          int32_t i1545A1475 = i1544 * A14752_dimension + i1545; // flat offset into A1475_vals
+          ti1543A1845_val += (matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416];
+        }
+      }
+      A1845_vals[i1545A1845] = ti1543A1845_val; // overwrite (not add to) the output element
+    }
+  }
+  return 0;
+}
+#include "mttkrp_ryan.h" // sibling header with the matching 4-argument assemble/compute prototypes
+int _shim_assemble(void** parameterPack) { // forwards parameterPack[0..3], cast to tensor pointers, to assemble()
+  return assemble((taco_tensor_t*)parameterPack[0], (taco_tensor_t*)parameterPack[1], (taco_tensor_t*)parameterPack[2], (taco_tensor_t*)parameterPack[3]);
+}
+int _shim_compute(void** parameterPack) { // forwards parameterPack[0..3], cast to tensor pointers, to compute()
+  return compute((taco_tensor_t*)parameterPack[0], (taco_tensor_t*)parameterPack[1], (taco_tensor_t*)parameterPack[2], (taco_tensor_t*)parameterPack[3]);
+}
diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.h b/test/kernels/mttkrp_gemm/mttkrp_ryan.h
new file mode 100644
index 000000000..3d0c06f50
--- /dev/null
+++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way compare of the ints behind a and b: negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // plain subtraction can overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // searches array[arrayStart..arrayEnd] inclusive; assumes ascending order (the bound invariants below rely on it)
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // midpoint without the int overflow of (upperBound + lowerBound)
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact hit
+    }
+  }
+  return upperBound; // no exact hit: index just above the last element seen to be < target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // searches array[arrayStart..arrayEnd] inclusive; assumes ascending order (the bound invariants below rely on it)
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // midpoint without the int overflow of (upperBound + lowerBound)
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact hit
+    }
+  }
+  return lowerBound; // no exact hit: index just below the first element seen to be > target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a tensor descriptor; the three per-mode input arrays are copied, not retained
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals / vals_size start as NULL / 0
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // zeroed: an unhandled mode type leaves NULL, which deinit_taco_tensor_t may free()
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially NULL
+        break;
+    }
+  }
+  return t; // NOTE(review): allocation failures are still not checked
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its per-mode tables; t->vals and the index arrays themselves are not freed here
+  for (int32_t mode = 0; mode < t->order; ++mode) {
+    free(t->indices[mode]);
+  }
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416);
+#endif
diff --git a/test/kernels/mttkrp_gemm/taco_default.c b/test/kernels/mttkrp_gemm/taco_default.c
new file mode 100644
index 000000000..edf8cdb16
--- /dev/null
+++ b/test/kernels/mttkrp_gemm/taco_default.c
@@ -0,0 +1,183 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way compare of the ints behind a and b: negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // plain subtraction can overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // searches array[arrayStart..arrayEnd] inclusive; assumes ascending order (the bound invariants below rely on it)
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // midpoint without the int overflow of (upperBound + lowerBound)
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact hit
+    }
+  }
+  return upperBound; // no exact hit: index just above the last element seen to be < target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // searches array[arrayStart..arrayEnd] inclusive; assumes ascending order (the bound invariants below rely on it)
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // midpoint without the int overflow of (upperBound + lowerBound)
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact hit
+    }
+  }
+  return lowerBound; // no exact hit: index just below the first element seen to be > target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a tensor descriptor; the three per-mode input arrays are copied, not retained
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals / vals_size start as NULL / 0
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // zeroed: an unhandled mode type leaves NULL, which deinit_taco_tensor_t may free()
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially NULL
+        break;
+    }
+  }
+  return t; // NOTE(review): allocation failures are still not checked
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its per-mode tables; t->vals and the index arrays themselves are not freed here
+  for (int32_t mode = 0; mode < t->order; ++mode) {
+    free(t->indices[mode]);
+  }
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { // allocates A1538's value buffer; the other four tensors are not read; always returns 0
+  int A15381_dimension = (int)(A1538->dimensions[0]);
+  int A15382_dimension = (int)(A1538->dimensions[1]);
+  double* restrict A1538_vals = (double*)(A1538->vals);
+
+  A1538_vals = (double*)malloc(sizeof(double) * (size_t)A15381_dimension * (size_t)A15382_dimension); // widen before multiplying so the element count cannot overflow int
+
+  A1538->vals = (uint8_t*)A1538_vals; // buffer is uninitialized and the malloc result is not checked
+  return 0;
+}
+
+int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { // fills A1538's value buffer from matmul_5_5_5 (three pos/crd levels) and the value buffers of A1475, A1416 and A1479; always returns 0
+  int A15381_dimension = (int)(A1538->dimensions[0]);
+  int A15382_dimension = (int)(A1538->dimensions[1]);
+  double* restrict A1538_vals = (double*)(A1538->vals);
+  int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]); // for each level k: indices[k][0] is read as the pos array, indices[k][1] as the crd array
+  int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]);
+  int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]);
+  int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]);
+  int* restrict matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]);
+  int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]);
+  double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals);
+  int A14751_dimension = (int)(A1475->dimensions[0]); // not referenced below
+  int A14752_dimension = (int)(A1475->dimensions[1]);
+  double* restrict A1475_vals = (double*)(A1475->vals);
+  int A14161_dimension = (int)(A1416->dimensions[0]); // not referenced below
+  int A14162_dimension = (int)(A1416->dimensions[1]);
+  double* restrict A1416_vals = (double*)(A1416->vals);
+  int A14791_dimension = (int)(A1479->dimensions[0]);
+  int A14792_dimension = (int)(A1479->dimensions[1]);
+  double* restrict A1479_vals = (double*)(A1479->vals);
+
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1538 = 0; pA1538 < (A15381_dimension * A15382_dimension); pA1538++) { // zero every output element first
+    A1538_vals[pA1538] = 0.0;
+  }
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) { // walk the stored positions of level 1
+    int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5]; // coordinate at this position; used as the output row
+    for (int32_t i1546 = 0; i1546 < A14792_dimension; i1546++) { // output column, bounded by A1479's second dimension
+      int32_t i1546A1538 = i1542 * A15382_dimension + i1546; // flat offset into A1538_vals
+      double ti1543A1538_val = 0.0; // accumulator for this output element
+      for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) { // level-2 positions under the current level-1 position
+        int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5];
+        for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) { // level-3 positions under the current level-2 position
+          int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5];
+          for (int32_t i1545 = 0; i1545 < A14791_dimension; i1545++) { // dense inner index, bounded by A1479's first dimension
+            int32_t i1545A1475 = i1544 * A14752_dimension + i1545; // flat offset into A1475_vals
+            int32_t i1545A1416 = i1543 * A14162_dimension + i1545; // flat offset into A1416_vals
+            int32_t i1546A1479 = i1545 * A14792_dimension + i1546; // flat offset into A1479_vals
+            ti1543A1538_val += ((matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416]) * A1479_vals[i1546A1479];
+          }
+        }
+      }
+      A1538_vals[i1546A1538] = ti1543A1538_val; // overwrite (not add to) the output element
+    }
+  }
+  return 0;
+}
+#include "taco_default.h" // sibling header; resolved relative to this file instead of one developer's home directory
+int _shim_assemble(void** parameterPack) { // forwards parameterPack[0..4], cast to tensor pointers, to assemble()
+  return assemble((taco_tensor_t*)parameterPack[0], (taco_tensor_t*)parameterPack[1], (taco_tensor_t*)parameterPack[2], (taco_tensor_t*)parameterPack[3], (taco_tensor_t*)parameterPack[4]);
+}
+int _shim_compute(void** parameterPack) { // forwards parameterPack[0..4], cast to tensor pointers, to compute()
+  return compute((taco_tensor_t*)parameterPack[0], (taco_tensor_t*)parameterPack[1], (taco_tensor_t*)parameterPack[2], (taco_tensor_t*)parameterPack[3], (taco_tensor_t*)parameterPack[4]);
+}
diff --git a/test/kernels/mttkrp_gemm/taco_default.h b/test/kernels/mttkrp_gemm/taco_default.h
new file mode 100644
index 000000000..54274569e
--- /dev/null
+++ b/test/kernels/mttkrp_gemm/taco_default.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) { // three-way compare of the ints behind a and b: negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // plain subtraction can overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // searches array[arrayStart..arrayEnd] inclusive; assumes ascending order (the bound invariants below rely on it)
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // midpoint without the int overflow of (upperBound + lowerBound)
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact hit
+    }
+  }
+  return upperBound; // no exact hit: index just above the last element seen to be < target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // searches array[arrayStart..arrayEnd] inclusive; assumes ascending order (the bound invariants below rely on it)
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // midpoint without the int overflow of (upperBound + lowerBound)
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact hit
+    }
+  }
+  return lowerBound; // no exact hit: index just below the first element seen to be > target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) { // builds a tensor descriptor; the three per-mode input arrays are copied, not retained
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t)); // zeroed so vals / vals_size start as NULL / 0
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **)); // zeroed: an unhandled mode type leaves NULL, which deinit_taco_tensor_t may free()
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *)); // one index-array slot, initially NULL
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *)); // two index-array slots, initially NULL
+        break;
+    }
+  }
+  return t; // NOTE(review): allocation failures are still not checked
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its per-mode tables; t->vals and the index arrays themselves are not freed here
+  for (int32_t mode = 0; mode < t->order; ++mode) {
+    free(t->indices[mode]);
+  }
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479);
+#endif
diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c
new file mode 100644
index 000000000..a5e031e7a
--- /dev/null
+++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c
@@ -0,0 +1,199 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // serial fallback (compiled only when !_OPENMP): thread id is always 0
+int omp_get_max_threads() { return 1; } // serial fallback (compiled only when !_OPENMP): a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of two ints (qsort/bsearch style): <0, 0 or >0
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // a - b could overflow
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target. Returns arrayStart if its
+  // element is already >= target, the index of a probed element equal to target,
+  // or otherwise the upper end of the final bracket.
+  if (target <= array[arrayStart]) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // element here is < target
+  int hi = arrayEnd;   // treated as >= target
+  while (hi - lo > 1) {
+    const int probe = (hi + lo) / 2;
+    const int seen = array[probe];
+    if (seen == target) return probe;
+    if (seen < target) {
+      lo = probe;
+    } else {
+      hi = probe;
+    }
+  }
+  return hi;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target. Returns arrayEnd if its
+  // element is already <= target, the index of a probed element equal to target,
+  // or otherwise the lower end of the final bracket.
+  if (target >= array[arrayEnd]) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as <= target
+  int hi = arrayEnd;   // element here is > target
+  while (hi - lo > 1) {
+    const int probe = (hi + lo) / 2;
+    const int seen = array[probe];
+    if (seen == target) return probe;
+    if (seen > target) {
+      hi = probe;
+    } else {
+      lo = probe;
+    }
+  }
+  return lo;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a tensor descriptor holding copies of the per-mode metadata. The struct
+  // and every index slot are zero-initialised (calloc), so vals/vals_size start as
+  // NULL/0 and each indices[i][k] is NULL until storage is attached.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    if (mode_types[i] == taco_mode_dense) {          // one index slot
+      t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));
+    } else if (mode_types[i] == taco_mode_sparse) {  // two index slots
+      t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Frees the per-mode slot arrays, the metadata arrays and the struct itself;
+  // the buffers stored in indices[i][k] and t->vals are not released here.
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) {
+  // Builds the index arrays of A2531's second mode by copying cage3's coordinates
+  // row by row, then allocates (without initialising) one double per stored
+  // coordinate for A2531->vals. A1451 is not used. Always returns 0.
+  int* restrict cage32_pos = (int*)(cage3->indices[1][0]);
+  int* restrict cage32_crd = (int*)(cage3->indices[1][1]);
+  int A13921_dimension = (int)(A1392->dimensions[0]);
+  int32_t A25312_pos_size = A13921_dimension + 1; // one entry per row plus one; was hard-coded to 6
+  int* restrict A25312_pos = (int32_t*)malloc(sizeof(int32_t) * A25312_pos_size);
+  A25312_pos[0] = 0;
+  for (int32_t pA25312 = 1; pA25312 < A25312_pos_size; pA25312++) {
+    A25312_pos[pA25312] = 0;
+  }
+  int32_t A25312_crd_size = 1048576;
+  int* restrict A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size);
+  int32_t i1468A2531 = 0;
+
+  for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) {
+    int32_t pA25312_begin = i1468A2531;
+
+    for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) {
+      int32_t i1468 = cage32_crd[i1468cage3];
+      if (A25312_crd_size <= i1468A2531) {
+        A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2));
+        A25312_crd_size *= 2;
+      }
+      A25312_crd[i1468A2531] = i1468;
+      i1468A2531++;
+    }
+    // store this row's entry count; converted to offsets by the prefix sum below
+    A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin;
+  }
+  // prefix-sum the per-row counts so pos[r]..pos[r + 1] brackets row r's coordinates
+  int32_t csA25312 = 0;
+  for (int32_t pA253120 = 1; pA253120 < A25312_pos_size; pA253120++) {
+    csA25312 += A25312_pos[pA253120];
+    A25312_pos[pA253120] = csA25312;
+  }
+
+  double* restrict A2531_vals = (double*)malloc(sizeof(double) * i1468A2531);
+
+  A2531->indices[1][0] = (uint8_t*)(A25312_pos);
+  A2531->indices[1][1] = (uint8_t*)(A25312_crd);
+  A2531->vals = (uint8_t*)A2531_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) {
+  double* restrict A2531_vals = (double*)(A2531->vals);
+  int* restrict cage32_pos = (int*)(cage3->indices[1][0]);
+  int* restrict cage32_crd = (int*)(cage3->indices[1][1]);
+  double* restrict cage3_vals = (double*)(cage3->vals);
+  int A13921_dimension = (int)(A1392->dimensions[0]);
+  int A13922_dimension = (int)(A1392->dimensions[1]);
+  double* restrict A1392_vals = (double*)(A1392->vals);
+  int A14512_dimension = (int)(A1451->dimensions[1]);
+  double* restrict A1451_vals = (double*)(A1451->vals);
+  // For each stored position p of cage3 (row i1467, coordinate i1468), stores in
+  // A2531_vals[p] the sum over i1469 < A14512_dimension of
+  // cage3_vals[p] * A1392_vals[i1467 * A13922_dimension + i1469] * A1451_vals[i1468 * A14512_dimension + i1469].
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) {
+    for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) {
+      int32_t i1468 = cage32_crd[i1468cage3];
+      double ti1469A2531_val = 0.0;
+      for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) {
+        int32_t i1469A1392 = i1467 * A13922_dimension + i1469;
+        int32_t i1469A1451 = i1468 * A14512_dimension + i1469;
+        ti1469A2531_val += (cage3_vals[i1468cage3] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451];
+      }
+      A2531_vals[i1468cage3] = ti1469A2531_val;
+      // the output is indexed by cage3's position, so no separate output counter is advanced
+    }
+  }
+  return 0;
+}
+#include "csr_dense_dense_sddmm.h" // sibling header, resolved relative to this file's directory
+int _shim_assemble(void** parameterPack) { // forwards the four packed tensor pointers to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // forwards the four packed tensor pointers to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h
new file mode 100644
index 000000000..a9d6b760d
--- /dev/null
+++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // serial fallback (compiled only when !_OPENMP): thread id is always 0
+int omp_get_max_threads() { return 1; } // serial fallback (compiled only when !_OPENMP): a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of two ints (qsort/bsearch style): <0, 0 or >0
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // a - b could overflow
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target. Returns arrayStart if its
+  // element is already >= target, the index of a probed element equal to target,
+  // or otherwise the upper end of the final bracket.
+  if (target <= array[arrayStart]) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // element here is < target
+  int hi = arrayEnd;   // treated as >= target
+  while (hi - lo > 1) {
+    const int probe = (hi + lo) / 2;
+    const int seen = array[probe];
+    if (seen == target) return probe;
+    if (seen < target) {
+      lo = probe;
+    } else {
+      hi = probe;
+    }
+  }
+  return hi;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target. Returns arrayEnd if its
+  // element is already <= target, the index of a probed element equal to target,
+  // or otherwise the lower end of the final bracket.
+  if (target >= array[arrayEnd]) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as <= target
+  int hi = arrayEnd;   // element here is > target
+  while (hi - lo > 1) {
+    const int probe = (hi + lo) / 2;
+    const int seen = array[probe];
+    if (seen == target) return probe;
+    if (seen > target) {
+      hi = probe;
+    } else {
+      lo = probe;
+    }
+  }
+  return lo;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a tensor descriptor holding copies of the per-mode metadata. The struct
+  // and every index slot are zero-initialised (calloc), so vals/vals_size start as
+  // NULL/0 and each indices[i][k] is NULL until storage is attached.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    if (mode_types[i] == taco_mode_dense) {          // one index slot
+      t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));
+    } else if (mode_types[i] == taco_mode_sparse) {  // two index slots
+      t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Frees the per-mode slot arrays, the metadata arrays and the struct itself;
+  // the buffers stored in indices[i][k] and t->vals are not released here.
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451);
+#endif
diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so
new file mode 100755
index 000000000..c2c5ca30e
Binary files /dev/null and b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so differ
diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.c b/test/kernels/sddmm_spmm/csr_dense_spmm.c
new file mode 100644
index 000000000..7f710f6c1
--- /dev/null
+++ b/test/kernels/sddmm_spmm/csr_dense_spmm.c
@@ -0,0 +1,190 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // serial fallback (compiled only when !_OPENMP): thread id is always 0
+int omp_get_max_threads() { return 1; } // serial fallback (compiled only when !_OPENMP): a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of two ints (qsort/bsearch style): <0, 0 or >0
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // a - b could overflow
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target. Returns arrayStart if its
+  // element is already >= target, the index of a probed element equal to target,
+  // or otherwise the upper end of the final bracket.
+  if (target <= array[arrayStart]) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // element here is < target
+  int hi = arrayEnd;   // treated as >= target
+  while (hi - lo > 1) {
+    const int probe = (hi + lo) / 2;
+    const int seen = array[probe];
+    if (seen == target) return probe;
+    if (seen < target) {
+      lo = probe;
+    } else {
+      hi = probe;
+    }
+  }
+  return hi;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target. Returns arrayEnd if its
+  // element is already <= target, the index of a probed element equal to target,
+  // or otherwise the lower end of the final bracket.
+  if (target >= array[arrayEnd]) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as <= target
+  int hi = arrayEnd;   // element here is > target
+  while (hi - lo > 1) {
+    const int probe = (hi + lo) / 2;
+    const int seen = array[probe];
+    if (seen == target) return probe;
+    if (seen > target) {
+      hi = probe;
+    } else {
+      lo = probe;
+    }
+  }
+  return lo;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a tensor descriptor holding copies of the per-mode metadata. The struct
+  // and every index slot are zero-initialised (calloc), so vals/vals_size start as
+  // NULL/0 and each indices[i][k] is NULL until storage is attached.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    if (mode_types[i] == taco_mode_dense) {          // one index slot
+      t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));
+    } else if (mode_types[i] == taco_mode_sparse) {  // two index slots
+      t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Frees the per-mode slot arrays, the metadata arrays and the struct itself;
+  // the buffers stored in indices[i][k] and t->vals are not released here.
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455) {
+  // Allocates A2535's dense value buffer (uninitialised) with dimensions[0] * dimensions[1]
+  // doubles, the extent compute() writes, instead of assuming 5 rows. Always returns 0.
+  int A25351_dimension = (int)(A2535->dimensions[0]);
+  int A25352_dimension = (int)(A2535->dimensions[1]);
+  double* restrict A2535_vals = (double*)malloc(sizeof(double) * (size_t)A25351_dimension * (size_t)A25352_dimension);
+  A2535->vals = (uint8_t*)A2535_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *C, taco_tensor_t *A, taco_tensor_t *B) {
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int A1_dimension = (int)(A->dimensions[0]);
+  int* restrict A2_pos = (int*)(A->indices[1][0]);
+  int* restrict A2_crd = (int*)(A->indices[1][1]);
+  double* restrict A_vals = (double*)(A->vals);
+  int B1_dimension = (int)(B->dimensions[0]); // NOTE(review): not referenced below
+  int B2_dimension = (int)(B->dimensions[1]);
+  double* restrict B_vals = (double*)(B->vals);
+  // Zeroes C, then for every stored position p of A (row i, coordinate j) adds A_vals[p] * B_vals[j * B2_dimension + k] to C_vals[i * C2_dimension + k].
+  #pragma omp parallel for schedule(static)
+  for (int32_t pC = 0; pC < (C1_dimension * C2_dimension); pC++) {
+    C_vals[pC] = 0.0;
+  }
+  // Rows are visited in blocks of 16 (one block per parallel iteration); positions in 4-aligned groups.
+  #pragma omp parallel for schedule(dynamic, 1)
+  for (int32_t i0 = 0; i0 < ((A1_dimension + 15) / 16); i0++) {
+    for (int32_t i1 = 0; i1 < 16; i1++) {
+      int32_t i = i0 * 16 + i1;
+      if (i >= A1_dimension)
+        continue;
+      // jpos0 runs over the 4-aligned position groups that overlap [A2_pos[i], A2_pos[i + 1])
+      for (int32_t jpos0 = A2_pos[i] / 4; jpos0 < ((A2_pos[(i + 1)] + 3) / 4); jpos0++) {
+        int32_t jposA = jpos0 * 4; // NOTE(review): unused; both inner loops declare their own jposA
+        if (jpos0 * 4 < A2_pos[i] || (jpos0 * 4 + 4) + ((jpos0 * 4 + 4) - jpos0 * 4) >= A2_pos[(i + 1)]) { // group starts before the row or jpos0 * 4 + 8 >= row end: bounds-checked path
+          for (int32_t k = 0; k < B2_dimension; k++) {
+            int32_t kC = i * C2_dimension + k;
+            for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) {
+              int32_t jposA = jpos0 * 4 + jpos1;
+              if (jposA < A2_pos[i] || jposA >= A2_pos[(i + 1)])
+                continue;
+              // position belongs to row i: accumulate it
+              int32_t j = A2_crd[jposA];
+              int32_t kB = j * B2_dimension + k;
+              C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB];
+            }
+          }
+        }
+        else { // all 4 positions of the group lie inside row i: no per-position checks
+          #pragma clang loop interleave(enable) vectorize(enable)
+          for (int32_t k = 0; k < B2_dimension; k++) {
+            int32_t kC = i * C2_dimension + k;
+            for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) {
+              int32_t jposA = jpos0 * 4 + jpos1;
+              int32_t j = A2_crd[jposA];
+              int32_t kB = j * B2_dimension + k;
+              C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB];
+            }
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+#include "csr_dense_spmm.h" // sibling header, resolved relative to this file's directory
+int _shim_assemble(void** parameterPack) { // forwards the three packed tensor pointers to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
+int _shim_compute(void** parameterPack) { // forwards the three packed tensor pointers to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.h b/test/kernels/sddmm_spmm/csr_dense_spmm.h
new file mode 100644
index 000000000..cf0cf205c
--- /dev/null
+++ b/test/kernels/sddmm_spmm/csr_dense_spmm.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // serial fallback (compiled only when !_OPENMP): thread id is always 0
+int omp_get_max_threads() { return 1; } // serial fallback (compiled only when !_OPENMP): a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of two ints (qsort/bsearch style): <0, 0 or >0
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // a - b could overflow
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target. Returns arrayStart if its
+  // element is already >= target, the index of a probed element equal to target,
+  // or otherwise the upper end of the final bracket.
+  if (target <= array[arrayStart]) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // element here is < target
+  int hi = arrayEnd;   // treated as >= target
+  while (hi - lo > 1) {
+    const int probe = (hi + lo) / 2;
+    const int seen = array[probe];
+    if (seen == target) return probe;
+    if (seen < target) {
+      lo = probe;
+    } else {
+      hi = probe;
+    }
+  }
+  return hi;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target. Returns arrayEnd if its
+  // element is already <= target, the index of a probed element equal to target,
+  // or otherwise the lower end of the final bracket.
+  if (target >= array[arrayEnd]) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as <= target
+  int hi = arrayEnd;   // element here is > target
+  while (hi - lo > 1) {
+    const int probe = (hi + lo) / 2;
+    const int seen = array[probe];
+    if (seen == target) return probe;
+    if (seen > target) {
+      hi = probe;
+    } else {
+      lo = probe;
+    }
+  }
+  return lo;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a tensor descriptor holding copies of the per-mode metadata. The struct
+  // and every index slot are zero-initialised (calloc), so vals/vals_size start as
+  // NULL/0 and each indices[i][k] is NULL until storage is attached.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    if (mode_types[i] == taco_mode_dense) {          // one index slot
+      t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));
+    } else if (mode_types[i] == taco_mode_sparse) {  // two index slots
+      t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Frees the per-mode slot arrays, the metadata arrays and the struct itself;
+  // the buffers stored in indices[i][k] and t->vals are not released here.
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455);
+#endif
diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.so b/test/kernels/sddmm_spmm/csr_dense_spmm.so
new file mode 100755
index 000000000..398362532
Binary files /dev/null and b/test/kernels/sddmm_spmm/csr_dense_spmm.so differ
diff --git a/test/kernels/sddmm_spmm/fused_kernel.c b/test/kernels/sddmm_spmm/fused_kernel.c
new file mode 100644
index 000000000..1572bce5a
--- /dev/null
+++ b/test/kernels/sddmm_spmm/fused_kernel.c
@@ -0,0 +1,183 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // serial fallback (compiled only when !_OPENMP): thread id is always 0
+int omp_get_max_threads() { return 1; } // serial fallback (compiled only when !_OPENMP): a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of two ints (qsort/bsearch style): <0, 0 or >0
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b)); // a - b could overflow
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target. Returns arrayStart if its
+  // element is already >= target, the index of a probed element equal to target,
+  // or otherwise the upper end of the final bracket.
+  if (target <= array[arrayStart]) {
+    return arrayStart;
+  }
+  int lo = arrayStart; // element here is < target
+  int hi = arrayEnd;   // treated as >= target
+  while (hi - lo > 1) {
+    const int probe = (hi + lo) / 2;
+    const int seen = array[probe];
+    if (seen == target) return probe;
+    if (seen < target) {
+      lo = probe;
+    } else {
+      hi = probe;
+    }
+  }
+  return hi;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target. Returns arrayEnd if its
+  // element is already <= target, the index of a probed element equal to target,
+  // or otherwise the lower end of the final bracket.
+  if (target >= array[arrayEnd]) {
+    return arrayEnd;
+  }
+  int lo = arrayStart; // treated as <= target
+  int hi = arrayEnd;   // element here is > target
+  while (hi - lo > 1) {
+    const int probe = (hi + lo) / 2;
+    const int seen = array[probe];
+    if (seen == target) return probe;
+    if (seen > target) {
+      hi = probe;
+    } else {
+      lo = probe;
+    }
+  }
+  return lo;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a tensor descriptor holding copies of the per-mode metadata. The struct
+  // and every index slot are zero-initialised (calloc), so vals/vals_size start as
+  // NULL/0 and each indices[i][k] is NULL until storage is attached.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    if (mode_types[i] == taco_mode_dense) {          // one index slot
+      t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));
+    } else if (mode_types[i] == taco_mode_sparse) {  // two index slots
+      t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Frees the per-mode slot arrays, the metadata arrays and the struct itself;
+  // the buffers stored in indices[i][k] and t->vals are not released here.
+  for (int32_t mode = t->order - 1; mode >= 0; mode--) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) {
+  // Allocates A1459's dense value buffer (uninitialised) with dimensions[0] * dimensions[1]
+  // doubles, the extent compute() writes, instead of assuming 5 rows. Always returns 0.
+  int A14591_dimension = (int)(A1459->dimensions[0]);
+  int A14592_dimension = (int)(A1459->dimensions[1]);
+  double* restrict A1459_vals = (double*)malloc(sizeof(double) * (size_t)A14591_dimension * (size_t)A14592_dimension);
+  A1459->vals = (uint8_t*)A1459_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1459, taco_tensor_t *B, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) {
+  int A14591_dimension = (int)(A1459->dimensions[0]);
+  int A14592_dimension = (int)(A1459->dimensions[1]);
+  double* restrict A1459_vals = (double*)(A1459->vals);
+  int B1_dimension = (int)(B->dimensions[0]); // NOTE(review): not referenced below
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int A13921_dimension = (int)(A1392->dimensions[0]);
+  int A13922_dimension = (int)(A1392->dimensions[1]);
+  double* restrict A1392_vals = (double*)(A1392->vals);
+  int A14511_dimension = (int)(A1451->dimensions[0]); // NOTE(review): not referenced below
+  int A14512_dimension = (int)(A1451->dimensions[1]);
+  double* restrict A1451_vals = (double*)(A1451->vals);
+  int A14551_dimension = (int)(A1455->dimensions[0]); // NOTE(review): not referenced below
+  int A14552_dimension = (int)(A1455->dimensions[1]);
+  double* restrict A1455_vals = (double*)(A1455->vals);
+  // Zero-fill the whole dense output before accumulating into it.
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1459 = 0; pA1459 < (A14591_dimension * A14592_dimension); pA1459++) {
+    A1459_vals[pA1459] = 0.0;
+  }
+  // For every stored position p of B (row i1467, coordinate i1468): tA1459_val is the sum over i1469 of
+  // B_vals[p] * A1392 row i1467 * A1451 row i1468; it then scales A1455 row i1468 into A1459 row i1467.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i0 = 0; i0 < ((A13921_dimension + 15) / 16); i0++) {
+    // each parallel iteration handles a block of up to 16 consecutive rows
+    for (int32_t i1 = 0; i1 < 16; i1++) {
+      int32_t i1467 = i0 * 16 + i1;
+      if (i1467 >= A13921_dimension)
+        continue;
+
+      for (int32_t i1468B = B2_pos[i1467]; i1468B < B2_pos[(i1467 + 1)]; i1468B++) {
+        int32_t i1468 = B2_crd[i1468B];
+        double tA1459_val = 0.0;
+        for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) {
+          int32_t i1469A1392 = i1467 * A13922_dimension + i1469;
+          int32_t i1469A1451 = i1468 * A14512_dimension + i1469;
+          tA1459_val += (B_vals[i1468B] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451];
+        }
+        for (int32_t i1470 = 0; i1470 < A14552_dimension; i1470++) {
+          int32_t i1470A1459 = i1467 * A14592_dimension + i1470;
+          int32_t i1470A1455 = i1468 * A14552_dimension + i1470;
+          A1459_vals[i1470A1459] = A1459_vals[i1470A1459] + tA1459_val * A1455_vals[i1470A1455];
+        }
+      }
+    }
+  }
+  return 0;
+}
+#include "fused_kernel.h" // sibling header, resolved relative to this file's directory
+int _shim_assemble(void** parameterPack) { // forwards the five packed tensor pointers to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4]));
+}
+int _shim_compute(void** parameterPack) { // forwards the five packed tensor pointers to compute()
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4]));
+}
diff --git a/test/kernels/sddmm_spmm/fused_kernel.h b/test/kernels/sddmm_spmm/fused_kernel.h
new file mode 100644
index 000000000..e67e5a761
--- /dev/null
+++ b/test/kernels/sddmm_spmm/fused_kernel.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // descriptor for one tensor: shape, per-mode storage layout, index arrays and values
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // fallback compiled only when _OPENMP is not set: always reports thread 0
+int omp_get_max_threads() { return 1; } // fallback compiled only when _OPENMP is not set: always reports one thread
+#endif
+int cmp(const void *a, const void *b) { // comparator over two pointers that are both read as int
+  return *((const int*)a) - *((const int*)b); // negative / zero / positive by difference; NOTE(review): the int subtraction can overflow for operands far apart (e.g. INT_MIN vs a positive value)
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayStart] >= target) { // first element already reaches target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: the upper bound (still arrayEnd if every probed value was < target)
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayEnd] <= target) { // last element does not exceed target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: the lower bound (still arrayStart if every probed value was > target)
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a taco_tensor_t and copies the per-mode metadata into it
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): none of the malloc results in this function are checked for NULL
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) { // copy metadata and size the index-slot array of every mode
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) { // slot count depends on the mode type
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // one pointer slot
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // two pointer slots
+        break;
+    }
+  }
+  return t; // vals and vals_size are not assigned here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its metadata arrays
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the per-mode slot array only, not the arrays the slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed by this function
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455);
+#endif
diff --git a/test/kernels/sddmm_spmm/fused_kernel.so b/test/kernels/sddmm_spmm/fused_kernel.so
new file mode 100755
index 000000000..10619e0ca
Binary files /dev/null and b/test/kernels/sddmm_spmm/fused_kernel.so differ
diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.c b/test/kernels/sddmm_spmm/sddmm_ryan.c
new file mode 100644
index 000000000..760fb5361
--- /dev/null
+++ b/test/kernels/sddmm_spmm/sddmm_ryan.c
@@ -0,0 +1,210 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // descriptor for one tensor: shape, per-mode storage layout, index arrays and values
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // fallback compiled only when _OPENMP is not set: always reports thread 0
+int omp_get_max_threads() { return 1; } // fallback compiled only when _OPENMP is not set: always reports one thread
+#endif
+int cmp(const void *a, const void *b) { // comparator over two pointers that are both read as int
+  return *((const int*)a) - *((const int*)b); // negative / zero / positive by difference; NOTE(review): the int subtraction can overflow for operands far apart (e.g. INT_MIN vs a positive value)
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayStart] >= target) { // first element already reaches target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: the upper bound (still arrayEnd if every probed value was < target)
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayEnd] <= target) { // last element does not exceed target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: the lower bound (still arrayStart if every probed value was > target)
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a taco_tensor_t and copies the per-mode metadata into it
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): none of the malloc results in this function are checked for NULL
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) { // copy metadata and size the index-slot array of every mode
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) { // slot count depends on the mode type
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // one pointer slot
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // two pointer slots
+        break;
+    }
+  }
+  return t; // vals and vals_size are not assigned here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its metadata arrays
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the per-mode slot array only, not the arrays the slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed by this function
+}
+#endif
+
+int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { // builds A2531's second-mode pos/crd arrays by copying cage3's coordinates and allocates A2531's values; A1451 is not read; always returns 0
+  int* restrict A25312_pos = (int*)(A2531->indices[1][0]);
+  int* restrict A25312_crd = (int*)(A2531->indices[1][1]);
+  double* restrict A2531_vals = (double*)(A2531->vals);
+  int* restrict cage32_pos = (int*)(cage3->indices[1][0]);
+  int* restrict cage32_crd = (int*)(cage3->indices[1][1]);
+  int A13921_dimension = (int)(A1392->dimensions[0]); // number of outer (row) indices walked below
+
+  A25312_pos = (int32_t*)malloc(sizeof(int32_t) * (A13921_dimension + 1)); // one entry per outer index plus the leading 0; previously a fixed 6, which the fill loop overran for more than 5 rows
+  A25312_pos[0] = 0;
+  for (int32_t pA25312 = 1; pA25312 < (A13921_dimension + 1); pA25312++) { // zero every per-row count before filling
+    A25312_pos[pA25312] = 0;
+  }
+  int32_t A25312_crd_size = 1048576; // initial crd capacity in elements; doubled on demand below
+  A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size);
+  int32_t i1468A2531 = 0; // next free position in A25312_crd
+
+  for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) {
+    int32_t pA25312_begin = i1468A2531; // crd position where this row starts
+
+    for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { // every stored entry of cage3 in this row
+      int32_t i1468 = cage32_crd[i1468cage3];
+      if (A25312_crd_size <= i1468A2531) { // grow crd when full; NOTE(review): realloc/malloc results are not checked for NULL
+        A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2));
+        A25312_crd_size *= 2;
+      }
+      A25312_crd[i1468A2531] = i1468;
+      i1468A2531++;
+    }
+
+    A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; // per-row entry count, turned into offsets by the prefix sum below
+  }
+
+  int32_t csA25312 = 0; // running sum of the per-row counts
+  for (int32_t pA253120 = 1; pA253120 < (A13921_dimension + 1); pA253120++) { // prefix sum over the same A13921_dimension + 1 entries that were allocated
+    csA25312 += A25312_pos[pA253120];
+    A25312_pos[pA253120] = csA25312;
+  }
+
+  A2531_vals = (double*)malloc(sizeof(double) * i1468A2531); // one value slot per copied coordinate; contents are left uninitialized
+
+  A2531->indices[1][0] = (uint8_t*)(A25312_pos); // publish the new arrays on the output tensor; the previous pointers are overwritten without being freed
+  A2531->indices[1][1] = (uint8_t*)(A25312_crd);
+  A2531->vals = (uint8_t*)A2531_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // for every stored entry (i, j) of B writes A_vals[jB] = sum over k of B_vals[jB] * C[i][k] * D[j][k]; always returns 0
+
+  int A1_dimension = (int)(A->dimensions[0]); // not used below
+  double* restrict A_vals = (double*)(A->vals);
+  int B1_dimension = (int)(B->dimensions[0]); // not used below
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]); // not used below
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+  int32_t jA = 0; // not used below; its only increment is commented out
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i0 = 0; i0 < ((C1_dimension + 15) / 16); i0++) { // outer loop over blocks of 16 consecutive rows
+    for (int32_t i1 = 0; i1 < 16; i1++) {
+      int32_t i = i0 * 16 + i1;
+      if (i >= C1_dimension) // skip the padding rows of the last block
+        continue;
+
+      for (int32_t jB = B2_pos[i]; jB < B2_pos[(i + 1)]; jB++) { // stored entries of B in row i
+        int32_t j = B2_crd[jB];
+        double tkA_val = 0.0;
+        for (int32_t k = 0; k < D2_dimension; k++) { // the bound is D's second dimension; C is strided by C2_dimension
+          int32_t kC = i * C2_dimension + k;
+          int32_t kD = j * D2_dimension + k;
+          tkA_val += (B_vals[jB] * C_vals[kC]) * D_vals[kD];
+        }
+        A_vals[jB] = tkA_val; // A's values are indexed by B's position jB
+        // jA++;
+      }
+    }
+  }
+  return 0;
+
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.h"
+int _shim_assemble(void** parameterPack) { // unpacks parameterPack[0..3] as taco_tensor_t* and forwards them, in order, to assemble
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // unpacks parameterPack[0..3] as taco_tensor_t* and forwards them, in order, to compute
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.h b/test/kernels/sddmm_spmm/sddmm_ryan.h
new file mode 100644
index 000000000..f0f9e372a
--- /dev/null
+++ b/test/kernels/sddmm_spmm/sddmm_ryan.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // descriptor for one tensor: shape, per-mode storage layout, index arrays and values
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // fallback compiled only when _OPENMP is not set: always reports thread 0
+int omp_get_max_threads() { return 1; } // fallback compiled only when _OPENMP is not set: always reports one thread
+#endif
+int cmp(const void *a, const void *b) { // comparator over two pointers that are both read as int
+  return *((const int*)a) - *((const int*)b); // negative / zero / positive by difference; NOTE(review): the int subtraction can overflow for operands far apart (e.g. INT_MIN vs a positive value)
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayStart] >= target) { // first element already reaches target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: the upper bound (still arrayEnd if every probed value was < target)
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayEnd] <= target) { // last element does not exceed target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: the lower bound (still arrayStart if every probed value was > target)
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a taco_tensor_t and copies the per-mode metadata into it
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): none of the malloc results in this function are checked for NULL
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) { // copy metadata and size the index-slot array of every mode
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) { // slot count depends on the mode type
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // one pointer slot
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // two pointer slots
+        break;
+    }
+  }
+  return t; // vals and vals_size are not assigned here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its metadata arrays
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the per-mode slot array only, not the arrays the slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed by this function
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.so b/test/kernels/sddmm_spmm/sddmm_ryan.so
new file mode 100755
index 000000000..c3deae084
Binary files /dev/null and b/test/kernels/sddmm_spmm/sddmm_ryan.so differ
diff --git a/test/kernels/sddmm_spmm/taco_original.c b/test/kernels/sddmm_spmm/taco_original.c
new file mode 100644
index 000000000..4f084ff5e
--- /dev/null
+++ b/test/kernels/sddmm_spmm/taco_original.c
@@ -0,0 +1,166 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // descriptor for one tensor: shape, per-mode storage layout, index arrays and values
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // fallback compiled only when _OPENMP is not set: always reports thread 0
+int omp_get_max_threads() { return 1; } // fallback compiled only when _OPENMP is not set: always reports one thread
+#endif
+int cmp(const void *a, const void *b) { // comparator over two pointers that are both read as int
+  return *((const int*)a) - *((const int*)b); // negative / zero / positive by difference; NOTE(review): the int subtraction can overflow for operands far apart (e.g. INT_MIN vs a positive value)
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayStart] >= target) { // first element already reaches target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: the upper bound (still arrayEnd if every probed value was < target)
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayEnd] <= target) { // last element does not exceed target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: the lower bound (still arrayStart if every probed value was > target)
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a taco_tensor_t and copies the per-mode metadata into it
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): none of the malloc results in this function are checked for NULL
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) { // copy metadata and size the index-slot array of every mode
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) { // slot count depends on the mode type
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // one pointer slot
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // two pointer slots
+        break;
+    }
+  }
+  return t; // vals and vals_size are not assigned here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its metadata arrays
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the per-mode slot array only, not the arrays the slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed by this function
+}
+#endif
+
+int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { // allocates the values buffer of A1463; the other four tensors are not read; always returns 0
+  int A14632_dimension = (int)(A1463->dimensions[1]);
+  double* restrict A1463_vals = (double*)(A1463->vals);
+
+  A1463_vals = (double*)malloc(sizeof(double) * (5 * A14632_dimension)); // NOTE(review): row count is hard-coded to 5 while compute in this file writes A13921_dimension rows - confirm they always match; result is not NULL-checked and not initialized
+
+  A1463->vals = (uint8_t*)A1463_vals; // the previous vals pointer is overwritten without being freed
+  return 0;
+}
+
+int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { // fills dense A1463 with sums of cage3 * A1392 * A1451 * A1455 products over cage3's stored entries; always returns 0
+  int A14632_dimension = (int)(A1463->dimensions[1]);
+  double* restrict A1463_vals = (double*)(A1463->vals);
+  int* restrict cage32_pos = (int*)(cage3->indices[1][0]);
+  int* restrict cage32_crd = (int*)(cage3->indices[1][1]);
+  double* restrict cage3_vals = (double*)(cage3->vals);
+  int A13921_dimension = (int)(A1392->dimensions[0]);
+  int A13922_dimension = (int)(A1392->dimensions[1]);
+  double* restrict A1392_vals = (double*)(A1392->vals);
+  int A14512_dimension = (int)(A1451->dimensions[1]);
+  double* restrict A1451_vals = (double*)(A1451->vals);
+  int A14552_dimension = (int)(A1455->dimensions[1]);
+  double* restrict A1455_vals = (double*)(A1455->vals);
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { // output row
+    for (int32_t i1470 = 0; i1470 < A14552_dimension; i1470++) { // output column
+      int32_t i1470A1463 = i1467 * A14632_dimension + i1470;
+      double ti1468A1463_val = 0.0; // accumulator for this single output element
+      for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { // stored entries of cage3 in row i1467
+        int32_t i1468 = cage32_crd[i1468cage3];
+        int32_t i1470A1455 = i1468 * A14552_dimension + i1470;
+        for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { // innermost loop is re-run for every output column
+          int32_t i1469A1392 = i1467 * A13922_dimension + i1469;
+          int32_t i1469A1451 = i1468 * A14512_dimension + i1469;
+          ti1468A1463_val += ((cage3_vals[i1468cage3] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]) * A1455_vals[i1470A1455];
+        }
+      }
+      A1463_vals[i1470A1463] = ti1468A1463_val; // plain store, so every output element is overwritten
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.h"
+int _shim_assemble(void** parameterPack) { // unpacks parameterPack[0..4] as taco_tensor_t* and forwards them, in order, to assemble
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4]));
+}
+int _shim_compute(void** parameterPack) { // unpacks parameterPack[0..4] as taco_tensor_t* and forwards them, in order, to compute
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4]));
+}
diff --git a/test/kernels/sddmm_spmm/taco_original.h b/test/kernels/sddmm_spmm/taco_original.h
new file mode 100644
index 000000000..71ce53402
--- /dev/null
+++ b/test/kernels/sddmm_spmm/taco_original.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // descriptor for one tensor: shape, per-mode storage layout, index arrays and values
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // fallback compiled only when _OPENMP is not set: always reports thread 0
+int omp_get_max_threads() { return 1; } // fallback compiled only when _OPENMP is not set: always reports one thread
+#endif
+int cmp(const void *a, const void *b) { // comparator over two pointers that are both read as int
+  return *((const int*)a) - *((const int*)b); // negative / zero / positive by difference; NOTE(review): the int subtraction can overflow for operands far apart (e.g. INT_MIN vs a positive value)
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayStart] >= target) { // first element already reaches target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: the upper bound (still arrayEnd if every probed value was < target)
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayEnd] <= target) { // last element does not exceed target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: the lower bound (still arrayStart if every probed value was > target)
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a taco_tensor_t and copies the per-mode metadata into it
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): none of the malloc results in this function are checked for NULL
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) { // copy metadata and size the index-slot array of every mode
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) { // slot count depends on the mode type
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // one pointer slot
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // two pointer slots
+        break;
+    }
+  }
+  return t; // vals and vals_size are not assigned here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its metadata arrays
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the per-mode slot array only, not the arrays the slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed by this function
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455);
+#endif
diff --git a/test/kernels/sddmm_spmm/taco_original.so b/test/kernels/sddmm_spmm/taco_original.so
new file mode 100755
index 000000000..f50931baa
Binary files /dev/null and b/test/kernels/sddmm_spmm/taco_original.so differ
diff --git a/test/kernels/spmm_gemm/gemm_default.c b/test/kernels/spmm_gemm/gemm_default.c
new file mode 100644
index 000000000..605cc491f
--- /dev/null
+++ b/test/kernels/spmm_gemm/gemm_default.c
@@ -0,0 +1,160 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; // storage kind of a single tensor mode
+typedef struct { // descriptor for one tensor: shape, per-mode storage layout, index arrays and values
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // fallback compiled only when _OPENMP is not set: always reports thread 0
+int omp_get_max_threads() { return 1; } // fallback compiled only when _OPENMP is not set: always reports one thread
+#endif
+int cmp(const void *a, const void *b) { // comparator over two pointers that are both read as int
+  return *((const int*)a) - *((const int*)b); // negative / zero / positive by difference; NOTE(review): the int subtraction can overflow for operands far apart (e.g. INT_MIN vs a positive value)
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayStart] >= target) { // first element already reaches target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: the upper bound (still arrayEnd if every probed value was < target)
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // bisects array[arrayStart..arrayEnd] for target and returns an index in that range; presumably expects ascending order - confirm with callers
+  if (array[arrayEnd] <= target) { // last element does not exceed target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // loop until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: the lower bound (still arrayStart if every probed value was > target)
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a taco_tensor_t and copies the per-mode metadata into it
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): none of the malloc results in this function are checked for NULL
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) { // copy metadata and size the index-slot array of every mode
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) { // slot count depends on the mode type
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // one pointer slot
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // two pointer slots
+        break;
+    }
+  }
+  return t; // vals and vals_size are not assigned here
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // frees the descriptor and its metadata arrays
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the per-mode slot array only, not the arrays the slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed by this function
+}
+#endif
+
+int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { // allocates the values buffer of A2039; A2035 and A1450 are not read; always returns 0
+  int A20391_dimension = (int)(A2039->dimensions[0]);
+  int A20392_dimension = (int)(A2039->dimensions[1]);
+  double* restrict A2039_vals = (double*)(A2039->vals);
+
+  A2039_vals = (double*)malloc(sizeof(double) * (A20391_dimension * A20392_dimension)); // one double per element of the dimensions[0] x dimensions[1] output; not NULL-checked, not initialized
+
+  A2039->vals = (uint8_t*)A2039_vals; // the previous vals pointer is overwritten without being freed
+  return 0;
+}
+
+int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { // writes A2039[i][j] = sum over k of A2035[i][k] * A1450[k][j] using flat row-major indexing; always returns 0
+  int A20391_dimension = (int)(A2039->dimensions[0]); // not used below
+  int A20392_dimension = (int)(A2039->dimensions[1]);
+  double* restrict A2039_vals = (double*)(A2039->vals);
+  int A20351_dimension = (int)(A2035->dimensions[0]);
+  int A20352_dimension = (int)(A2035->dimensions[1]);
+  double* restrict A2035_vals = (double*)(A2035->vals);
+  int A14501_dimension = (int)(A1450->dimensions[0]);
+  int A14502_dimension = (int)(A1450->dimensions[1]);
+  double* restrict A1450_vals = (double*)(A1450->vals);
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1517 = 0; i1517 < A20351_dimension; i1517++) { // output row, bounded by A2035's first dimension
+    for (int32_t i1520 = 0; i1520 < A14502_dimension; i1520++) { // output column, bounded by A1450's second dimension
+      int32_t i1520A2039 = i1517 * A20392_dimension + i1520;
+      double ti1519A2039_val = 0.0; // accumulator for this single output element
+      for (int32_t i1519 = 0; i1519 < A14501_dimension; i1519++) { // reduction index, bounded by A1450's first dimension
+        int32_t i1519A2035 = i1517 * A20352_dimension + i1519;
+        int32_t i1520A1450 = i1519 * A14502_dimension + i1520;
+        ti1519A2039_val += A2035_vals[i1519A2035] * A1450_vals[i1520A1450];
+      }
+      A2039_vals[i1520A2039] = ti1519A2039_val; // plain store, so every output element is overwritten
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_default.h"
+int _shim_assemble(void** parameterPack) { // unpacks parameterPack[0..2] as taco_tensor_t* and forwards them, in order, to assemble
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
+int _shim_compute(void** parameterPack) { // unpacks parameterPack[0..2] as taco_tensor_t* and forwards them, in order, to compute
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
diff --git a/test/kernels/spmm_gemm/gemm_default.h b/test/kernels/spmm_gemm/gemm_default.h
new file mode 100644
index 000000000..769514531
--- /dev/null
+++ b/test/kernels/spmm_gemm/gemm_default.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in inside the `#if !_OPENMP` branch: always reports thread 0
+int omp_get_max_threads() { return 1; }  // stand-in inside the `#if !_OPENMP` branch: always reports a single thread
+#endif
+int cmp(const void *a, const void *b) {  // three-way compare of the ints behind a and b: negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // a - b could overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayStart when the first element is already >= target, the probe index on an
+  // exact hit, and otherwise the upper bound left once the bracket is one wide.
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayEnd when the last element is already <= target, the probe index on an
+  // exact hit, and otherwise the lower bound left once the bracket is one wide.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor and copies the per-mode arrays into it; vals and vals_size are not set here.
+  taco_tensor_t* tensor = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  tensor->order = order;
+  tensor->csize = csize;
+  tensor->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  tensor->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  for (int32_t mode = 0; mode < order; mode++) {
+    tensor->dimensions[mode]    = dimensions[mode];
+    tensor->mode_ordering[mode] = mode_ordering[mode];
+    tensor->mode_types[mode]    = mode_types[mode];
+    // A dense mode gets one index slot and a sparse mode two; any other value leaves the entry unset.
+    if (mode_types[mode] == taco_mode_dense) {
+      tensor->indices[mode] = (uint8_t **) malloc(1 * sizeof(uint8_t **));
+    }
+    else if (mode_types[mode] == taco_mode_sparse) {
+      tensor->indices[mode] = (uint8_t **) malloc(2 * sizeof(uint8_t **));
+    }
+  }
+  return tensor;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor, its per-mode arrays and each per-mode index slot array.
+  // t->vals and the buffers the index slots point at are not freed here.
+  for (int mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450);
+#endif
diff --git a/test/kernels/spmm_gemm/gemm_default.so b/test/kernels/spmm_gemm/gemm_default.so
new file mode 100755
index 000000000..9de7a7933
Binary files /dev/null and b/test/kernels/spmm_gemm/gemm_default.so differ
diff --git a/test/kernels/spmm_gemm/gemm_template.c b/test/kernels/spmm_gemm/gemm_template.c
new file mode 100644
index 000000000..4a4e5faeb
--- /dev/null
+++ b/test/kernels/spmm_gemm/gemm_template.c
@@ -0,0 +1,183 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in inside the `#if !_OPENMP` branch: always reports thread 0
+int omp_get_max_threads() { return 1; }  // stand-in inside the `#if !_OPENMP` branch: always reports a single thread
+#endif
+int cmp(const void *a, const void *b) {  // three-way compare of the ints behind a and b: negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // a - b could overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayStart when the first element is already >= target, the probe index on an
+  // exact hit, and otherwise the upper bound left once the bracket is one wide.
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayEnd when the last element is already <= target, the probe index on an
+  // exact hit, and otherwise the lower bound left once the bracket is one wide.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor and copies the per-mode arrays into it; vals and vals_size are not set here.
+  taco_tensor_t* tensor = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  tensor->order = order;
+  tensor->csize = csize;
+  tensor->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  tensor->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  for (int32_t mode = 0; mode < order; mode++) {
+    tensor->dimensions[mode]    = dimensions[mode];
+    tensor->mode_ordering[mode] = mode_ordering[mode];
+    tensor->mode_types[mode]    = mode_types[mode];
+    // A dense mode gets one index slot and a sparse mode two; any other value leaves the entry unset.
+    if (mode_types[mode] == taco_mode_dense) {
+      tensor->indices[mode] = (uint8_t **) malloc(1 * sizeof(uint8_t **));
+    }
+    else if (mode_types[mode] == taco_mode_sparse) {
+      tensor->indices[mode] = (uint8_t **) malloc(2 * sizeof(uint8_t **));
+    }
+  }
+  return tensor;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor, its per-mode arrays and each per-mode index slot array.
+  // t->vals and the buffers the index slots point at are not freed here.
+  for (int mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) {
+  // Allocates the dense value buffer of the output A2039 (rows * cols doubles); the two inputs are not read.
+  const size_t rows = (size_t)(A2039->dimensions[0]);
+  const size_t cols = (size_t)(A2039->dimensions[1]);
+  // The count is formed in size_t so rows * cols cannot overflow int; any earlier A2039->vals is overwritten, not freed.
+  double* restrict fresh_vals = (double*)malloc(sizeof(double) * rows * cols);
+
+  A2039->vals = (uint8_t*)fresh_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
+  // Accumulates A(i,j) = sum over k of B(i,j) * C(j,k), visiting i, j and k in 16-wide tiles.
+  // NOTE(review): this is not the A(i,j) = B(i,k) * C(k,j) pattern used in gemm_default.c - confirm intent.
+  const int a_rows = (int)(A->dimensions[0]);
+  const int a_cols = (int)(A->dimensions[1]);
+  double* restrict a = (double*)(A->vals);
+  const int b_rows = (int)(B->dimensions[0]);
+  const int b_cols = (int)(B->dimensions[1]);
+  const double* restrict b = (const double*)(B->vals);
+  const int c_rows = (int)(C->dimensions[0]);
+  const int c_cols = (int)(C->dimensions[1]);
+  const double* restrict c = (const double*)(C->vals);
+
+  // Clear the output first: the tiled nest below only adds into it.
+  #pragma omp parallel for schedule(static)
+  for (int32_t flat = 0; flat < (a_rows * a_cols); flat++) {
+    a[flat] = 0.0;
+  }
+
+  // The worksharing pragma applies to the outermost (i tile) loop only.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t it = 0; it < ((b_rows + 15) / 16); it++) {
+    // Each tile end is clamped to the real extent, so a partial last tile stays in range.
+    const int32_t i_end = (it * 16 + 16 < b_rows) ? (it * 16 + 16) : b_rows;
+    for (int32_t jt = 0; jt < ((c_rows + 15) / 16); jt++) {
+      // j runs over C's first dimension and is also the column index used for A and B.
+      const int32_t j_end = (jt * 16 + 16 < c_rows) ? (jt * 16 + 16) : c_rows;
+      for (int32_t kt = 0; kt < ((c_cols + 15) / 16); kt++) {
+        const int32_t k_end = (kt * 16 + 16 < c_cols) ? (kt * 16 + 16) : c_cols;
+        for (int32_t i = it * 16; i < i_end; i++) {
+          for (int32_t j = jt * 16; j < j_end; j++) {
+            const int32_t b_at = i * b_cols + j;
+            const int32_t a_at = i * a_cols + j;
+
+            // Sum this k tile locally, then fold it into the running output value.
+            double tile_sum = 0.0;
+            for (int32_t k = kt * 16; k < k_end; k++) {
+              const int32_t c_at = j * c_cols + k;
+              tile_sum += b[b_at] * c[c_at];
+            }
+            a[a_at] = a[a_at] + tile_sum;
+          }
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+#include "gemm_template.h"  // sibling header, found relative to this file rather than via a machine-specific absolute path
+int _shim_assemble(void** parameterPack) {  // forwards parameterPack[0..2], each cast to taco_tensor_t*, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
+int _shim_compute(void** parameterPack) {  // forwards parameterPack[0..2], each cast to taco_tensor_t*, to compute() and returns its result
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
diff --git a/test/kernels/spmm_gemm/gemm_template.h b/test/kernels/spmm_gemm/gemm_template.h
new file mode 100644
index 000000000..769514531
--- /dev/null
+++ b/test/kernels/spmm_gemm/gemm_template.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in inside the `#if !_OPENMP` branch: always reports thread 0
+int omp_get_max_threads() { return 1; }  // stand-in inside the `#if !_OPENMP` branch: always reports a single thread
+#endif
+int cmp(const void *a, const void *b) {  // three-way compare of the ints behind a and b: negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // a - b could overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayStart when the first element is already >= target, the probe index on an
+  // exact hit, and otherwise the upper bound left once the bracket is one wide.
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayEnd when the last element is already <= target, the probe index on an
+  // exact hit, and otherwise the lower bound left once the bracket is one wide.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor and copies the per-mode arrays into it; vals and vals_size are not set here.
+  taco_tensor_t* tensor = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  tensor->order = order;
+  tensor->csize = csize;
+  tensor->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  tensor->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  for (int32_t mode = 0; mode < order; mode++) {
+    tensor->dimensions[mode]    = dimensions[mode];
+    tensor->mode_ordering[mode] = mode_ordering[mode];
+    tensor->mode_types[mode]    = mode_types[mode];
+    // A dense mode gets one index slot and a sparse mode two; any other value leaves the entry unset.
+    if (mode_types[mode] == taco_mode_dense) {
+      tensor->indices[mode] = (uint8_t **) malloc(1 * sizeof(uint8_t **));
+    }
+    else if (mode_types[mode] == taco_mode_sparse) {
+      tensor->indices[mode] = (uint8_t **) malloc(2 * sizeof(uint8_t **));
+    }
+  }
+  return tensor;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor, its per-mode arrays and each per-mode index slot array.
+  // t->vals and the buffers the index slots point at are not freed here.
+  for (int mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450);
+#endif
diff --git a/test/kernels/spmm_gemm/gemm_template.so b/test/kernels/spmm_gemm/gemm_template.so
new file mode 100755
index 000000000..2cfcd7ad3
Binary files /dev/null and b/test/kernels/spmm_gemm/gemm_template.so differ
diff --git a/test/kernels/spmv_spmv/spmv_fused.c b/test/kernels/spmv_spmv/spmv_fused.c
new file mode 100644
index 000000000..0964fb8e1
--- /dev/null
+++ b/test/kernels/spmv_spmv/spmv_fused.c
@@ -0,0 +1,178 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in inside the `#if !_OPENMP` branch: always reports thread 0
+int omp_get_max_threads() { return 1; }  // stand-in inside the `#if !_OPENMP` branch: always reports a single thread
+#endif
+int cmp(const void *a, const void *b) {  // three-way compare of the ints behind a and b: negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // a - b could overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayStart when the first element is already >= target, the probe index on an
+  // exact hit, and otherwise the upper bound left once the bracket is one wide.
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayEnd when the last element is already <= target, the probe index on an
+  // exact hit, and otherwise the lower bound left once the bracket is one wide.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor and copies the per-mode arrays into it; vals and vals_size are not set here.
+  taco_tensor_t* tensor = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  tensor->order = order;
+  tensor->csize = csize;
+  tensor->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  tensor->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  for (int32_t mode = 0; mode < order; mode++) {
+    tensor->dimensions[mode]    = dimensions[mode];
+    tensor->mode_ordering[mode] = mode_ordering[mode];
+    tensor->mode_types[mode]    = mode_types[mode];
+    // A dense mode gets one index slot and a sparse mode two; any other value leaves the entry unset.
+    if (mode_types[mode] == taco_mode_dense) {
+      tensor->indices[mode] = (uint8_t **) malloc(1 * sizeof(uint8_t **));
+    }
+    else if (mode_types[mode] == taco_mode_sparse) {
+      tensor->indices[mode] = (uint8_t **) malloc(2 * sizeof(uint8_t **));
+    }
+  }
+  return tensor;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor, its per-mode arrays and each per-mode index slot array.
+  // t->vals and the buffers the index slots point at are not freed here.
+  for (int mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) {
+  // Allocates A's value buffer with one double per row of B, which is the range compute() writes.
+  // Sized from B rather than a fixed 5 so compute() cannot overrun it when B has more rows.
+  const size_t rows = (size_t)(B->dimensions[0]);
+  double* restrict A_vals = (double*)malloc(sizeof(double) * rows);
+  A->vals = (uint8_t*)A_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) {
+  // Evaluates A = B * (C * v) in two passes through a dense temporary tA = C * v.
+  // B and C are walked through the pos/crd arrays of their second mode; A and v are
+  // indexed directly as double arrays. Returns 0 on success and -1 when the
+  // temporary cannot be allocated.
+  double* restrict A_vals = (double*)(A->vals);
+  int* restrict C2_pos = (int*)(C->indices[1][0]);
+  int* restrict C2_crd = (int*)(C->indices[1][1]);
+  double* restrict C_vals = (double*)(C->vals);
+  double* restrict v_vals = (double*)(v->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  // tA holds one entry per row of C, so its length has to come from C, not from B.
+  int C1_dimension = (int)(C->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  double* restrict B_vals = (double*)(B->vals);
+
+  // Pass 1: tA(j) = sum over the stored entries of row j of C of C(j,k) * v(k).
+  // Every tA element is assigned in this pass, so the buffer needs no separate zero fill.
+  double* restrict tA = (double*)malloc(sizeof(double) * C1_dimension);
+  if (tA == NULL && C1_dimension > 0) {
+    return -1;
+  }
+  for (int32_t i1439 = 0; i1439 < C1_dimension; i1439++) {
+    double ti1440tA_val = 0.0;
+    for (int32_t i1440C = C2_pos[i1439]; i1440C < C2_pos[(i1439 + 1)]; i1440C++) {
+      int32_t i1440 = C2_crd[i1440C];
+      ti1440tA_val += C_vals[i1440C] * v_vals[i1440];
+    }
+    tA[i1439] = ti1440tA_val;
+  }
+
+  // Pass 2: A(i) = sum over the stored entries of row i of B of B(i,j) * tA(j).
+  for (int32_t i1438 = 0; i1438 < B1_dimension; i1438++) {
+    double ti1439A_val = 0.0;
+    for (int32_t i1439B = B2_pos[i1438]; i1439B < B2_pos[(i1438 + 1)]; i1439B++) {
+      int32_t i1439 = B2_crd[i1439B];
+      ti1439A_val += B_vals[i1439B] * tA[i1439];
+    }
+    A_vals[i1438] = ti1439A_val;
+  }
+  free(tA);
+
+  A->vals = (uint8_t*)A_vals;
+  return 0;
+}
+#include "spmv_fused.h"  // sibling header, found relative to this file rather than via a machine-specific absolute path
+int _shim_assemble(void** parameterPack) {  // forwards parameterPack[0..3], each cast to taco_tensor_t*, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) {  // forwards parameterPack[0..3], each cast to taco_tensor_t*, to compute() and returns its result
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/spmv_spmv/spmv_fused.h b/test/kernels/spmv_spmv/spmv_fused.h
new file mode 100644
index 000000000..bc78275ac
--- /dev/null
+++ b/test/kernels/spmv_spmv/spmv_fused.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in inside the `#if !_OPENMP` branch: always reports thread 0
+int omp_get_max_threads() { return 1; }  // stand-in inside the `#if !_OPENMP` branch: always reports a single thread
+#endif
+int cmp(const void *a, const void *b) {  // three-way compare of the ints behind a and b: negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // a - b could overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayStart when the first element is already >= target, the probe index on an
+  // exact hit, and otherwise the upper bound left once the bracket is one wide.
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayEnd when the last element is already <= target, the probe index on an
+  // exact hit, and otherwise the lower bound left once the bracket is one wide.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor and copies the per-mode arrays into it; vals and vals_size are not set here.
+  taco_tensor_t* tensor = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  tensor->order = order;
+  tensor->csize = csize;
+  tensor->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  tensor->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  for (int32_t mode = 0; mode < order; mode++) {
+    tensor->dimensions[mode]    = dimensions[mode];
+    tensor->mode_ordering[mode] = mode_ordering[mode];
+    tensor->mode_types[mode]    = mode_types[mode];
+    // A dense mode gets one index slot and a sparse mode two; any other value leaves the entry unset.
+    if (mode_types[mode] == taco_mode_dense) {
+      tensor->indices[mode] = (uint8_t **) malloc(1 * sizeof(uint8_t **));
+    }
+    else if (mode_types[mode] == taco_mode_sparse) {
+      tensor->indices[mode] = (uint8_t **) malloc(2 * sizeof(uint8_t **));
+    }
+  }
+  return tensor;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor, its per-mode arrays and each per-mode index slot array.
+  // t->vals and the buffers the index slots point at are not freed here.
+  for (int mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B);
+#endif
diff --git a/test/kernels/spmv_spmv/spmv_fused.so b/test/kernels/spmv_spmv/spmv_fused.so
new file mode 100755
index 000000000..5efd6a4d8
Binary files /dev/null and b/test/kernels/spmv_spmv/spmv_fused.so differ
diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.c b/test/kernels/spmv_spmv/spmv_spmv_default.c
new file mode 100644
index 000000000..dfaa1c4b0
--- /dev/null
+++ b/test/kernels/spmv_spmv/spmv_spmv_default.c
@@ -0,0 +1,157 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in inside the `#if !_OPENMP` branch: always reports thread 0
+int omp_get_max_threads() { return 1; }  // stand-in inside the `#if !_OPENMP` branch: always reports a single thread
+#endif
+int cmp(const void *a, const void *b) {  // three-way compare of the ints behind a and b: negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // a - b could overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayStart when the first element is already >= target, the probe index on an
+  // exact hit, and otherwise the upper bound left once the bracket is one wide.
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return upperBound;
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Bisects array[arrayStart..arrayEnd] for target; assumes ascending order (TODO confirm with callers).
+  // Returns arrayEnd when the last element is already <= target, the probe index on an
+  // exact hit, and otherwise the lower bound left once the bracket is one wide.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    // Offset form of the midpoint: lowerBound + upperBound could overflow int.
+    int mid = lowerBound + (upperBound - lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue == target) {
+      return mid;
+    }
+    if (midValue < target) lowerBound = mid;
+    else upperBound = mid;
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Builds a heap-allocated descriptor and copies the per-mode arrays into it; vals and vals_size are not set here.
+  taco_tensor_t* tensor = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  tensor->order = order;
+  tensor->csize = csize;
+  tensor->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  tensor->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  tensor->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  for (int32_t mode = 0; mode < order; mode++) {
+    tensor->dimensions[mode]    = dimensions[mode];
+    tensor->mode_ordering[mode] = mode_ordering[mode];
+    tensor->mode_types[mode]    = mode_types[mode];
+    // A dense mode gets one index slot and a sparse mode two; any other value leaves the entry unset.
+    if (mode_types[mode] == taco_mode_dense) {
+      tensor->indices[mode] = (uint8_t **) malloc(1 * sizeof(uint8_t **));
+    }
+    else if (mode_types[mode] == taco_mode_sparse) {
+      tensor->indices[mode] = (uint8_t **) malloc(2 * sizeof(uint8_t **));
+    }
+  }
+  return tensor;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor, its per-mode arrays and each per-mode index slot array.
+  // t->vals and the buffers the index slots point at are not freed here.
+  for (int mode = 0; mode < t->order; ++mode) free(t->indices[mode]);
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) {
+  // Allocates ref's value buffer with one double per row of B, which is the range compute() writes.
+  // Sized from B rather than a fixed 5 so compute() cannot overrun it when B has more rows.
+  const size_t rows = (size_t)(B->dimensions[0]);
+  double* restrict ref_vals = (double*)malloc(sizeof(double) * rows);
+  ref->vals = (uint8_t*)ref_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) {
+  // Computes ref(i) = sum over stored B(i,j) and stored C(j,k) of (B(i,j) * C(j,k)) * v(k) in one loop nest.
+  const int b_rows = (int)(B->dimensions[0]);
+  const int* restrict b_pos = (const int*)(B->indices[1][0]);
+  const int* restrict b_crd = (const int*)(B->indices[1][1]);
+  const double* restrict b_val = (const double*)(B->vals);
+  const int* restrict c_pos = (const int*)(C->indices[1][0]);
+  const int* restrict c_crd = (const int*)(C->indices[1][1]);
+  const double* restrict c_val = (const double*)(C->vals);
+  const double* restrict x = (const double*)(v->vals);
+  double* restrict out = (double*)(ref->vals);
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t row = 0; row < b_rows; row++) {
+    double acc = 0.0;  // accumulated locally, stored once per row
+    for (int32_t bp = b_pos[row]; bp < b_pos[row + 1]; bp++) {
+      const int32_t mid = b_crd[bp];  // coordinate of this B entry, used as the row of C
+      for (int32_t cp = c_pos[mid]; cp < c_pos[mid + 1]; cp++) {
+        acc += (b_val[bp] * c_val[cp]) * x[c_crd[cp]];
+      }
+    }
+    out[row] = acc;
+  }
+  return 0;
+}
+#include "spmv_spmv_default.h"  // sibling header, found relative to this file rather than via a machine-specific absolute path
+int _shim_assemble(void** parameterPack) {  // forwards parameterPack[0..3], each cast to taco_tensor_t*, to assemble()
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) {  // forwards parameterPack[0..3], each cast to taco_tensor_t*, to compute() and returns its result
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.h b/test/kernels/spmv_spmv/spmv_spmv_default.h
new file mode 100644
index 000000000..b53193484
--- /dev/null
+++ b/test/kernels/spmv_spmv/spmv_spmv_default.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in compiled only when _OPENMP is not set: the calling thread always has id 0
+int omp_get_max_threads() { return 1; }  // stand-in compiled only when _OPENMP is not set: reports exactly one thread
+#endif
+int cmp(const void *a, const void *b) {  // Three-way comparison of the ints behind a and b (signature usable with qsort()/bsearch()): negative, zero or positive.
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // no subtraction of the operands, so INT_MIN vs. INT_MAX cannot overflow
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Finds a position in array[arrayStart..arrayEnd] whose value is >= target; the halving assumes ascending order.
+  if (array[arrayStart] >= target) {
+    return arrayStart;  // the first element already qualifies
+  }
+  int lo = arrayStart;  // array[lo] < target throughout
+  int hi = arrayEnd;    // candidate answer; array[arrayEnd] itself is never read
+  while (hi - lo > 1) {
+    const int mid = lo + (hi - lo) / 2;  // equals (hi + lo) / 2 for 0 <= lo <= hi, but the sum cannot overflow
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;  // exact hit ends the search immediately
+    }
+    if (probe < target) {
+      lo = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  return hi;  // arrayEnd when every probed value was below target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Finds a position in array[arrayStart..arrayEnd] whose value is <= target; the halving assumes ascending order.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;  // the last element already qualifies
+  }
+  int lo = arrayStart;  // candidate answer; array[arrayStart] itself is never read
+  int hi = arrayEnd;    // array[hi] > target throughout
+  while (hi - lo > 1) {
+    const int mid = lo + (hi - lo) / 2;  // equals (hi + lo) / 2 for 0 <= lo <= hi, but the sum cannot overflow
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;  // exact hit ends the search immediately
+    }
+    if (probe < target) {
+      lo = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  return lo;  // arrayStart when every probed value was above target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Creates a heap-allocated tensor descriptor and copies `order` entries from each of the three per-mode arrays.
+  // Each mode also gets a table of index-array slots: two for taco_mode_sparse, one for any other mode type.
+  // Returns NULL if the descriptor itself cannot be allocated; the slots and t->vals are left for the caller to fill.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));  // zeroed, so vals == NULL and vals_size == 0
+  if (t == NULL) {
+    return NULL;
+  }
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));  // elements are uint8_t**, so size by that type
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // calloc keeps the slots NULL until they are assigned; unknown mode types get one slot instead of an uninitialized pointer.
+    t->indices[i] = (uint8_t **) calloc(mode_types[i] == taco_mode_sparse ? 2 : 1, sizeof(uint8_t *));
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor t: its three per-mode arrays, every per-mode slot table, the table of tables, then t itself.
+  // Buffers stored inside the slots and t->vals are not freed here.
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  for (int32_t mode = t->order; mode-- > 0; ) free(t->indices[mode]);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v);
+#endif
diff --git a/test/kernels/ttm_ttm/fused copy.c b/test/kernels/ttm_ttm/fused copy.c
new file mode 100644
index 000000000..5d40c8aa9
--- /dev/null
+++ b/test/kernels/ttm_ttm/fused copy.c	
@@ -0,0 +1,248 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in compiled only when _OPENMP is not set: the calling thread always has id 0
+int omp_get_max_threads() { return 1; }  // stand-in compiled only when _OPENMP is not set: reports exactly one thread
+#endif
+int cmp(const void *a, const void *b) {  // Three-way comparison of the ints behind a and b (signature usable with qsort()/bsearch()): negative, zero or positive.
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // no subtraction of the operands, so INT_MIN vs. INT_MAX cannot overflow
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Finds a position in array[arrayStart..arrayEnd] whose value is >= target; the halving assumes ascending order.
+  if (array[arrayStart] >= target) {
+    return arrayStart;  // the first element already qualifies
+  }
+  int lo = arrayStart;  // array[lo] < target throughout
+  int hi = arrayEnd;    // candidate answer; array[arrayEnd] itself is never read
+  while (hi - lo > 1) {
+    const int mid = lo + (hi - lo) / 2;  // equals (hi + lo) / 2 for 0 <= lo <= hi, but the sum cannot overflow
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;  // exact hit ends the search immediately
+    }
+    if (probe < target) {
+      lo = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  return hi;  // arrayEnd when every probed value was below target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Finds a position in array[arrayStart..arrayEnd] whose value is <= target; the halving assumes ascending order.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;  // the last element already qualifies
+  }
+  int lo = arrayStart;  // candidate answer; array[arrayStart] itself is never read
+  int hi = arrayEnd;    // array[hi] > target throughout
+  while (hi - lo > 1) {
+    const int mid = lo + (hi - lo) / 2;  // equals (hi + lo) / 2 for 0 <= lo <= hi, but the sum cannot overflow
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;  // exact hit ends the search immediately
+    }
+    if (probe < target) {
+      lo = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  return lo;  // arrayStart when every probed value was above target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Creates a heap-allocated tensor descriptor and copies `order` entries from each of the three per-mode arrays.
+  // Each mode also gets a table of index-array slots: two for taco_mode_sparse, one for any other mode type.
+  // Returns NULL if the descriptor itself cannot be allocated; the slots and t->vals are left for the caller to fill.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));  // zeroed, so vals == NULL and vals_size == 0
+  if (t == NULL) {
+    return NULL;
+  }
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));  // elements are uint8_t**, so size by that type
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // calloc keeps the slots NULL until they are assigned; unknown mode types get one slot instead of an uninitialized pointer.
+    t->indices[i] = (uint8_t **) calloc(mode_types[i] == taco_mode_sparse ? 2 : 1, sizeof(uint8_t *));
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor t: its three per-mode arrays, every per-mode slot table, the table of tables, then t itself.
+  // Buffers stored inside the slots and t->vals are not freed here.
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  for (int32_t mode = t->order; mode-- > 0; ) free(t->indices[mode]);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {  // Builds A1532's mode-2 pos/crd arrays from B's and allocates A1532's value buffer; C and D are not used.
+  int A15321_dimension = (int)(A1532->dimensions[0]);
+  int A15323_dimension = (int)(A1532->dimensions[2]);
+  int* restrict A15322_pos = (int*)(A1532->indices[1][0]);
+  int* restrict A15322_crd = (int*)(A1532->indices[1][1]);
+  double* restrict A1532_vals = (double*)(A1532->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  // The pos, crd and vals pointers read from A1532 above are all replaced by fresh allocations below; the old buffers are not freed here.
+  A15322_pos = (int32_t*)malloc(sizeof(int32_t) * (A15321_dimension + 1));
+  A15322_pos[0] = 0;
+  for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) {
+    A15322_pos[pA15322] = 0;
+  }
+  int32_t A15322_crd_size = 1048576;  // initial crd capacity in entries; doubled on demand below
+  A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size);
+  int32_t i1543A1532 = 0;  // number of coordinates written to A15322_crd so far
+  // Visit B's first-mode indices in tiles of 16; the guard skips the padding of the last tile.
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+
+      int32_t pA15322_begin = i1543A1532;
+      // Copy this row's coordinates from B. NOTE(review): the realloc result is used without a NULL check.
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15322_crd_size <= i1543A1532) {
+          A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2));
+          A15322_crd_size *= 2;
+        }
+        A15322_crd[i1543A1532] = i1543;
+        i1543A1532++;
+      }
+      // Store the row's entry count for now. NOTE(review): pos has A15321_dimension + 1 entries but is indexed by B's row -- confirm A15321_dimension == B1_dimension.
+      A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin;
+    }
+  }
+  // Inclusive running sum turns the per-row counts into end offsets.
+  int32_t csA15322 = 0;
+  for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) {
+    csA15322 += A15322_pos[pA153220];
+    A15322_pos[pA153220] = csA15322;
+  }
+  // A15323_dimension values for every stored coordinate; the buffer is left uninitialized.
+  A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension));
+  // Hand the new arrays to the tensor.
+  A1532->indices[1][0] = (uint8_t*)(A15322_pos);
+  A1532->indices[1][1] = (uint8_t*)(A15322_crd);
+  A1532->vals = (uint8_t*)A1532_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  int A15321_dimension = (int)(A1532->dimensions[0]);
+  int A15323_dimension = (int)(A1532->dimensions[2]);
+  int* restrict A15322_pos = (int*)(A1532->indices[1][0]);
+  double* restrict A1532_vals = (double*)(A1532->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+  // For every stored (i,j) of B: A1532(i,j,m) += (sum over k of B(i,j,k) * C(k,l)) * D(l,m).
+  // Output values are addressed by B's mode-2 position (i1543B), A15323_dimension values per position.
+  // Clear the output first, because the main loop only ever adds to it.
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) {
+    A1532_vals[pA1532] = 0.0;
+  }
+  // One scratch row of D1_dimension doubles per OpenMP thread, carved out of a single allocation.
+  double* restrict tA1532_all = 0;
+  tA1532_all = (double*)malloc(sizeof(double) * D1_dimension * omp_get_max_threads());
+  // B's first-mode indices are processed in tiles of 16; the tiles are what is distributed across threads.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+
+      double* restrict tA1532 = 0;
+      tA1532 = &tA1532_all[D1_dimension*omp_get_thread_num()];
+      // tA1532 is this thread's slice of the shared scratch buffer, reused for every (i,j) the thread handles.
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) {
+          tA1532[ptA1532] = 0.0;
+        }
+        for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+          int32_t i1544 = B3_crd[i1544B];
+          for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {
+            int32_t i1545C = i1544 * C2_dimension + i1545;
+            tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C];
+          }
+        }
+        for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {
+          for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) {
+            int32_t i1546A1532 = i1543B * A15323_dimension + i1546;
+            int32_t i1546D = i1545 * D2_dimension + i1546;
+            A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D];
+          }
+        }
+
+      }
+
+
+    }
+
+  }
+  free(tA1532_all);
+
+  A1532->indices[1][0] = (uint8_t*)(A15322_pos);
+  A1532->vals = (uint8_t*)A1532_vals;
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h"
+int _shim_assemble(void** parameterPack) {  // Adapter: casts parameterPack[0..3] to taco_tensor_t* and forwards them to assemble(), returning its result.
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) {  // Adapter: casts parameterPack[0..3] to taco_tensor_t* and forwards them to compute(), returning its result.
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/ttm_ttm/fused.c b/test/kernels/ttm_ttm/fused.c
new file mode 100644
index 000000000..f490913cb
--- /dev/null
+++ b/test/kernels/ttm_ttm/fused.c
@@ -0,0 +1,242 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in compiled only when _OPENMP is not set: the calling thread always has id 0
+int omp_get_max_threads() { return 1; }  // stand-in compiled only when _OPENMP is not set: reports exactly one thread
+#endif
+int cmp(const void *a, const void *b) {  // Three-way comparison of the ints behind a and b (signature usable with qsort()/bsearch()): negative, zero or positive.
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // no subtraction of the operands, so INT_MIN vs. INT_MAX cannot overflow
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Finds a position in array[arrayStart..arrayEnd] whose value is >= target; the halving assumes ascending order.
+  if (array[arrayStart] >= target) {
+    return arrayStart;  // the first element already qualifies
+  }
+  int lo = arrayStart;  // array[lo] < target throughout
+  int hi = arrayEnd;    // candidate answer; array[arrayEnd] itself is never read
+  while (hi - lo > 1) {
+    const int mid = lo + (hi - lo) / 2;  // equals (hi + lo) / 2 for 0 <= lo <= hi, but the sum cannot overflow
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;  // exact hit ends the search immediately
+    }
+    if (probe < target) {
+      lo = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  return hi;  // arrayEnd when every probed value was below target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Finds a position in array[arrayStart..arrayEnd] whose value is <= target; the halving assumes ascending order.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;  // the last element already qualifies
+  }
+  int lo = arrayStart;  // candidate answer; array[arrayStart] itself is never read
+  int hi = arrayEnd;    // array[hi] > target throughout
+  while (hi - lo > 1) {
+    const int mid = lo + (hi - lo) / 2;  // equals (hi + lo) / 2 for 0 <= lo <= hi, but the sum cannot overflow
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;  // exact hit ends the search immediately
+    }
+    if (probe < target) {
+      lo = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  return lo;  // arrayStart when every probed value was above target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Creates a heap-allocated tensor descriptor and copies `order` entries from each of the three per-mode arrays.
+  // Each mode also gets a table of index-array slots: two for taco_mode_sparse, one for any other mode type.
+  // Returns NULL if the descriptor itself cannot be allocated; the slots and t->vals are left for the caller to fill.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));  // zeroed, so vals == NULL and vals_size == 0
+  if (t == NULL) {
+    return NULL;
+  }
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));  // elements are uint8_t**, so size by that type
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // calloc keeps the slots NULL until they are assigned; unknown mode types get one slot instead of an uninitialized pointer.
+    t->indices[i] = (uint8_t **) calloc(mode_types[i] == taco_mode_sparse ? 2 : 1, sizeof(uint8_t *));
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor t: its three per-mode arrays, every per-mode slot table, the table of tables, then t itself.
+  // Buffers stored inside the slots and t->vals are not freed here.
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  for (int32_t mode = t->order; mode-- > 0; ) free(t->indices[mode]);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {  // Builds A1532's mode-2 pos/crd arrays from B's and allocates A1532's value buffer; C and D are not used.
+  int A15321_dimension = (int)(A1532->dimensions[0]);
+  int A15323_dimension = (int)(A1532->dimensions[2]);
+  int* restrict A15322_pos = (int*)(A1532->indices[1][0]);
+  int* restrict A15322_crd = (int*)(A1532->indices[1][1]);
+  double* restrict A1532_vals = (double*)(A1532->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  // The pos, crd and vals pointers read from A1532 above are all replaced by fresh allocations below; the old buffers are not freed here.
+  A15322_pos = (int32_t*)malloc(sizeof(int32_t) * (A15321_dimension + 1));
+  A15322_pos[0] = 0;
+  for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) {
+    A15322_pos[pA15322] = 0;
+  }
+  int32_t A15322_crd_size = 1048576;  // initial crd capacity in entries; doubled on demand below
+  A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size);
+  int32_t i1543A1532 = 0;  // number of coordinates written to A15322_crd so far
+  // Visit B's first-mode indices in tiles of 16; the guard skips the padding of the last tile.
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+
+      int32_t pA15322_begin = i1543A1532;
+      // Copy this row's coordinates from B. NOTE(review): the realloc result is used without a NULL check.
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15322_crd_size <= i1543A1532) {
+          A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2));
+          A15322_crd_size *= 2;
+        }
+        A15322_crd[i1543A1532] = i1543;
+        i1543A1532++;
+      }
+      // Store the row's entry count for now. NOTE(review): pos has A15321_dimension + 1 entries but is indexed by B's row -- confirm A15321_dimension == B1_dimension.
+      A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin;
+    }
+  }
+  // Inclusive running sum turns the per-row counts into end offsets.
+  int32_t csA15322 = 0;
+  for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) {
+    csA15322 += A15322_pos[pA153220];
+    A15322_pos[pA153220] = csA15322;
+  }
+  // A15323_dimension values for every stored coordinate; the buffer is left uninitialized.
+  A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension));
+  // Hand the new arrays to the tensor.
+  A1532->indices[1][0] = (uint8_t*)(A15322_pos);
+  A1532->indices[1][1] = (uint8_t*)(A15322_crd);
+  A1532->vals = (uint8_t*)A1532_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  int A15321_dimension = (int)(A1532->dimensions[0]);
+  int A15323_dimension = (int)(A1532->dimensions[2]);
+  int* restrict A15322_pos = (int*)(A1532->indices[1][0]);
+  double* restrict A1532_vals = (double*)(A1532->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+  // For every stored (i,j) of B: A1532(i,j,m) += (sum over k of B(i,j,k) * C(k,l)) * D(l,m).
+  // Output values are addressed by B's mode-2 position (i1543B), A15323_dimension values per position.
+  // Clear the output first, because the main loop only ever adds to it.
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) {
+    A1532_vals[pA1532] = 0.0;
+  }
+  // B's first-mode indices are processed in tiles of 16; the tiles are what is distributed across threads.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+      // Scratch row of D1_dimension doubles, allocated and freed once per i1542. NOTE(review): the malloc result is used unchecked.
+      double* restrict tA1532 = 0;
+      tA1532 = (double*)malloc(sizeof(double) * D1_dimension);
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) {
+          tA1532[ptA1532] = 0.0;
+        }
+        for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+          int32_t i1544 = B3_crd[i1544B];
+          for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {
+            int32_t i1545C = i1544 * C2_dimension + i1545;  // NOTE(review): column runs to D1_dimension but the stride is C2_dimension -- presumably equal; confirm
+            tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C];
+          }
+        }
+        for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {
+          for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) {
+            int32_t i1546A1532 = i1543B * A15323_dimension + i1546;  // NOTE(review): column runs to D2_dimension but the stride is A15323_dimension -- presumably equal; confirm
+            int32_t i1546D = i1545 * D2_dimension + i1546;
+            A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D];
+          }
+        }
+
+      }
+
+      free(tA1532);
+    }
+  }
+  // The two stores below write back the same pointers that were read at the top of the function.
+  A1532->indices[1][0] = (uint8_t*)(A15322_pos);
+  A1532->vals = (uint8_t*)A1532_vals;
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h"
+int _shim_assemble(void** parameterPack) {  // Adapter: casts parameterPack[0..3] to taco_tensor_t* and forwards them to assemble(), returning its result.
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) {  // Adapter: casts parameterPack[0..3] to taco_tensor_t* and forwards them to compute(), returning its result.
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/ttm_ttm/fused.h b/test/kernels/ttm_ttm/fused.h
new file mode 100644
index 000000000..d613c8f07
--- /dev/null
+++ b/test/kernels/ttm_ttm/fused.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in compiled only when _OPENMP is not set: the calling thread always has id 0
+int omp_get_max_threads() { return 1; }  // stand-in compiled only when _OPENMP is not set: reports exactly one thread
+#endif
+int cmp(const void *a, const void *b) {  // Three-way comparison of the ints behind a and b (signature usable with qsort()/bsearch()): negative, zero or positive.
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // no subtraction of the operands, so INT_MIN vs. INT_MAX cannot overflow
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Finds a position in array[arrayStart..arrayEnd] whose value is >= target; the halving assumes ascending order.
+  if (array[arrayStart] >= target) {
+    return arrayStart;  // the first element already qualifies
+  }
+  int lo = arrayStart;  // array[lo] < target throughout
+  int hi = arrayEnd;    // candidate answer; array[arrayEnd] itself is never read
+  while (hi - lo > 1) {
+    const int mid = lo + (hi - lo) / 2;  // equals (hi + lo) / 2 for 0 <= lo <= hi, but the sum cannot overflow
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;  // exact hit ends the search immediately
+    }
+    if (probe < target) {
+      lo = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  return hi;  // arrayEnd when every probed value was below target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Finds a position in array[arrayStart..arrayEnd] whose value is <= target; the halving assumes ascending order.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;  // the last element already qualifies
+  }
+  int lo = arrayStart;  // candidate answer; array[arrayStart] itself is never read
+  int hi = arrayEnd;    // array[hi] > target throughout
+  while (hi - lo > 1) {
+    const int mid = lo + (hi - lo) / 2;  // equals (hi + lo) / 2 for 0 <= lo <= hi, but the sum cannot overflow
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;  // exact hit ends the search immediately
+    }
+    if (probe < target) {
+      lo = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  return lo;  // arrayStart when every probed value was above target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Creates a heap-allocated tensor descriptor and copies `order` entries from each of the three per-mode arrays.
+  // Each mode also gets a table of index-array slots: two for taco_mode_sparse, one for any other mode type.
+  // Returns NULL if the descriptor itself cannot be allocated; the slots and t->vals are left for the caller to fill.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));  // zeroed, so vals == NULL and vals_size == 0
+  if (t == NULL) {
+    return NULL;
+  }
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));  // elements are uint8_t**, so size by that type
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // calloc keeps the slots NULL until they are assigned; unknown mode types get one slot instead of an uninitialized pointer.
+    t->indices[i] = (uint8_t **) calloc(mode_types[i] == taco_mode_sparse ? 2 : 1, sizeof(uint8_t *));
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor t: its three per-mode arrays, every per-mode slot table, the table of tables, then t itself.
+  // Buffers stored inside the slots and t->vals are not freed here.
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  for (int32_t mode = t->order; mode-- > 0; ) free(t->indices[mode]);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/ttm_ttm/fused.so b/test/kernels/ttm_ttm/fused.so
new file mode 100755
index 000000000..69c65a1dc
Binary files /dev/null and b/test/kernels/ttm_ttm/fused.so differ
diff --git a/test/kernels/ttm_ttm/gemm.c b/test/kernels/ttm_ttm/gemm.c
new file mode 100644
index 000000000..ee2b24e99
--- /dev/null
+++ b/test/kernels/ttm_ttm/gemm.c
@@ -0,0 +1,181 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }  // stand-in compiled only when _OPENMP is not set: the calling thread always has id 0
+int omp_get_max_threads() { return 1; }  // stand-in compiled only when _OPENMP is not set: reports exactly one thread
+#endif
+int cmp(const void *a, const void *b) {  // Three-way comparison of the ints behind a and b (signature usable with qsort()/bsearch()): negative, zero or positive.
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // no subtraction of the operands, so INT_MIN vs. INT_MAX cannot overflow
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  // Finds a position in array[arrayStart..arrayEnd] whose value is >= target; the halving assumes ascending order.
+  if (array[arrayStart] >= target) {
+    return arrayStart;  // the first element already qualifies
+  }
+  int lo = arrayStart;  // array[lo] < target throughout
+  int hi = arrayEnd;    // candidate answer; array[arrayEnd] itself is never read
+  while (hi - lo > 1) {
+    const int mid = lo + (hi - lo) / 2;  // equals (hi + lo) / 2 for 0 <= lo <= hi, but the sum cannot overflow
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;  // exact hit ends the search immediately
+    }
+    if (probe < target) {
+      lo = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  return hi;  // arrayEnd when every probed value was below target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  // Finds a position in array[arrayStart..arrayEnd] whose value is <= target; the halving assumes ascending order.
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;  // the last element already qualifies
+  }
+  int lo = arrayStart;  // candidate answer; array[arrayStart] itself is never read
+  int hi = arrayEnd;    // array[hi] > target throughout
+  while (hi - lo > 1) {
+    const int mid = lo + (hi - lo) / 2;  // equals (hi + lo) / 2 for 0 <= lo <= hi, but the sum cannot overflow
+    const int probe = array[mid];
+    if (probe == target) {
+      return mid;  // exact hit ends the search immediately
+    }
+    if (probe < target) {
+      lo = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  return lo;  // arrayStart when every probed value was above target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  // Creates a heap-allocated tensor descriptor and copies `order` entries from each of the three per-mode arrays.
+  // Each mode also gets a table of index-array slots: two for taco_mode_sparse, one for any other mode type.
+  // Returns NULL if the descriptor itself cannot be allocated; the slots and t->vals are left for the caller to fill.
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));  // zeroed, so vals == NULL and vals_size == 0
+  if (t == NULL) {
+    return NULL;
+  }
+  t->order         = order;
+  t->csize         = csize;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));  // elements are uint8_t**, so size by that type
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    // calloc keeps the slots NULL until they are assigned; unknown mode types get one slot instead of an uninitialized pointer.
+    t->indices[i] = (uint8_t **) calloc(mode_types[i] == taco_mode_sparse ? 2 : 1, sizeof(uint8_t *));
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  // Releases the descriptor t: its three per-mode arrays, every per-mode slot table, the table of tables, then t itself.
+  // Buffers stored inside the slots and t->vals are not freed here.
+  free(t->mode_types);
+  free(t->mode_ordering);
+  free(t->dimensions);
+  for (int32_t mode = t->order; mode-- > 0; ) free(t->indices[mode]);
+  free(t->indices);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) {
+  // Allocates uninitialized storage for dimensions[0] * dimensions[1] doubles and installs it as A2886->vals.
+  // C and D are not used; whatever A2886->vals pointed to before is not freed here.
+  int rows = (int)(A2886->dimensions[0]);
+  int cols = (int)(A2886->dimensions[1]);
+  double* restrict fresh = (double*)malloc(sizeof(double) * (rows * cols));
+
+  A2886->vals = (uint8_t*)fresh;
+  return 0;
+}
+
+int compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) {  // A2886(i,k) = sum over j of C(i,j) * D(j,k), all three stored row-major, iterated in 32x32x32 tiles.
+  int A28861_dimension = (int)(A2886->dimensions[0]);
+  int A28862_dimension = (int)(A2886->dimensions[1]);
+  double* restrict A2886_vals = (double*)(A2886->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+  // Clear the output first, because the main loop only ever adds to it.
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA2886 = 0; pA2886 < (A28861_dimension * A28862_dimension); pA2886++) {
+    A2886_vals[pA2886] = 0.0;
+  }
+  // Tile loops: i1551 over rows of C, i1553 over rows of D, i1555 over columns of D; only the row tiles are split across threads.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1551 = 0; i1551 < ((C1_dimension + 31) / 32); i1551++) {
+    for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) {
+      for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) {
+        for (int32_t i1552 = 0; i1552 < 32; i1552++) {
+          int32_t i1544 = i1551 * 32 + i1552;
+          if (i1544 >= C1_dimension)
+            continue;
+          // Each `continue` guard in this nest skips the padding of a partial last tile.
+          for (int32_t i1554 = 0; i1554 < 32; i1554++) {
+            int32_t i1545 = i1553 * 32 + i1554;
+            int32_t i1545C = i1544 * C2_dimension + i1545;  // offset is only computed here; it is dereferenced after the guard
+            if (i1545 >= D1_dimension)
+              continue;
+
+            for (int32_t i1556 = 0; i1556 < 32; i1556++) {
+              int32_t i1546 = i1555 * 32 + i1556;
+              int32_t i1546D = i1545 * D2_dimension + i1546;
+              int32_t i1546A2886 = i1544 * A28862_dimension + i1546;
+              if (i1546 >= D2_dimension)
+                continue;
+              // NOTE(review): j runs to D1_dimension with stride C2_dimension, and k to D2_dimension with stride A28862_dimension -- presumably the pairs are equal; confirm.
+              A2886_vals[i1546A2886] = A2886_vals[i1546A2886] + C_vals[i1545C] * D_vals[i1546D];
+            }
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/gemm.h"
+int _shim_assemble(void** parameterPack) {  // Adapter: casts parameterPack[0..2] to taco_tensor_t* and forwards them to assemble(), returning its result.
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
+int _shim_compute(void** parameterPack) {  // Adapter: casts parameterPack[0..2] to taco_tensor_t* and forwards them to compute(), returning its result.
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
diff --git a/test/kernels/ttm_ttm/gemm.h b/test/kernels/ttm_ttm/gemm.h
new file mode 100644
index 000000000..20cd2db53
--- /dev/null
+++ b/test/kernels/ttm_ttm/gemm.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }   /* fallback compiled only when _OPENMP is unset/zero: always reports thread id 0 */
+int omp_get_max_threads() { return 1; }  /* fallback compiled only when _OPENMP is unset/zero: always reports one thread */
+#endif
+/* Three-way compare of the ints behind a and b; returns -1, 0 or 1. Uses comparisons because a - b can overflow. */
+int cmp(const void *a, const void *b) {
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); }
+/* Binary search for target in array[arrayStart..arrayEnd] (both ends inclusive; assumed
+ * sorted ascending).  Returns arrayStart if the first element is already >= target, the
+ * index of a probed element equal to target if one is hit, and otherwise the lowest
+ * probed index whose value is > target, or arrayEnd if no probe exceeded target.  The
+ * loop never reads array[arrayEnd], so arrayEnd also comes back when all elements are < target. */
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart;  // array[lo] < target
+  int hi = arrayEnd;    // current answer; only lowered onto probed values > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 on very large indices.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) return mid;
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return hi;
+}
+/* Binary search for target in array[arrayStart..arrayEnd] (both ends inclusive; assumed
+ * sorted ascending).  Returns arrayEnd if the last element is already <= target, the
+ * index of a probed element equal to target if one is hit, and otherwise the highest
+ * probed index whose value is < target, or arrayStart if no probe was below target.  The
+ * loop never reads array[arrayStart], so arrayStart also comes back when all elements are > target. */
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart;  // current answer; only raised onto probed values < target
+  int hi = arrayEnd;    // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 on very large indices.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) return mid;
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return lo;
+}
+/* Allocates a taco_tensor_t and copies `order` entries from each of dimensions,
+ * mode_ordering and mode_types.  Dense modes get a 1-slot index array, sparse modes a
+ * 2-slot one (slot contents are not set here).  vals/vals_size start as NULL/0 and any
+ * other mode type gets a NULL slot array.  NOTE(review): malloc failures are unchecked. */
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->vals          = NULL;  // previously left indeterminate
+  t->vals_size     = 0;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    const size_t slots  = (mode_types[i] == taco_mode_dense) ? 1 : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i]       = slots ? (uint8_t **) malloc(slots * sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+/* Releases the memory obtained by init_taco_tensor_t: every per-mode slot array, then the
+ * indices, dimensions, mode_ordering and mode_types arrays, and last the struct itself.
+ * t->vals and whatever the slots point to are not freed here. */
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** const slots = t->indices;
+  for (int mode = 0; mode < t->order; ++mode) free(slots[mode]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm1_1.c b/test/kernels/ttm_ttm/ttm1_1.c
new file mode 100644
index 000000000..e016491a2
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm1_1.c
@@ -0,0 +1,219 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }   /* fallback compiled only when _OPENMP is unset/zero: always reports thread id 0 */
+int omp_get_max_threads() { return 1; }  /* fallback compiled only when _OPENMP is unset/zero: always reports one thread */
+#endif
+/* Three-way compare of the ints behind a and b; returns -1, 0 or 1. Uses comparisons because a - b can overflow. */
+int cmp(const void *a, const void *b) {
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); }
+/* Binary search for target in array[arrayStart..arrayEnd] (both ends inclusive; assumed
+ * sorted ascending).  Returns arrayStart if the first element is already >= target, the
+ * index of a probed element equal to target if one is hit, and otherwise the lowest
+ * probed index whose value is > target, or arrayEnd if no probe exceeded target.  The
+ * loop never reads array[arrayEnd], so arrayEnd also comes back when all elements are < target. */
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart;  // array[lo] < target
+  int hi = arrayEnd;    // current answer; only lowered onto probed values > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 on very large indices.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) return mid;
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return hi;
+}
+/* Binary search for target in array[arrayStart..arrayEnd] (both ends inclusive; assumed
+ * sorted ascending).  Returns arrayEnd if the last element is already <= target, the
+ * index of a probed element equal to target if one is hit, and otherwise the highest
+ * probed index whose value is < target, or arrayStart if no probe was below target.  The
+ * loop never reads array[arrayStart], so arrayStart also comes back when all elements are > target. */
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart;  // current answer; only raised onto probed values < target
+  int hi = arrayEnd;    // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 on very large indices.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) return mid;
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return lo;
+}
+/* Allocates a taco_tensor_t and copies `order` entries from each of dimensions,
+ * mode_ordering and mode_types.  Dense modes get a 1-slot index array, sparse modes a
+ * 2-slot one (slot contents are not set here).  vals/vals_size start as NULL/0 and any
+ * other mode type gets a NULL slot array.  NOTE(review): malloc failures are unchecked. */
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->vals          = NULL;  // previously left indeterminate
+  t->vals_size     = 0;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    const size_t slots  = (mode_types[i] == taco_mode_dense) ? 1 : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i]       = slots ? (uint8_t **) malloc(slots * sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+/* Releases the memory obtained by init_taco_tensor_t: every per-mode slot array, then the
+ * indices, dimensions, mode_ordering and mode_types arrays, and last the struct itself.
+ * t->vals and whatever the slots point to are not freed here. */
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** const slots = t->indices;
+  for (int mode = 0; mode < t->order; ++mode) free(slots[mode]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+/* Builds the storage of output A2398 from the sparsity of B: copies B's level-2 pos/crd
+ * structure (B->indices[1]) into newly allocated arrays, allocates A2398->dimensions[2]
+ * doubles per copied coordinate, and stores the buffers in A2398->indices[1][0],
+ * A2398->indices[1][1] and A2398->vals.  Values are left uninitialized and C is unused.
+ * Assumes B->dimensions[0] <= A2398->dimensions[0], because pos has dimensions[0] + 1
+ * entries.  Returns 0 on success, or -1 with A2398 untouched if an allocation fails.
+ * Buffers A2398 pointed to before the call are not freed here. */
+int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) {
+  (void)C;
+  const int A23981_dimension = (int)(A2398->dimensions[0]);
+  const int A23983_dimension = (int)(A2398->dimensions[2]);
+  const int B1_dimension = (int)(B->dimensions[0]);
+  const int* restrict B2_pos = (int*)(B->indices[1][0]);
+  const int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  // calloc zero-fills pos, which the original did with an explicit loop.
+  int32_t* A23982_pos = (int32_t*)calloc((size_t)A23981_dimension + 1, sizeof(int32_t));
+  size_t A23982_crd_size = 1048576;
+  int32_t* A23982_crd = (int32_t*)malloc(sizeof(int32_t) * A23982_crd_size);
+  if (A23982_pos == NULL || A23982_crd == NULL) { free(A23982_pos); free(A23982_crd); return -1; }
+  int32_t i1543A2398 = 0;  // coordinates appended so far
+
+  // No pragma on this loop, so the 16-row tiling is dropped: rows are visited in the same order.
+  for (int32_t i1542 = 0; i1542 < B1_dimension; i1542++) {
+    const int32_t pA23982_begin = i1543A2398;
+    for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+      if (A23982_crd_size <= (size_t)i1543A2398) {
+        // Grow through a temporary so the old block is not leaked if realloc fails.
+        int32_t* grown = (int32_t*)realloc(A23982_crd, sizeof(int32_t) * A23982_crd_size * 2);
+        if (grown == NULL) { free(A23982_pos); free(A23982_crd); return -1; }
+        A23982_crd = grown;
+        A23982_crd_size *= 2;
+      }
+      A23982_crd[i1543A2398++] = B2_crd[i1543B];
+    }
+    A23982_pos[i1542 + 1] = i1543A2398 - pA23982_begin;  // per-row count, summed below
+  }
+
+  // Turn the per-row counts into cumulative offsets.
+  int32_t csA23982 = 0;
+  for (int32_t pA239820 = 1; pA239820 < (A23981_dimension + 1); pA239820++) {
+    csA23982 += A23982_pos[pA239820];
+    A23982_pos[pA239820] = csA23982;
+  }
+
+  // Multiply in size_t; the int product could overflow for large outputs.
+  const size_t vals_count = (size_t)i1543A2398 * (size_t)A23983_dimension;
+  double* A2398_vals = (double*)malloc(sizeof(double) * vals_count);
+  if (A2398_vals == NULL && vals_count != 0) { free(A23982_pos); free(A23982_crd); return -1; }
+  A2398->indices[1][0] = (uint8_t*)(A23982_pos);
+  A2398->indices[1][1] = (uint8_t*)(A23982_crd);
+  A2398->vals = (uint8_t*)A2398_vals;
+  return 0;
+}
+
+/* For each level-2 position p of B and each column l of C, writes
+ * A2398_vals[p * A2398->dimensions[2] + l] = sum over B's level-3 entries q under p of
+ * B_vals[q] * C_vals[B3_crd[q] * C->dimensions[1] + l].  The output row is addressed by
+ * B's own position p, so no iteration depends on a counter carried over from the previous
+ * one; this relies on A2398 sharing B's level-2 layout, as built by assemble().
+ * Offsets are formed in size_t so large tensors cannot overflow 32-bit int.  Returns 0. */
+int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) {
+  const size_t A23983_dimension = (size_t)(A2398->dimensions[2]);
+  double* restrict A2398_vals = (double*)(A2398->vals);
+  const int B1_dimension = (int)(B->dimensions[0]);
+  const int* restrict B2_pos = (int*)(B->indices[1][0]);
+  const int* restrict B3_pos = (int*)(B->indices[2][0]);
+  const int* restrict B3_crd = (int*)(B->indices[2][1]);
+  const double* restrict B_vals = (double*)(B->vals);
+  const int C2_dimension = (int)(C->dimensions[1]);
+  const double* restrict C_vals = (double*)(C->vals);
+
+  // Rows of B are processed in tiles of 16; the guard skips the padding of the last tile.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        // Output row that belongs to this position.
+        double* out_row = A2398_vals + (size_t)i1543B * A23983_dimension;
+        for (int32_t i1545 = 0; i1545 < C2_dimension; i1545++) {
+          double ti1544A2398_val = 0.0;
+          for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+            const size_t i1545C = (size_t)B3_crd[i1544B] * (size_t)C2_dimension + (size_t)i1545;
+            ti1544A2398_val += B_vals[i1544B] * C_vals[i1545C];
+          }
+          out_row[i1545] = ti1544A2398_val;
+        }
+      }
+    }
+  }
+  return 0;
+}
+#include "ttm1_1.h"  /* relative: found next to this file; an absolute home-directory path does not resolve elsewhere */
+int _shim_assemble(void** parameterPack) {  /* forwards pack slots 0..2, in order, to assemble() */
+  void *arg0 = parameterPack[0], *arg1 = parameterPack[1], *arg2 = parameterPack[2];
+  return assemble((taco_tensor_t*)arg0, (taco_tensor_t*)arg1, (taco_tensor_t*)arg2); }
+int _shim_compute(void** parameterPack) {  /* forwards pack slots 0..2, in order, to compute() */
+  void *arg0 = parameterPack[0], *arg1 = parameterPack[1], *arg2 = parameterPack[2];
+  return compute((taco_tensor_t*)arg0, (taco_tensor_t*)arg1, (taco_tensor_t*)arg2); }
diff --git a/test/kernels/ttm_ttm/ttm1_1.h b/test/kernels/ttm_ttm/ttm1_1.h
new file mode 100644
index 000000000..4c631f227
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm1_1.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }   /* fallback compiled only when _OPENMP is unset/zero: always reports thread id 0 */
+int omp_get_max_threads() { return 1; }  /* fallback compiled only when _OPENMP is unset/zero: always reports one thread */
+#endif
+/* Three-way compare of the ints behind a and b; returns -1, 0 or 1. Uses comparisons because a - b can overflow. */
+int cmp(const void *a, const void *b) {
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); }
+/* Binary search for target in array[arrayStart..arrayEnd] (both ends inclusive; assumed
+ * sorted ascending).  Returns arrayStart if the first element is already >= target, the
+ * index of a probed element equal to target if one is hit, and otherwise the lowest
+ * probed index whose value is > target, or arrayEnd if no probe exceeded target.  The
+ * loop never reads array[arrayEnd], so arrayEnd also comes back when all elements are < target. */
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart;  // array[lo] < target
+  int hi = arrayEnd;    // current answer; only lowered onto probed values > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 on very large indices.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) return mid;
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return hi;
+}
+/* Binary search for target in array[arrayStart..arrayEnd] (both ends inclusive; assumed
+ * sorted ascending).  Returns arrayEnd if the last element is already <= target, the
+ * index of a probed element equal to target if one is hit, and otherwise the highest
+ * probed index whose value is < target, or arrayStart if no probe was below target.  The
+ * loop never reads array[arrayStart], so arrayStart also comes back when all elements are > target. */
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart;  // current answer; only raised onto probed values < target
+  int hi = arrayEnd;    // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 on very large indices.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) return mid;
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return lo;
+}
+/* Allocates a taco_tensor_t and copies `order` entries from each of dimensions,
+ * mode_ordering and mode_types.  Dense modes get a 1-slot index array, sparse modes a
+ * 2-slot one (slot contents are not set here).  vals/vals_size start as NULL/0 and any
+ * other mode type gets a NULL slot array.  NOTE(review): malloc failures are unchecked. */
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->vals          = NULL;  // previously left indeterminate
+  t->vals_size     = 0;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    const size_t slots  = (mode_types[i] == taco_mode_dense) ? 1 : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i]       = slots ? (uint8_t **) malloc(slots * sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+/* Releases the memory obtained by init_taco_tensor_t: every per-mode slot array, then the
+ * indices, dimensions, mode_ordering and mode_types arrays, and last the struct itself.
+ * t->vals and whatever the slots point to are not freed here. */
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** const slots = t->indices;
+  for (int mode = 0; mode < t->order; ++mode) free(slots[mode]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm1_1.so b/test/kernels/ttm_ttm/ttm1_1.so
new file mode 100755
index 000000000..911c44fa1
Binary files /dev/null and b/test/kernels/ttm_ttm/ttm1_1.so differ
diff --git a/test/kernels/ttm_ttm/ttm1_2.c b/test/kernels/ttm_ttm/ttm1_2.c
new file mode 100644
index 000000000..b04e23a54
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm1_2.c
@@ -0,0 +1,219 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }   /* fallback compiled only when _OPENMP is unset/zero: always reports thread id 0 */
+int omp_get_max_threads() { return 1; }  /* fallback compiled only when _OPENMP is unset/zero: always reports one thread */
+#endif
+/* Three-way compare of the ints behind a and b; returns -1, 0 or 1. Uses comparisons because a - b can overflow. */
+int cmp(const void *a, const void *b) {
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); }
+/* Binary search for target in array[arrayStart..arrayEnd] (both ends inclusive; assumed
+ * sorted ascending).  Returns arrayStart if the first element is already >= target, the
+ * index of a probed element equal to target if one is hit, and otherwise the lowest
+ * probed index whose value is > target, or arrayEnd if no probe exceeded target.  The
+ * loop never reads array[arrayEnd], so arrayEnd also comes back when all elements are < target. */
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart;  // array[lo] < target
+  int hi = arrayEnd;    // current answer; only lowered onto probed values > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 on very large indices.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) return mid;
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return hi;
+}
+/* Binary search for target in array[arrayStart..arrayEnd] (both ends inclusive; assumed
+ * sorted ascending).  Returns arrayEnd if the last element is already <= target, the
+ * index of a probed element equal to target if one is hit, and otherwise the highest
+ * probed index whose value is < target, or arrayStart if no probe was below target.  The
+ * loop never reads array[arrayStart], so arrayStart also comes back when all elements are > target. */
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart;  // current answer; only raised onto probed values < target
+  int hi = arrayEnd;    // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 on very large indices.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) return mid;
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return lo;
+}
+/* Allocates a taco_tensor_t and copies `order` entries from each of dimensions,
+ * mode_ordering and mode_types.  Dense modes get a 1-slot index array, sparse modes a
+ * 2-slot one (slot contents are not set here).  vals/vals_size start as NULL/0 and any
+ * other mode type gets a NULL slot array.  NOTE(review): malloc failures are unchecked. */
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->vals          = NULL;  // previously left indeterminate
+  t->vals_size     = 0;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    const size_t slots  = (mode_types[i] == taco_mode_dense) ? 1 : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i]       = slots ? (uint8_t **) malloc(slots * sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+/* Releases the memory obtained by init_taco_tensor_t: every per-mode slot array, then the
+ * indices, dimensions, mode_ordering and mode_types arrays, and last the struct itself.
+ * t->vals and whatever the slots point to are not freed here. */
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** const slots = t->indices;
+  for (int mode = 0; mode < t->order; ++mode) free(slots[mode]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+/* Builds the storage of output A3056 from the sparsity of B: copies B's level-2 pos/crd
+ * structure (B->indices[1]) into newly allocated arrays, allocates A3056->dimensions[2]
+ * doubles per copied coordinate, and stores the buffers in A3056->indices[1][0],
+ * A3056->indices[1][1] and A3056->vals.  Values are left uninitialized and A2886 is unused.
+ * Assumes B->dimensions[0] <= A3056->dimensions[0], because pos has dimensions[0] + 1
+ * entries.  Returns 0 on success, or -1 with A3056 untouched if an allocation fails.
+ * Buffers A3056 pointed to before the call are not freed here. */
+int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) {
+  (void)A2886;
+  const int A30561_dimension = (int)(A3056->dimensions[0]);
+  const int A30563_dimension = (int)(A3056->dimensions[2]);
+  const int B1_dimension = (int)(B->dimensions[0]);
+  const int* restrict B2_pos = (int*)(B->indices[1][0]);
+  const int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  // calloc zero-fills pos, which the original did with an explicit loop.
+  int32_t* A30562_pos = (int32_t*)calloc((size_t)A30561_dimension + 1, sizeof(int32_t));
+  size_t A30562_crd_size = 1048576;
+  int32_t* A30562_crd = (int32_t*)malloc(sizeof(int32_t) * A30562_crd_size);
+  if (A30562_pos == NULL || A30562_crd == NULL) { free(A30562_pos); free(A30562_crd); return -1; }
+  int32_t i1543A3056 = 0;  // coordinates appended so far
+
+  // No pragma on this loop, so the 16-row tiling is dropped: rows are visited in the same order.
+  for (int32_t i1542 = 0; i1542 < B1_dimension; i1542++) {
+    const int32_t pA30562_begin = i1543A3056;
+    for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+      if (A30562_crd_size <= (size_t)i1543A3056) {
+        // Grow through a temporary so the old block is not leaked if realloc fails.
+        int32_t* grown = (int32_t*)realloc(A30562_crd, sizeof(int32_t) * A30562_crd_size * 2);
+        if (grown == NULL) { free(A30562_pos); free(A30562_crd); return -1; }
+        A30562_crd = grown;
+        A30562_crd_size *= 2;
+      }
+      A30562_crd[i1543A3056++] = B2_crd[i1543B];
+    }
+    A30562_pos[i1542 + 1] = i1543A3056 - pA30562_begin;  // per-row count, summed below
+  }
+
+  // Turn the per-row counts into cumulative offsets.
+  int32_t csA30562 = 0;
+  for (int32_t pA305620 = 1; pA305620 < (A30561_dimension + 1); pA305620++) {
+    csA30562 += A30562_pos[pA305620];
+    A30562_pos[pA305620] = csA30562;
+  }
+
+  // Multiply in size_t; the int product could overflow for large outputs.
+  const size_t vals_count = (size_t)i1543A3056 * (size_t)A30563_dimension;
+  double* A3056_vals = (double*)malloc(sizeof(double) * vals_count);
+  if (A3056_vals == NULL && vals_count != 0) { free(A30562_pos); free(A30562_crd); return -1; }
+  A3056->indices[1][0] = (uint8_t*)(A30562_pos);
+  A3056->indices[1][1] = (uint8_t*)(A30562_crd);
+  A3056->vals = (uint8_t*)A3056_vals;
+  return 0;
+}
+
+/* For each level-2 position p of B and each column l of A2886, writes
+ * A3056_vals[p * A3056->dimensions[2] + l] = sum over B's level-3 entries q under p of
+ * B_vals[q] * A2886_vals[B3_crd[q] * A2886->dimensions[1] + l].  The output row is addressed
+ * by B's own position p, so no iteration depends on a counter carried over from the previous
+ * one; this relies on A3056 sharing B's level-2 layout, as built by assemble().
+ * Offsets are formed in size_t so large tensors cannot overflow 32-bit int.  Returns 0. */
+int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) {
+  const size_t A30563_dimension = (size_t)(A3056->dimensions[2]);
+  double* restrict A3056_vals = (double*)(A3056->vals);
+  const int B1_dimension = (int)(B->dimensions[0]);
+  const int* restrict B2_pos = (int*)(B->indices[1][0]);
+  const int* restrict B3_pos = (int*)(B->indices[2][0]);
+  const int* restrict B3_crd = (int*)(B->indices[2][1]);
+  const double* restrict B_vals = (double*)(B->vals);
+  const int A28862_dimension = (int)(A2886->dimensions[1]);
+  const double* restrict A2886_vals = (double*)(A2886->vals);
+
+  // Rows of B are processed in tiles of 16; the guard skips the padding of the last tile.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        // Output row that belongs to this position.
+        double* out_row = A3056_vals + (size_t)i1543B * A30563_dimension;
+        for (int32_t i1546 = 0; i1546 < A28862_dimension; i1546++) {
+          double ti1544A3056_val = 0.0;
+          for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+            const size_t i1546A2886 = (size_t)B3_crd[i1544B] * (size_t)A28862_dimension + (size_t)i1546;
+            ti1544A3056_val += B_vals[i1544B] * A2886_vals[i1546A2886];
+          }
+          out_row[i1546] = ti1544A3056_val;
+        }
+      }
+    }
+  }
+  return 0;
+}
+#include "ttm1_2.h"  /* relative: found next to this file; an absolute home-directory path does not resolve elsewhere */
+int _shim_assemble(void** parameterPack) {  /* forwards pack slots 0..2, in order, to assemble() */
+  void *arg0 = parameterPack[0], *arg1 = parameterPack[1], *arg2 = parameterPack[2];
+  return assemble((taco_tensor_t*)arg0, (taco_tensor_t*)arg1, (taco_tensor_t*)arg2); }
+int _shim_compute(void** parameterPack) {  /* forwards pack slots 0..2, in order, to compute() */
+  void *arg0 = parameterPack[0], *arg1 = parameterPack[1], *arg2 = parameterPack[2];
+  return compute((taco_tensor_t*)arg0, (taco_tensor_t*)arg1, (taco_tensor_t*)arg2); }
diff --git a/test/kernels/ttm_ttm/ttm1_2.h b/test/kernels/ttm_ttm/ttm1_2.h
new file mode 100644
index 000000000..86ebdb633
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm1_2.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }   /* fallback compiled only when _OPENMP is unset/zero: always reports thread id 0 */
+int omp_get_max_threads() { return 1; }  /* fallback compiled only when _OPENMP is unset/zero: always reports one thread */
+#endif
+/* Three-way compare of the ints behind a and b; returns -1, 0 or 1. Uses comparisons because a - b can overflow. */
+int cmp(const void *a, const void *b) {
+  return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b); }
+/* Binary search for target in array[arrayStart..arrayEnd] (both ends inclusive; assumed
+ * sorted ascending).  Returns arrayStart if the first element is already >= target, the
+ * index of a probed element equal to target if one is hit, and otherwise the lowest
+ * probed index whose value is > target, or arrayEnd if no probe exceeded target.  The
+ * loop never reads array[arrayEnd], so arrayEnd also comes back when all elements are < target. */
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayStart] >= target) {
+    return arrayStart;
+  }
+  int lo = arrayStart;  // array[lo] < target
+  int hi = arrayEnd;    // current answer; only lowered onto probed values > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 on very large indices.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) return mid;
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return hi;
+}
+/* Binary search for target in array[arrayStart..arrayEnd] (both ends inclusive; assumed
+ * sorted ascending).  Returns arrayEnd if the last element is already <= target, the
+ * index of a probed element equal to target if one is hit, and otherwise the highest
+ * probed index whose value is < target, or arrayStart if no probe was below target.  The
+ * loop never reads array[arrayStart], so arrayStart also comes back when all elements are > target. */
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lo = arrayStart;  // current answer; only raised onto probed values < target
+  int hi = arrayEnd;    // array[hi] > target
+  while (hi - lo > 1) {
+    // lo + (hi - lo) / 2 cannot overflow, unlike (hi + lo) / 2 on very large indices.
+    const int mid = lo + (hi - lo) / 2;
+    const int probe = array[mid];
+    if (probe == target) return mid;
+    if (probe < target) lo = mid;
+    else hi = mid;
+  }
+  return lo;
+}
+/* Allocates a taco_tensor_t and copies `order` entries from each of dimensions,
+ * mode_ordering and mode_types.  Dense modes get a 1-slot index array, sparse modes a
+ * 2-slot one (slot contents are not set here).  vals/vals_size start as NULL/0 and any
+ * other mode type gets a NULL slot array.  NOTE(review): malloc failures are unchecked. */
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order         = order;
+  t->csize         = csize;
+  t->vals          = NULL;  // previously left indeterminate
+  t->vals_size     = 0;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t **));
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    const size_t slots  = (mode_types[i] == taco_mode_dense) ? 1 : (mode_types[i] == taco_mode_sparse) ? 2 : 0;
+    t->indices[i]       = slots ? (uint8_t **) malloc(slots * sizeof(uint8_t *)) : NULL;
+  }
+  return t;
+}
+/* Releases the memory obtained by init_taco_tensor_t: every per-mode slot array, then the
+ * indices, dimensions, mode_ordering and mode_types arrays, and last the struct itself.
+ * t->vals and whatever the slots point to are not freed here. */
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  uint8_t*** const slots = t->indices;
+  for (int mode = 0; mode < t->order; ++mode) free(slots[mode]);
+  free(slots);
+  free(t->dimensions); free(t->mode_ordering); free(t->mode_types);
+  free(t);
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm1_2.so b/test/kernels/ttm_ttm/ttm1_2.so
new file mode 100755
index 000000000..c698ec991
Binary files /dev/null and b/test/kernels/ttm_ttm/ttm1_2.so differ
diff --git a/test/kernels/ttm_ttm/ttm2.c b/test/kernels/ttm_ttm/ttm2.c
new file mode 100644
index 000000000..e98f44e35
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm2.c
@@ -0,0 +1,218 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial stand-in compiled only when _OPENMP is unset; NOTE(review): <omp.h> is still included unconditionally above
+int omp_get_max_threads() { return 1; } // serial stand-in compiled only when _OPENMP is unset: reports a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of the two ints pointed to (qsort-style signature)
+  return *((const int*)a) - *((const int*)b); // NOTE(review): the subtraction can overflow when the operands have opposite signs and large magnitude
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // returns an index in [arrayStart, arrayEnd]; presumably array is sorted ascending - TODO confirm
+  if (array[arrayStart] >= target) { // first element already reaches target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // narrow until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2; // NOTE(review): the sum can overflow int for very large indices
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: array[arrayEnd] is never inspected, so arrayEnd comes back even if every element is below target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // returns an index in [arrayStart, arrayEnd]; presumably array is sorted ascending - TODO confirm
+  if (array[arrayEnd] <= target) { // last element does not exceed target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // narrow until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2; // NOTE(review): the sum can overflow int for very large indices
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: array[arrayStart] is never inspected, so arrayStart comes back even if every element exceeds target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a tensor descriptor and copies the per-mode metadata into it
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): no malloc result in this function is checked for NULL
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***)); // one slot per mode
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // dense mode: one index-array slot
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // sparse mode: two index-array slots
+        break;
+    }
+  }
+  return t; // vals and vals_size are not assigned here; the slots inside indices[i] are left uninitialized
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // releases t together with its dimensions, mode_ordering, mode_types and indices arrays
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the per-mode slot array only, not the arrays the slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed by this function
+}
+#endif
+
+int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { // builds A2593's mode-2 pos/crd arrays as a copy of A2398's stored coordinates and allocates A2593's value buffer; D is not read
+  int A25931_dimension = (int)(A2593->dimensions[0]);
+  int A25933_dimension = (int)(A2593->dimensions[2]);
+  int* restrict A25932_pos = (int*)(A2593->indices[1][0]); // overwritten by the malloc below; the incoming pointer is not freed here
+  int* restrict A25932_crd = (int*)(A2593->indices[1][1]); // overwritten by the malloc below; the incoming pointer is not freed here
+  double* restrict A2593_vals = (double*)(A2593->vals); // overwritten by the malloc near the end; the incoming pointer is not freed here
+  int A23981_dimension = (int)(A2398->dimensions[0]);
+  int* restrict A23982_pos = (int*)(A2398->indices[1][0]);
+  int* restrict A23982_crd = (int*)(A2398->indices[1][1]);
+
+  A25932_pos = (int32_t*)malloc(sizeof(int32_t) * (A25931_dimension + 1)); // NOTE(review): no malloc/realloc result in this function is checked for NULL
+  A25932_pos[0] = 0;
+  for (int32_t pA25932 = 1; pA25932 < (A25931_dimension + 1); pA25932++) {
+    A25932_pos[pA25932] = 0;
+  }
+  int32_t A25932_crd_size = 1048576; // initial crd capacity, in entries
+  A25932_crd = (int32_t*)malloc(sizeof(int32_t) * A25932_crd_size);
+  int32_t i1543A2593 = 0; // number of coordinates appended to A25932_crd so far
+
+  for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { // rows of A2398 are walked in blocks of 16
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= A23981_dimension) // padding rows of the last, partial block
+        continue;
+
+      int32_t pA25932_begin = i1543A2593;
+
+      for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) {
+        int32_t i1543 = A23982_crd[i1543A2398];
+        if (A25932_crd_size <= i1543A2593) { // crd buffer full: double its capacity
+          A25932_crd = (int32_t*)realloc(A25932_crd, sizeof(int32_t) * (A25932_crd_size * 2)); // NOTE(review): a NULL return would lose the old buffer
+          A25932_crd_size *= 2;
+        }
+        A25932_crd[i1543A2593] = i1543;
+        i1543A2593++;
+      }
+
+      A25932_pos[i1542 + 1] = i1543A2593 - pA25932_begin; // per-row count for now; NOTE(review): indexed by A2398's row while the array is sized from A2593->dimensions[0]
+    }
+  }
+
+  int32_t csA25932 = 0;
+  for (int32_t pA259320 = 1; pA259320 < (A25931_dimension + 1); pA259320++) { // prefix sum turns the per-row counts into pos offsets
+    csA25932 += A25932_pos[pA259320];
+    A25932_pos[pA259320] = csA25932;
+  }
+
+  A2593_vals = (double*)malloc(sizeof(double) * (i1543A2593 * A25933_dimension)); // one double per (stored coordinate, mode-3 index) pair; contents are left uninitialized
+
+  A2593->indices[1][0] = (uint8_t*)(A25932_pos); // publish the new arrays on the output tensor
+  A2593->indices[1][1] = (uint8_t*)(A25932_crd);
+  A2593->vals = (uint8_t*)A2593_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { // for every stored mode-2 position p of A2398 and every m < D2: A2593_vals[p*A25933_dimension+m] = sum over l < D1 of A2398_vals[p*A23983_dimension+l] * D_vals[l*D2+m]
+  int A25931_dimension = (int)(A2593->dimensions[0]); // not used below
+  int A25933_dimension = (int)(A2593->dimensions[2]);
+  double* restrict A2593_vals = (double*)(A2593->vals);
+  int A23981_dimension = (int)(A2398->dimensions[0]);
+  int A23983_dimension = (int)(A2398->dimensions[2]);
+  int* restrict A23982_pos = (int*)(A2398->indices[1][0]);
+  int* restrict A23982_crd = (int*)(A2398->indices[1][1]); // not used below
+  double* restrict A2398_vals = (double*)(A2398->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+//   int32_t i1543A2593 = 0;
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { // rows of A2398 are walked in blocks of 16; the blocks are what the pragma above distributes
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= A23981_dimension) // padding rows of the last, partial block
+        continue;
+
+      for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) {
+        for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) {
+        //   int32_t i1546A2593 = i1543A2593 * A25933_dimension + i1546;
+          int32_t i1546A2593 = i1543A2398 * A25933_dimension + i1546; // output slot derived from A2398's position, so no shared counter is needed; assumes A2593 stores coordinates in A2398's order (assemble copies them in that order)
+          double ti1545A2593_val = 0.0;
+          for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { // NOTE(review): presumably D1 equals A2398's third dimension and D2 equals A2593's third dimension - TODO confirm
+            int32_t i1545A2398 = i1543A2398 * A23983_dimension + i1545;
+            int32_t i1546D = i1545 * D2_dimension + i1546;
+            ti1545A2593_val += A2398_vals[i1545A2398] * D_vals[i1546D];
+          }
+          A2593_vals[i1546A2593] = ti1545A2593_val; // plain store: every output slot is written exactly once per call
+        }
+        // i1543A2593++;
+      }
+    }
+  }
+  return 0;
+}
+#include "ttm2.h" // sibling header in the same directory; a quoted include is searched relative to this file first, so no machine-specific absolute path is needed
+int _shim_assemble(void** parameterPack) { // entry point taking a packed argument array: slots 0..2 are cast to taco_tensor_t* and forwarded to assemble
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
+int _shim_compute(void** parameterPack) { // entry point taking a packed argument array: slots 0..2 are cast to taco_tensor_t* and forwarded to compute
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]));
+}
diff --git a/test/kernels/ttm_ttm/ttm2.h b/test/kernels/ttm_ttm/ttm2.h
new file mode 100644
index 000000000..40f1400d1
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm2.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial stand-in compiled only when _OPENMP is unset; NOTE(review): <omp.h> is still included unconditionally above
+int omp_get_max_threads() { return 1; } // serial stand-in compiled only when _OPENMP is unset: reports a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of the two ints pointed to (qsort-style signature)
+  return *((const int*)a) - *((const int*)b); // NOTE(review): the subtraction can overflow when the operands have opposite signs and large magnitude
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // returns an index in [arrayStart, arrayEnd]; presumably array is sorted ascending - TODO confirm
+  if (array[arrayStart] >= target) { // first element already reaches target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // narrow until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2; // NOTE(review): the sum can overflow int for very large indices
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: array[arrayEnd] is never inspected, so arrayEnd comes back even if every element is below target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // returns an index in [arrayStart, arrayEnd]; presumably array is sorted ascending - TODO confirm
+  if (array[arrayEnd] <= target) { // last element does not exceed target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // narrow until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2; // NOTE(review): the sum can overflow int for very large indices
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: array[arrayStart] is never inspected, so arrayStart comes back even if every element exceeds target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a tensor descriptor and copies the per-mode metadata into it
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): no malloc result in this function is checked for NULL
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***)); // one slot per mode
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // dense mode: one index-array slot
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // sparse mode: two index-array slots
+        break;
+    }
+  }
+  return t; // vals and vals_size are not assigned here; the slots inside indices[i] are left uninitialized
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // releases t together with its dimensions, mode_ordering, mode_types and indices arrays
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the per-mode slot array only, not the arrays the slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed by this function
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm2.so b/test/kernels/ttm_ttm/ttm2.so
new file mode 100755
index 000000000..16a3d2542
Binary files /dev/null and b/test/kernels/ttm_ttm/ttm2.so differ
diff --git a/test/kernels/ttm_ttm/ttm_original copy 2.c b/test/kernels/ttm_ttm/ttm_original copy 2.c
new file mode 100644
index 000000000..cb21b209f
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original copy 2.c	
@@ -0,0 +1,242 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial stand-in compiled only when _OPENMP is unset; NOTE(review): <omp.h> is still included unconditionally above
+int omp_get_max_threads() { return 1; } // serial stand-in compiled only when _OPENMP is unset: reports a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of the two ints pointed to (qsort-style signature)
+  return *((const int*)a) - *((const int*)b); // NOTE(review): the subtraction can overflow when the operands have opposite signs and large magnitude
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // returns an index in [arrayStart, arrayEnd]; presumably array is sorted ascending - TODO confirm
+  if (array[arrayStart] >= target) { // first element already reaches target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // narrow until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2; // NOTE(review): the sum can overflow int for very large indices
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: array[arrayEnd] is never inspected, so arrayEnd comes back even if every element is below target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // returns an index in [arrayStart, arrayEnd]; presumably array is sorted ascending - TODO confirm
+  if (array[arrayEnd] <= target) { // last element does not exceed target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // narrow until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2; // NOTE(review): the sum can overflow int for very large indices
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: array[arrayStart] is never inspected, so arrayStart comes back even if every element exceeds target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a tensor descriptor and copies the per-mode metadata into it
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): no malloc result in this function is checked for NULL
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***)); // one slot per mode
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // dense mode: one index-array slot
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // sparse mode: two index-array slots
+        break;
+    }
+  }
+  return t; // vals and vals_size are not assigned here; the slots inside indices[i] are left uninitialized
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // releases t together with its dimensions, mode_ordering, mode_types and indices arrays
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the per-mode slot array only, not the arrays the slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed by this function
+}
+#endif
+
+int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // builds A1537's mode-2 pos/crd arrays as a copy of B's stored mode-2 coordinates and allocates A1537's value buffer; C and D are not read
+  int A15371_dimension = (int)(A1537->dimensions[0]);
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  int* restrict A15372_pos = (int*)(A1537->indices[1][0]); // overwritten by the malloc below; the incoming pointer is not freed here
+  int* restrict A15372_crd = (int*)(A1537->indices[1][1]); // overwritten by the malloc below; the incoming pointer is not freed here
+  double* restrict A1537_vals = (double*)(A1537->vals); // overwritten by the malloc near the end; the incoming pointer is not freed here
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); // NOTE(review): no malloc/realloc result in this function is checked for NULL
+  A15372_pos[0] = 0;
+  for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) {
+    A15372_pos[pA15372] = 0;
+  }
+  int32_t A15372_crd_size = 1048576; // initial crd capacity, in entries
+  A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size);
+  int32_t i1543A1537 = 0; // number of coordinates appended to A15372_crd so far
+
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // rows of B are walked in blocks of 16
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension) // padding rows of the last, partial block
+        continue;
+
+      int32_t pA15372_begin = i1543A1537;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15372_crd_size <= i1543A1537) { // crd buffer full: double its capacity
+          A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); // NOTE(review): a NULL return would lose the old buffer
+          A15372_crd_size *= 2;
+        }
+        A15372_crd[i1543A1537] = i1543;
+        i1543A1537++;
+      }
+
+      A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; // per-row count for now; NOTE(review): indexed by B's row while the array is sized from A1537->dimensions[0]
+    }
+  }
+
+  int32_t csA15372 = 0;
+  for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { // prefix sum turns the per-row counts into pos offsets
+    csA15372 += A15372_pos[pA153720];
+    A15372_pos[pA153720] = csA15372;
+  }
+
+  A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); // one double per (stored coordinate, mode-3 index) pair; contents are left uninitialized
+
+  A1537->indices[1][0] = (uint8_t*)(A15372_pos); // publish the new arrays on the output tensor
+  A1537->indices[1][1] = (uint8_t*)(A15372_crd);
+  A1537->vals = (uint8_t*)A1537_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // zeroes A1537's values, then for every stored mode-2 position p of B and every m < D2 accumulates A1537_vals[p*A15373_dimension+m] += B_vals[q] * C_vals[crd(q)*C2+l] * D_vals[l*D2+m] over B's mode-3 entries q under p and l < D1
+  int A15371_dimension = (int)(A1537->dimensions[0]);
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  int* restrict A15372_pos = (int*)(A1537->indices[1][0]);
+  double* restrict A1537_vals = (double*)(A1537->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]); // not used below
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]); // not used below
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+  // int32_t i1543A1537 = 0;
+
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1537 = 0; pA1537 < (A15372_pos[A15371_dimension] * A15373_dimension); pA1537++) { // the accumulation below needs every output slot to start at zero
+    A1537_vals[pA1537] = 0.0;
+  }
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // rows of B are walked in blocks of 16; the blocks are what the pragma above distributes
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension) // padding rows of the last, partial block
+        continue;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+          int32_t i1544 = B3_crd[i1544B];
+          for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) { // l and m are tiled 32 x 32
+            for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) {
+              for (int32_t i1554 = 0; i1554 < 32; i1554++) {
+                int32_t i1545 = i1553 * 32 + i1554;
+                int32_t i1545C = i1544 * C2_dimension + i1545;
+                if (i1545 >= D1_dimension) // padding of the last l tile
+                  continue;
+
+                for (int32_t i1556 = 0; i1556 < 32; i1556++) {
+                  int32_t i1546 = i1555 * 32 + i1556;
+                  // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546;
+                  int32_t i1546A1537 = i1543B * A15373_dimension + i1546; // fix: the output slot is keyed by B's mode-2 position (which A1537's layout mirrors), not by the mode-3 position i1544B, which indexed past the value buffer
+                  int32_t i1546D = i1545 * D2_dimension + i1546;
+                  if (i1546 >= D2_dimension) // padding of the last m tile
+                    continue;
+
+                  A1537_vals[i1546A1537] = A1537_vals[i1546A1537] + (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D];
+                }
+              }
+            }
+          }
+        }
+        
+        // i1543A1537++;
+      }
+    }
+  }
+  return 0;
+}
+#include "ttm_original.h" // header from the same directory; a quoted include is searched relative to this file first, so no machine-specific absolute path is needed
+int _shim_assemble(void** parameterPack) { // entry point taking a packed argument array: slots 0..3 are cast to taco_tensor_t* and forwarded to assemble
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // entry point taking a packed argument array: slots 0..3 are cast to taco_tensor_t* and forwarded to compute
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/ttm_ttm/ttm_original copy.c b/test/kernels/ttm_ttm/ttm_original copy.c
new file mode 100644
index 000000000..2db396c0a
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original copy.c	
@@ -0,0 +1,225 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; } // serial stand-in compiled only when _OPENMP is unset; NOTE(review): <omp.h> is still included unconditionally above
+int omp_get_max_threads() { return 1; } // serial stand-in compiled only when _OPENMP is unset: reports a single thread
+#endif
+int cmp(const void *a, const void *b) { // three-way comparison of the two ints pointed to (qsort-style signature)
+  return *((const int*)a) - *((const int*)b); // NOTE(review): the subtraction can overflow when the operands have opposite signs and large magnitude
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { // returns an index in [arrayStart, arrayEnd]; presumably array is sorted ascending - TODO confirm
+  if (array[arrayStart] >= target) { // first element already reaches target
+    return arrayStart;
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) { // narrow until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2; // NOTE(review): the sum can overflow int for very large indices
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return upperBound; // no exact match: array[arrayEnd] is never inspected, so arrayEnd comes back even if every element is below target
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { // returns an index in [arrayStart, arrayEnd]; presumably array is sorted ascending - TODO confirm
+  if (array[arrayEnd] <= target) { // last element does not exceed target
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) { // narrow until the two bounds are adjacent
+    int mid = (upperBound + lowerBound) / 2; // NOTE(review): the sum can overflow int for very large indices
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid; // exact match
+    }
+  }
+  return lowerBound; // no exact match: array[arrayStart] is never inspected, so arrayStart comes back even if every element exceeds target
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, // heap-allocates a tensor descriptor and copies the per-mode metadata into it
+                                  int32_t* dimensions, int32_t* mode_ordering, // each input array is read at indices 0..order-1
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); // NOTE(review): no malloc result in this function is checked for NULL
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***)); // one slot per mode
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); // dense mode: one index-array slot
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); // sparse mode: two index-array slots
+        break;
+    }
+  }
+  return t; // vals and vals_size are not assigned here; the slots inside indices[i] are left uninitialized
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) { // releases t together with its dimensions, mode_ordering, mode_types and indices arrays
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]); // frees the per-mode slot array only, not the arrays the slots point to
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t); // t->vals is not freed by this function
+}
+#endif
+
+int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // builds A1537's mode-2 pos/crd arrays as a copy of B's stored mode-2 coordinates and allocates A1537's value buffer; C and D are not read
+  int A15371_dimension = (int)(A1537->dimensions[0]);
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  int* restrict A15372_pos = (int*)(A1537->indices[1][0]); // overwritten by the malloc below; the incoming pointer is not freed here
+  int* restrict A15372_crd = (int*)(A1537->indices[1][1]); // overwritten by the malloc below; the incoming pointer is not freed here
+  double* restrict A1537_vals = (double*)(A1537->vals); // overwritten by the malloc near the end; the incoming pointer is not freed here
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); // NOTE(review): no malloc/realloc result in this function is checked for NULL
+  A15372_pos[0] = 0;
+  for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) {
+    A15372_pos[pA15372] = 0;
+  }
+  int32_t A15372_crd_size = 1048576; // initial crd capacity, in entries
+  A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size);
+  int32_t i1543A1537 = 0; // number of coordinates appended to A15372_crd so far
+
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // rows of B are walked in blocks of 16
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension) // padding rows of the last, partial block
+        continue;
+
+      int32_t pA15372_begin = i1543A1537;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15372_crd_size <= i1543A1537) { // crd buffer full: double its capacity
+          A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); // NOTE(review): a NULL return would lose the old buffer
+          A15372_crd_size *= 2;
+        }
+        A15372_crd[i1543A1537] = i1543;
+        i1543A1537++;
+      }
+
+      A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; // per-row count for now; NOTE(review): indexed by B's row while the array is sized from A1537->dimensions[0]
+    }
+  }
+
+  int32_t csA15372 = 0;
+  for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { // prefix sum turns the per-row counts into pos offsets
+    csA15372 += A15372_pos[pA153720];
+    A15372_pos[pA153720] = csA15372;
+  }
+
+  A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); // one double per (stored coordinate, mode-3 index) pair; contents are left uninitialized
+
+  A1537->indices[1][0] = (uint8_t*)(A15372_pos); // publish the new arrays on the output tensor
+  A1537->indices[1][1] = (uint8_t*)(A15372_crd);
+  A1537->vals = (uint8_t*)A1537_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // for every stored mode-2 position p of B and every m < D2: A1537_vals[p*A15373_dimension+m] = sum over B's mode-3 entries q under p and l < D1 of B_vals[q] * C_vals[crd(q)*C2+l] * D_vals[l*D2+m]
+  int A15371_dimension = (int)(A1537->dimensions[0]); // not used below
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  double* restrict A1537_vals = (double*)(A1537->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]); // not used below
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]); // not used below
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+  // int32_t i1543A1537 = 0;
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // rows of B are walked in blocks of 16; the blocks are what the pragma above distributes
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension) // padding rows of the last, partial block
+        continue;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) {
+          // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546;
+          int32_t i1546A1537 = i1543B * A15373_dimension + i1546; // output slot derived from B's mode-2 position, so no shared counter is needed; assumes A1537 stores coordinates in B's order (assemble copies them in that order)
+          double ti1544A1537_val = 0.0;
+          for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+            int32_t i1544 = B3_crd[i1544B];
+            for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { // NOTE(review): C is strided by C2_dimension but l runs to D1_dimension; presumably the two are equal - TODO confirm
+              int32_t i1545C = i1544 * C2_dimension + i1545;
+              int32_t i1546D = i1545 * D2_dimension + i1546;
+              ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D];
+            }
+          }
+          A1537_vals[i1546A1537] = ti1544A1537_val; // plain store: every output slot is written exactly once per call
+        }
+        // i1543A1537++;
+      }
+    }
+  }
+  return 0;
+}
+#include "ttm_original.h" // header from the same directory; a quoted include is searched relative to this file first, so no machine-specific absolute path is needed
+int _shim_assemble(void** parameterPack) { // entry point taking a packed argument array: slots 0..3 are cast to taco_tensor_t* and forwarded to assemble
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) { // entry point taking a packed argument array: slots 0..3 are cast to taco_tensor_t* and forwarded to compute
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/ttm_ttm/ttm_original.c b/test/kernels/ttm_ttm/ttm_original.c
new file mode 100644
index 000000000..ac2674239
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original.c
@@ -0,0 +1,226 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) {  // three-way comparator over two pointed-to ints: returns negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // a plain subtraction can overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {  // binary search over ascending array[arrayStart..arrayEnd]
+  if (array[arrayStart] >= target) {
+    return arrayStart;  // the first element already reaches target
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum cannot overflow
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;  // no exact hit: lowest probed index holding a value > target, or arrayEnd if none was found
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {  // binary search over ascending array[arrayStart..arrayEnd]
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;  // the last element does not exceed target
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum cannot overflow
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;  // no exact hit: highest probed index holding a value < target, or arrayStart if none was found
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,  // builds a heap-allocated tensor descriptor with `order` modes
+                                  int32_t* dimensions, int32_t* mode_ordering,  // both arrays are copied, `order` entries each
+                                  taco_mode_t* mode_types) {  // copied as well; selects the size of each per-mode index table
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));  // zero-filled so vals and vals_size never hold garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));  // zero-filled: a mode skipped by the switch stays safe to free()
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));  // one index-array slot, zero-filled
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));  // two index-array slots, zero-filled
+        break;
+    }
+  }
+  return t;  // allocation results are not checked; release with deinit_taco_tensor_t()
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {  // frees the arrays held directly by the descriptor, then the descriptor itself
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]);  // only the per-mode slot table; the index arrays those slots point to are not freed here
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);  // t->vals is not freed by this function
+}
+#endif
+// assemble: allocates A1537's mode-2 pos/crd arrays and its value array, copying B's mode-2 coordinates; C and D are not read.
+int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  int A15371_dimension = (int)(A1537->dimensions[0]);
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  int* restrict A15372_pos = (int*)(A1537->indices[1][0]);
+  int* restrict A15372_crd = (int*)(A1537->indices[1][1]);
+  double* restrict A1537_vals = (double*)(A1537->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  // The three A pointers loaded above are replaced by fresh allocations below; the previous arrays are not freed here.
+  A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1));
+  A15372_pos[0] = 0;
+  for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) {
+    A15372_pos[pA15372] = 0;
+  }
+  int32_t A15372_crd_size = 1048576;  // initial crd capacity in entries; doubled on demand below
+  A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size);
+  int32_t i1543A1537 = 0;  // number of coordinates appended to A's crd so far
+  // Visit B's first mode in chunks of 16; the bounds check skips the unused tail of the last chunk.
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+
+      int32_t pA15372_begin = i1543A1537;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15372_crd_size <= i1543A1537) {
+          A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2));  // result is not checked for NULL
+          A15372_crd_size *= 2;
+        }
+        A15372_crd[i1543A1537] = i1543;
+        i1543A1537++;
+      }
+      // Store this row's entry count for now. NOTE(review): pos is sized from A's dimension but indexed by B's rows - presumably equal; confirm.
+      A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin;
+    }
+  }
+  // Prefix sum: turns the per-row counts held in pos[1..] into cumulative offsets.
+  int32_t csA15372 = 0;
+  for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) {
+    csA15372 += A15372_pos[pA153720];
+    A15372_pos[pA153720] = csA15372;
+  }
+  // A15373_dimension doubles per stored coordinate; the buffer contents are left uninitialized.
+  A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension));
+  // Publish the new arrays on the output tensor.
+  A1537->indices[1][0] = (uint8_t*)(A15372_pos);
+  A1537->indices[1][1] = (uint8_t*)(A15372_crd);
+  A1537->vals = (uint8_t*)A1537_vals;
+  return 0;
+}
+// compute: for each stored (i,j) position of B, writes A(i,j,m) = sum over k,l of B(i,j,k) * C(k,l) * D(l,m) into A1537's values.
+int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  int A15371_dimension = (int)(A1537->dimensions[0]);
+  int A15373_dimension = (int)(A1537->dimensions[2]);
+  double* restrict A1537_vals = (double*)(A1537->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+  // int32_t i1543A1537 = 0;   (running output counter, disabled: the output slot is derived from B's position instead)
+  // The chunk loop below is the parallel loop; each chunk covers up to 16 consecutive rows of B.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) {
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension)
+        continue;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) {
+          // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546;   (previous, counter-based addressing)
+          int32_t i1546A1537 = i1543B * A15373_dimension + i1546;  // NOTE(review): assumes A's mode-2 positions coincide with B's - confirm
+          double ti1544A1537_val = 0.0;
+          for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+            int32_t i1544 = B3_crd[i1544B];
+            for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {  // NOTE(review): bound is D1_dimension while C is strided by C2_dimension - presumably equal
+              int32_t i1545C = i1544 * C2_dimension + i1545;
+              int32_t i1546D = i1545 * D2_dimension + i1546;
+              ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D];
+            }
+          }
+          A1537_vals[i1546A1537] = ti1544A1537_val;  // plain store: the slot is overwritten, not accumulated into
+        }
+        // i1543A1537++;   (disabled together with the counter above)
+      }
+    }
+  }
+  return 0;
+}
+
+#include "ttm_original.h"  // sibling header, resolved relative to this file instead of a machine-specific absolute path
+int _shim_assemble(void** parameterPack) {  // packed-argument entry point: slots 0..3 are forwarded to assemble() as taco_tensor_t*
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) {  // packed-argument entry point: slots 0..3 are forwarded to compute() as taco_tensor_t*
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/ttm_ttm/ttm_original.h b/test/kernels/ttm_ttm/ttm_original.h
new file mode 100644
index 000000000..a27841047
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) {  // three-way comparator over two pointed-to ints: returns negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // a plain subtraction can overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {  // binary search over ascending array[arrayStart..arrayEnd]
+  if (array[arrayStart] >= target) {
+    return arrayStart;  // the first element already reaches target
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum cannot overflow
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;  // no exact hit: lowest probed index holding a value > target, or arrayEnd if none was found
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {  // binary search over ascending array[arrayStart..arrayEnd]
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;  // the last element does not exceed target
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum cannot overflow
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;  // no exact hit: highest probed index holding a value < target, or arrayStart if none was found
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,  // builds a heap-allocated tensor descriptor with `order` modes
+                                  int32_t* dimensions, int32_t* mode_ordering,  // both arrays are copied, `order` entries each
+                                  taco_mode_t* mode_types) {  // copied as well; selects the size of each per-mode index table
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));  // zero-filled so vals and vals_size never hold garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));  // zero-filled: a mode skipped by the switch stays safe to free()
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));  // one index-array slot, zero-filled
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));  // two index-array slots, zero-filled
+        break;
+    }
+  }
+  return t;  // allocation results are not checked; release with deinit_taco_tensor_t()
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {  // frees the arrays held directly by the descriptor, then the descriptor itself
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]);  // only the per-mode slot table; the index arrays those slots point to are not freed here
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);  // t->vals is not freed by this function
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm_original.so b/test/kernels/ttm_ttm/ttm_original.so
new file mode 100755
index 000000000..fa04aed35
Binary files /dev/null and b/test/kernels/ttm_ttm/ttm_original.so differ
diff --git a/test/kernels/ttm_ttm/ttm_original2.c b/test/kernels/ttm_ttm/ttm_original2.c
new file mode 100644
index 000000000..8dd62d6dd
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original2.c
@@ -0,0 +1,229 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) {  // three-way comparator over two pointed-to ints: returns negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // a plain subtraction can overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {  // binary search over ascending array[arrayStart..arrayEnd]
+  if (array[arrayStart] >= target) {
+    return arrayStart;  // the first element already reaches target
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum cannot overflow
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;  // no exact hit: lowest probed index holding a value > target, or arrayEnd if none was found
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {  // binary search over ascending array[arrayStart..arrayEnd]
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;  // the last element does not exceed target
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum cannot overflow
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;  // no exact hit: highest probed index holding a value < target, or arrayStart if none was found
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,  // builds a heap-allocated tensor descriptor with `order` modes
+                                  int32_t* dimensions, int32_t* mode_ordering,  // both arrays are copied, `order` entries each
+                                  taco_mode_t* mode_types) {  // copied as well; selects the size of each per-mode index table
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));  // zero-filled so vals and vals_size never hold garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));  // zero-filled: a mode skipped by the switch stays safe to free()
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));  // one index-array slot, zero-filled
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));  // two index-array slots, zero-filled
+        break;
+    }
+  }
+  return t;  // allocation results are not checked; release with deinit_taco_tensor_t()
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {  // frees the arrays held directly by the descriptor, then the descriptor itself
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]);  // only the per-mode slot table; the index arrays those slots point to are not freed here
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);  // t->vals is not freed by this function
+}
+#endif
+// assemble: allocates A1542's mode-2 pos/crd arrays and its value array, copying B's mode-2 coordinates; C and D are not read.
+int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  int A15421_dimension = (int)(A1542->dimensions[0]);
+  int A15423_dimension = (int)(A1542->dimensions[2]);
+  int* restrict A15422_pos = (int*)(A1542->indices[1][0]);
+  int* restrict A15422_crd = (int*)(A1542->indices[1][1]);
+  double* restrict A1542_vals = (double*)(A1542->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  // The three A pointers loaded above are replaced by fresh allocations below; the previous arrays are not freed here.
+  A15422_pos = (int32_t*)malloc(sizeof(int32_t) * (A15421_dimension + 1));
+  A15422_pos[0] = 0;
+  for (int32_t pA15422 = 1; pA15422 < (A15421_dimension + 1); pA15422++) {
+    A15422_pos[pA15422] = 0;
+  }
+  int32_t A15422_crd_size = 1048576;  // initial crd capacity in entries; doubled on demand below
+  A15422_crd = (int32_t*)malloc(sizeof(int32_t) * A15422_crd_size);
+  int32_t i1548A1542 = 0;  // number of coordinates appended to A's crd so far
+  // Visit B's first mode in chunks of 16; the bounds check skips the unused tail of the last chunk.
+  for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) {
+    for (int32_t i1553 = 0; i1553 < 16; i1553++) {
+      int32_t i1547 = i1552 * 16 + i1553;
+      if (i1547 >= B1_dimension)
+        continue;
+
+      int32_t pA15422_begin = i1548A1542;
+
+      for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) {
+        int32_t i1548 = B2_crd[i1548B];
+        if (A15422_crd_size <= i1548A1542) {
+          A15422_crd = (int32_t*)realloc(A15422_crd, sizeof(int32_t) * (A15422_crd_size * 2));  // result is not checked for NULL
+          A15422_crd_size *= 2;
+        }
+        A15422_crd[i1548A1542] = i1548;
+        i1548A1542++;
+      }
+      // Store this row's entry count for now. NOTE(review): pos is sized from A's dimension but indexed by B's rows - presumably equal; confirm.
+      A15422_pos[i1547 + 1] = i1548A1542 - pA15422_begin;
+    }
+  }
+  // Prefix sum: turns the per-row counts held in pos[1..] into cumulative offsets.
+  int32_t csA15422 = 0;
+  for (int32_t pA154220 = 1; pA154220 < (A15421_dimension + 1); pA154220++) {
+    csA15422 += A15422_pos[pA154220];
+    A15422_pos[pA154220] = csA15422;
+  }
+  // A15423_dimension doubles per stored coordinate; the buffer contents are left uninitialized.
+  A1542_vals = (double*)malloc(sizeof(double) * (i1548A1542 * A15423_dimension));
+  // Publish the new arrays on the output tensor.
+  A1542->indices[1][0] = (uint8_t*)(A15422_pos);
+  A1542->indices[1][1] = (uint8_t*)(A15422_crd);
+  A1542->vals = (uint8_t*)A1542_vals;
+  return 0;
+}
+// compute: zeroes A1542's values, then accumulates A(i,j,m) += B(i,j,k) * C(k,l) * D(l,m) over B's stored entries and all l, m.
+int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+  int A15421_dimension = (int)(A1542->dimensions[0]);
+  int A15423_dimension = (int)(A1542->dimensions[2]);
+  int* restrict A15422_pos = (int*)(A1542->indices[1][0]);
+  double* restrict A1542_vals = (double*)(A1542->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+//   int32_t i1548A1542 = 0;   (running output counter, disabled: the output slot is derived from B's position instead)
+  // Clear every value slot first: the main loop below accumulates into A1542_vals rather than overwriting it.
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1542 = 0; pA1542 < (A15422_pos[A15421_dimension] * A15423_dimension); pA1542++) {
+    A1542_vals[pA1542] = 0.0;
+  }
+  // The chunk loop below is the parallel loop; each chunk covers up to 16 consecutive rows of B.
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) {
+    for (int32_t i1553 = 0; i1553 < 16; i1553++) {
+      int32_t i1547 = i1552 * 16 + i1553;
+      if (i1547 >= B1_dimension)
+        continue;
+
+      for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) {
+        for (int32_t i1549B = B3_pos[i1548B]; i1549B < B3_pos[(i1548B + 1)]; i1549B++) {
+          int32_t i1549 = B3_crd[i1549B];
+          for (int32_t i1550 = 0; i1550 < D1_dimension; i1550++) {  // NOTE(review): bound is D1_dimension while C is strided by C2_dimension - presumably equal
+            int32_t i1550C = i1549 * C2_dimension + i1550;
+            for (int32_t i1551 = 0; i1551 < D2_dimension; i1551++) {
+            //   int32_t i1551A1542 = i1548A1542 * A15423_dimension + i1551;   (previous, counter-based addressing)
+              int32_t i1551A1542 = i1548B * A15423_dimension + i1551;  // NOTE(review): assumes A's mode-2 positions coincide with B's - confirm
+              int32_t i1551D = i1550 * D2_dimension + i1551;
+              A1542_vals[i1551A1542] = A1542_vals[i1551A1542] + (B_vals[i1549B] * C_vals[i1550C]) * D_vals[i1551D];
+            }
+          }
+        }
+        // i1548A1542++;   (disabled together with the counter above)
+      }
+    }
+  }
+  return 0;
+}
+#include "ttm_original2.h"  // sibling header, resolved relative to this file instead of a machine-specific absolute path
+int _shim_assemble(void** parameterPack) {  // packed-argument entry point: slots 0..3 are forwarded to assemble() as taco_tensor_t*
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) {  // packed-argument entry point: slots 0..3 are forwarded to compute() as taco_tensor_t*
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/ttm_ttm/ttm_original2.h b/test/kernels/ttm_ttm/ttm_original2.h
new file mode 100644
index 000000000..8a08b4548
--- /dev/null
+++ b/test/kernels/ttm_ttm/ttm_original2.h
@@ -0,0 +1,125 @@
+#ifndef TACO_C_HEADERS
+#define TACO_C_HEADERS
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <string.h>
+#include <omp.h>
+#if _OPENMP
+#include <omp.h>
+#endif
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
+#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
+#ifndef TACO_TENSOR_T_DEFINED
+#define TACO_TENSOR_T_DEFINED
+typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
+typedef struct {
+  int32_t      order;         // tensor order (number of modes)
+  int32_t*     dimensions;    // tensor dimensions
+  int32_t      csize;         // component size
+  int32_t*     mode_ordering; // mode storage ordering
+  taco_mode_t* mode_types;    // mode storage types
+  uint8_t***   indices;       // tensor index data (per mode)
+  uint8_t*     vals;          // tensor values
+  int32_t      vals_size;     // values array size
+} taco_tensor_t;
+#endif
+#if !_OPENMP
+int omp_get_thread_num() { return 0; }
+int omp_get_max_threads() { return 1; }
+#endif
+int cmp(const void *a, const void *b) {  // three-way comparator over two pointed-to ints: returns negative, zero or positive
+  return (*((const int*)a) > *((const int*)b)) - (*((const int*)a) < *((const int*)b));  // a plain subtraction can overflow int
+}
+int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {  // binary search over ascending array[arrayStart..arrayEnd]
+  if (array[arrayStart] >= target) {
+    return arrayStart;  // the first element already reaches target
+  }
+  int lowerBound = arrayStart; // always < target
+  int upperBound = arrayEnd; // always >= target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum cannot overflow
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return upperBound;  // no exact hit: lowest probed index holding a value > target, or arrayEnd if none was found
+}
+int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {  // binary search over ascending array[arrayStart..arrayEnd]
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;  // the last element does not exceed target
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = lowerBound + (upperBound - lowerBound) / 2; // same midpoint for non-negative bounds, but the sum cannot overflow
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;  // no exact hit: highest probed index holding a value < target, or arrayStart if none was found
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,  // builds a heap-allocated tensor descriptor with `order` modes
+                                  int32_t* dimensions, int32_t* mode_ordering,  // both arrays are copied, `order` entries each
+                                  taco_mode_t* mode_types) {  // copied as well; selects the size of each per-mode index table
+  taco_tensor_t* t = (taco_tensor_t *) calloc(1, sizeof(taco_tensor_t));  // zero-filled so vals and vals_size never hold garbage
+  t->order         = order;
+  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices       = (uint8_t ***) calloc(order, sizeof(uint8_t **));  // zero-filled: a mode skipped by the switch stays safe to free()
+  t->csize         = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i]    = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i]    = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) calloc(1, sizeof(uint8_t *));  // one index-array slot, zero-filled
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) calloc(2, sizeof(uint8_t *));  // two index-array slots, zero-filled
+        break;
+    }
+  }
+  return t;  // allocation results are not checked; release with deinit_taco_tensor_t()
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {  // frees the arrays held directly by the descriptor, then the descriptor itself
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]);  // only the per-mode slot table; the index arrays those slots point to are not freed here
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);  // t->vals is not freed by this function
+}
+#endif
+
+#ifndef TACO_GENERATED_assemble
+#define TACO_GENERATED_assemble
+int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
+
+#ifndef TACO_GENERATED_compute
+#define TACO_GENERATED_compute
+int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D);
+#endif
diff --git a/test/kernels/ttm_ttm/ttm_original2.so b/test/kernels/ttm_ttm/ttm_original2.so
new file mode 100755
index 000000000..6466a2af2
Binary files /dev/null and b/test/kernels/ttm_ttm/ttm_original2.so differ
diff --git a/test/test.cpp b/test/test.cpp
index a49f10ff7..851493b7f 100644
--- a/test/test.cpp
+++ b/test/test.cpp
@@ -38,6 +38,20 @@ void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) {
   ASSERT_TRUE(equals(expected, actual));
 }
 
+// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual) {
+//   std::cout << "order: " << expected.getOrder();
+//   std::vector<int> modes{};
+//   for (int mode = 0; mode < expected.getOrder(); mode++) {
+//     if (expected.getDimension(mode) != actual.getDimension(mode)) {
+//       ASSERT_TRUE(false);
+//     }
+
+//     for (int i=0; i<expected.getDimension(mode); i++) {
+//       std::cout << expected(i) << " " << actual(i) << std::endl;
+//     }
+//   }
+// }
+
 std::string testDirectory() {
   return TO_STRING(TACO_TEST_DIR);
 }
diff --git a/test/test.h b/test/test.h
index 3302bf81f..1c8f5172e 100644
--- a/test/test.h
+++ b/test/test.h
@@ -61,6 +61,7 @@ void ASSERT_VECTOR_EQ(std::vector<T> expected,
 
 void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual);
 void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual);
+// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual);
 
 template <typename T>
 void ASSERT_COMPONENTS_EQUALS(vector<vector<vector<int>>> expectedIndices,
diff --git a/test/tests-indexstmt.cpp b/test/tests-indexstmt.cpp
index e2a972430..ae80e5493 100644
--- a/test/tests-indexstmt.cpp
+++ b/test/tests-indexstmt.cpp
@@ -1,10 +1,13 @@
+#include "taco/index_notation/kernel.h"
+#include "taco/type.h"
 #include "test.h"
 #include "test_tensors.h"
 #include "taco/tensor.h"
 #include "taco/index_notation/index_notation.h"
+#include "taco/index_notation/transformations.h"
 
 using namespace taco;
-const IndexVar i("i"), j("j"), k("k");
+const IndexVar i("i"), j("j"), k("k"), l("l"), m("m");
 
 TEST(indexstmt, assignment) {
   Type t(type<double>(), {3});
@@ -84,4 +87,192 @@ TEST(indexstmt, spmm) {
 }
 
 
+TEST(indexstmt, sddmm) {
+  Type t(type<double>(), {3,3});
+  TensorVar A("A", t, {Sparse, Dense});
+  TensorVar B("B", t, {Sparse, Dense});
+  TensorVar C("C", t, {Dense, Dense});
+  TensorVar w("w", Type(type<double>(),{3}), Dense);
+  // NOTE(review): the test is registered as sddmm, but the statement is named spmm and has the shape of a matrix product - confirm the name.
+  // Concrete index notation using w as a temporary. The arguments of where() are (consumer, producer):
+  // the producer accumulates B(i,k)*C(k,j) into w(j) and the consumer copies w(j) into A(i,j).
+  IndexStmt spmm = forall(i,
+                     forall(k,
+                            where(forall(j, A(i,j) = w(j)),
+                                  forall(j,   w(j) += B(i,k)*C(k,j))
+                                  )
+                            )
+                     );
+
+  // Print the statement, reorder its loops topologically, print it again, then compile it.
+  // Nothing is asserted on the result; the test only exercises these calls.
+
+  std::cout << spmm << std::endl;
+  spmm = reorderLoopsTopologically(spmm);
+  std::cout << "topologically reordered loops statement: " << spmm << std::endl;
+
+  Kernel kernel = compile(spmm);
+}
+
+TEST(indexstmt, sddmmPlusSpmm) {
+  // Compiles the same computation twice: as one loop nest (fused1) and split through a temporary A (fused2).
+  // Y(i,l) += B(i,j) * C(i,k) * D(j,k) * E(j,l);
+  // indexstmt order i, j, k, l
+  // NOTE(review): the output below names D(k,j) and F(j,l), unlike the code - presumably pasted from an earlier variant: topologically reordered loops statement: forall(i, forall(k, forall(j, forall(l, Y(i,l) += B(i,j) * C(i,k) * D(k,j) * F(j,l), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces)
+
+  Type t(type<double>(), {3,3});
+  TensorVar Y("Y", t, {Dense, Dense});
+  TensorVar B("B", t, {Dense, Sparse});
+  TensorVar C("C", t, {Dense, Dense});
+  TensorVar D("D", t, {Dense, Dense});
+  TensorVar E("E", t, {Dense, Dense});
+
+  // TensorVar A("A", Type(type<double>(),{3}), );
+  TensorVar A("A", Type());  // used without indices in fused2, as the temporary of the where()
+  // fused1: all four loops nested around a single accumulation into Y.
+  IndexStmt fused1 = 
+  forall(i,
+    forall(j,
+      forall(k,
+        forall(l, Y(i,l) += B(i,j) * C(i,k) * D(j,k) * E(j,l))
+      )
+    )
+  );
+
+  std::cout << "before topological sort" << fused1 << std::endl;
+  fused1 = reorderLoopsTopologically(fused1);
+  std::cout << "after topological sort" << fused1 << std::endl;
+
+  Kernel kernel = compile(fused1);
+
+  // fused2: for each (i,j) the producer reduces over k into A and the consumer scales E(j,l) by A into Y(i,l).
+  IndexStmt fused2 =
+  forall(i,
+    forall(j,
+      where(
+        forall(l, Y(i,l) += A * E(j,l)), // consumer
+        forall(k, A += B(i,j)*C(i,k)*D(j,k)) // producer
+      )
+    )
+  );
+
+  Kernel kernel2 = compile(fused2);
+  // Nothing is asserted on either kernel; the test only exercises reorderLoopsTopologically() and compile().
+} 
+
+
+
+TEST(indexstmt, mttkrpPlusSpmm) {
+  // Compiles A(i,m) += B(i,k,l) * C(k,j) * D(l,j) * E(j,m) twice: as one loop nest (fused1) and through a workspace (fused2).
+  // ./bin/taco "A(i,m)=B(i,k,l)*C(k,j)*D(l,j)*E(j,m)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -f=E:dd:0,1
+
+  // extents: i = 11, k = 5, l = 7, j = 8, m = 6
+  long unsigned int idim = 11, kdim = 5, ldim = 7, jdim = 8, mdim = 6;
+
+  Type atype(type<double>(), {idim, mdim});
+  Type btype(type<double>(), {idim, kdim, ldim});
+  Type ctype(type<double>(), {kdim, jdim});
+  Type dtype(type<double>(), {ldim, jdim});
+  Type etype(type<double>(), {jdim, mdim});
+
+  TensorVar A("A", atype, {Dense, Dense});
+  TensorVar B("B", btype, {Sparse, Sparse, Sparse});
+  TensorVar C("C", ctype, {Dense, Dense});
+  TensorVar D("D", dtype, {Dense, Dense});
+  TensorVar E("E", etype, {Dense, Dense});
+  // Workspace of extent jdim; it is the temporary of the where() in fused2.
+  TensorVar ws("ws", Type(type<double>(), {jdim}) );
+  // fused1: all five loops nested around a single accumulation into A.
+  IndexStmt fused1 = 
+  forall(i,
+    forall(k,
+      forall(l,
+        forall(j,
+          forall(m, A(i,m) += B(i,k,l) * C(k,j) * D(l,j) * E(j,m))
+        )
+      )
+    )
+  );
+
+  std::cout << "before topological sort" << fused1 << std::endl;
+  fused1 = reorderLoopsTopologically(fused1);
+  std::cout << "after topological sort" << fused1 << std::endl;
+
+  Kernel kernel = compile(fused1);
+  // fused2: for each i the producer (second argument of where) fills ws(j); the consumer (first) combines it with E(j,m).
+  IndexStmt fused2 =
+  forall(i,
+    where(
+      forall(j,
+        forall(m, 
+          A(i,m) += ws(j) * E(j,m)
+        )
+      )
+      ,
+      forall(k,
+        forall(l,
+          forall(j, 
+            ws(j) += B(i,k,l) * C(k,j) * D(l,j)
+          )
+        )
+      )
+    )
+  );
+
+  Kernel kernel2 = compile(fused2);
+  // Nothing is asserted on either kernel; the test only exercises reorderLoopsTopologically() and compile().
+}
+
+// ./bin/taco "y(i)=A(i,j)*B(j,k)*v(k)" -f=y:d:0 -f=A:dd:0,1 -f=B:dd:0,1 -f=v:d:0
+TEST(indexstmt, mmPlusSpmv) {
+  // Compiles y(i) += A(i,j) * B(j,k) * v(k) twice: as one loop nest (fused1) and through a workspace (fused2).
+  // Nothing is asserted on either kernel; the test only exercises reorderLoopsTopologically() and compile().
+
+  long unsigned int idim = 11, jdim = 8, kdim = 5;
+
+  Type ytype(type<double>(), {idim});
+  Type atype(type<double>(), {idim, jdim});
+  Type btype(type<double>(), {jdim, kdim});
+  Type vtype(type<double>(), {kdim});
+
+  TensorVar y("y", ytype, {Dense});
+  TensorVar A("A", atype, {Dense, Dense});
+  TensorVar B("B", btype, {Dense, Dense});
+  TensorVar v("v", vtype, {Dense});
+  // Workspace of extent jdim; it is the temporary of the where() in fused2.
+  TensorVar ws("ws", Type(type<double>(), {jdim}) );
+  // fused1: loops over exactly the variables used by the expression (i, j, k); no access is indexed by m.
+  IndexStmt fused1 =
+  forall(i,
+    forall(j,
+      forall(k,
+        y(i) += A(i,j) * B(j,k) * v(k)
+      )
+    )
+  );
+
+  std::cout << "before topological sort" << fused1 << std::endl;
+  fused1 = reorderLoopsTopologically(fused1);
+  std::cout << "after topological sort" << fused1 << std::endl;
+
+  Kernel kernel = compile(fused1);
+  // fused2: the producer (second argument of where) computes ws(j) = sum over k of B(j,k) * v(k); the consumer folds it into y(i).
+  IndexStmt fused2 =
+  where(
+    forall(i,
+      forall(j,
+        y(i) += A(i,j) * ws(j)
+      )
+    )
+    ,
+    forall(j,
+      forall(k,
+        ws(j) += B(j,k) * v(k)
+      )
+    )
+  );
+
+  Kernel kernel2 = compile(fused2);
+}
+
 
diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp
index 52bd74ab4..3c5362118 100644
--- a/test/tests-scheduling-eval.cpp
+++ b/test/tests-scheduling-eval.cpp
@@ -1,42 +1,8 @@
-#include <taco/index_notation/transformations.h>
-#include <codegen/codegen_c.h>
-#include <codegen/codegen_cuda.h>
-#include <fstream>
-#include "test.h"
-#include "test_tensors.h"
-#include "taco/tensor.h"
-#include "taco/index_notation/index_notation.h"
-#include "taco/index_notation/transformations.h"
-#include "codegen/codegen.h"
-#include "taco/lower/lower.h"
-
-using namespace taco;
+#include "util.h"
+
 const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n");
 int WARP_SIZE = 32;
 
-void printToCout(IndexStmt stmt) {
-  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen);
-  ir::Stmt compute = lower(stmt, "compute", false, true);
-  codegen->compile(compute, true);
-}
-
-void printToFile(string filename, IndexStmt stmt) {
-  stringstream source;
-
-  string file_path = "eval_generated/";
-  mkdir(file_path.c_str(), 0777);
-
-  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen);
-  ir::Stmt compute = lower(stmt, "compute",  false, true);
-  codegen->compile(compute, true);
-
-  ofstream source_file;
-  string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c";
-  source_file.open(file_path + filename + file_ending);
-  source_file << source.str();
-  source_file.close();
-}
-
 IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) {
   IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1");
   return stmt.split(i, i0, i1, CHUNK_SIZE)
@@ -107,6 +73,27 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16,
           .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
 }
 
+IndexStmt scheduleSDDMMCSRCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // Currently an identity schedule: stmt is returned unchanged; B, CHUNK_SIZE and UNROLL_FACTOR are ignored.
+  return stmt;
+  // Disabled draft (would need IndexVar i0, i1, kpos, kpos0, kpos1): return stmt.split(i, i0, i1, CHUNK_SIZE)
+  //         .pos(k, kpos, B(i,k))
+  //         .split(kpos, kpos0, kpos1, UNROLL_FACTOR)
+  //         .reorder({i0, i1, kpos0, j, kpos1});
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+          // .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces);
+}
+
+IndexStmt scheduleSDDMM2CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // Chunk i, walk j by position in B(i,j), split those positions, then parallelize outer chunk and inner positions.
+  IndexVar rowChunk("i0"), rowInChunk("i1"), nzPos("jpos"), nzOuter("jpos0"), nzInner("jpos1");
+  stmt = stmt.split(i, rowChunk, rowInChunk, CHUNK_SIZE).pos(j, nzPos, B(i,j));
+  stmt = stmt.split(nzPos, nzOuter, nzInner, UNROLL_FACTOR);
+  stmt = stmt.reorder({rowChunk, rowInChunk, nzOuter, k, nzInner});
+  stmt = stmt.parallelize(rowChunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+  return stmt.parallelize(nzInner, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+}
+
 IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16) {
   IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2");
   return stmt.fuse(i, j, f)
@@ -125,6 +112,13 @@ IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) {
                           OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleTTVCPUCSR_ST(IndexStmt stmt) {
+  // Step through the three enclosing foralls to the assignment, then request
+  // Insert-strategy assembly for the tensor on its left-hand side.
+  IndexStmt body = stmt.as<Forall>().getStmt().as<Forall>().getStmt().as<Forall>().getStmt();
+  return stmt.assemble(body.as<Assignment>().getLhs().getTensorVar(), AssembleStrategy::Insert);
+}
+
 IndexStmt scheduleTTMCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), kpos("kpos"), kpos1("kpos1"), kpos2("kpos2");
   return stmt.fuse(i, j, f)
@@ -149,12 +143,30 @@ IndexStmt scheduleMTTKRPCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16,
           .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleMTTKRPCPU_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // Unparallelized MTTKRP schedule: chunk i, reorder, and precompute the getA() operand of the innermost Mul into w over j.
+  IndexVar chunk("i1"), inChunk("i2");
+  IndexStmt innermost = stmt.as<Forall>().getStmt().as<Forall>().getStmt()
+                            .as<Forall>().getStmt().as<Forall>().getStmt();
+  IndexExpr precomputeExpr = innermost.as<Assignment>().getRhs().as<Mul>().getA();
+  TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense);
+  IndexStmt tiled = stmt.split(i, chunk, inChunk, CHUNK_SIZE).reorder({chunk, inChunk, k, l, j});
+  return tiled.precompute(precomputeExpr, j, j, w);
+          // .parallelize(j, ParallelUnit::CPUVector, OutputRaceStrategy::Atomics); // gives error when lowering for IgnoreRaces, NoRaces and Atomics
+          // .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleMTTKRPPrecomputedCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i1("i1"), i2("i2"), j_pre("j_pre");
   return stmt.split(i, i1, i2, CHUNK_SIZE)
           .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleMTTKRPPrecomputedCPU_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // Only splits i into (i1, i2) by CHUNK_SIZE; no parallelize step is applied. B and UNROLL_FACTOR are unused.
+  return stmt.split(i, IndexVar("i1"), IndexVar("i2"), CHUNK_SIZE);
+}
+
 IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i1("i1"), i2("i2");
   return stmt.split(i, i1, i2, CHUNK_SIZE)
@@ -162,6 +174,12 @@ IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16
           .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleMTTKRP4CPU_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // Splits i by CHUNK_SIZE and fixes the loop order; no parallelize step. B and UNROLL_FACTOR are unused.
+  IndexVar outer("i1"), inner("i2");
+  return stmt.split(i, outer, inner, CHUNK_SIZE).reorder({outer, inner, k, l, m, j});
+}
+
 IndexStmt scheduleMTTKRP5CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i1("i1"), i2("i2");
   return stmt.split(i, i1, i2, CHUNK_SIZE)
@@ -805,7 +823,68 @@ TEST(scheduling_eval, sddmmCPU) {
   IndexStmt stmt = A.getAssignment().concretize();
   stmt = scheduleSDDMMCPU(stmt, B);
 
-  //printToFile("sddmm_cpu", stmt);
+  printToFile("sddmm_cpu_ryan2", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  Tensor<double> expected("expected", {NUM_I, NUM_K}, {Dense, Dense});
+  expected(i,k) = B(i,k) * C(i,j) * D(j,k);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+TEST(scheduling_eval, sddmmSPMMFusedCPU) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_K}, {Dense, Dense});
+  Tensor<double> B("B", {NUM_I, NUM_K}, CSR);
+  Tensor<double> C("C", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_J, NUM_K}, {Dense, Dense});
+
+  srand(268238);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  D.pack();
+
+  A(i,k) = B(i,k) * C(i,j) * D(j,k);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleSDDMMCPU(stmt, B);
+
+  printToFile("sddmm_cpu_ryan2", stmt);
 
   A.compile(stmt);
   A.assemble();
@@ -819,6 +898,125 @@ TEST(scheduling_eval, sddmmCPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+
+TEST(scheduling_eval, sddmmcsrCPU) {
+  // SDDMM with a CSR output: A(i,k) = B(i,k) * C(i,j) * D(j,k), passed through
+  // scheduleSDDMMCSRCPU and compared with the same expression compiled from
+  // its plain concretized form. Skipped when CUDA codegen is active.
+  if (should_use_CUDA_codegen()) return;
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_K}, CSR);
+  Tensor<double> B("B", {NUM_I, NUM_K}, CSR);
+  Tensor<double> C("C", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_J, NUM_K}, {Dense, Dense});
+
+  // One rand() draw per candidate entry, consumed in the order C, B, D after
+  // seeding with a fixed value.
+  srand(268238);
+  for (int row = 0; row < NUM_I; row++) {
+    for (int col = 0; col < NUM_J; col++) {
+      float draw = (float)rand()/(float)(RAND_MAX);
+      C.insert({row, col}, (double) ((int) (draw*3/SPARSITY)));
+    }
+  }
+
+  // B only stores the entries whose draw falls below SPARSITY.
+  for (int row = 0; row < NUM_I; row++) {
+    for (int col = 0; col < NUM_K; col++) {
+      float draw = (float)rand()/(float)(RAND_MAX);
+      if (!(draw < SPARSITY)) continue;
+      B.insert({row, col}, (double) ((int) (draw*3/SPARSITY)));
+    }
+  }
+
+  for (int row = 0; row < NUM_J; row++) {
+    for (int col = 0; col < NUM_K; col++) {
+      float draw = (float)rand()/(float)(RAND_MAX);
+      D.insert({row, col}, (double) ((int) (draw*3/SPARSITY)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  D.pack();
+
+  A(i,k) = B(i,k) * C(i,j) * D(j,k);
+  IndexStmt stmt = scheduleSDDMMCSRCPU(A.getAssignment().concretize(), B);
+  printToFile("sddmm_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  // Reference result: same expression, concretized without a schedule.
+  Tensor<double> expected("expected", {NUM_I, NUM_K}, CSR);
+  expected(i,k) = B(i,k) * C(i,j) * D(j,k);
+  IndexStmt stmt_ref = expected.getAssignment().concretize();
+  printToFile("sddmm_cpu_ref", stmt_ref);
+
+  expected.compile(stmt_ref);
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+
+TEST(scheduling_eval, sddmm2CPU) {
+  // Work in progress: sets up Y(i,j) = A(i,j) * X(i,k) * X(j,k) on synthetic
+  // data. The compile / compute / compare steps below are still commented out.
+  if (should_use_CUDA_codegen()) return;
+  int NUM_I = 1021/10;
+  int NUM_J = 1021/10;
+  int NUM_K = 18;
+  float SPARSITY = .3;
+  Tensor<double> Y("Y", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)});
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)});
+  Tensor<double> X("X", {NUM_I, NUM_K}, {Dense, Dense});
+
+  srand(268238);
+
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  // X is declared with NUM_I rows, so fill it over its own extent (NUM_I == NUM_J here).
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      X.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  A.pack();
+  X.pack();
+  // k indexes X's second mode in both factors, as in the disabled reference expression below.
+  Y(i,j) = A(i,j) * X(i,k) * X(j,k);
+
+  // IndexStmt stmt = A.getAssignment().concretize();
+  // // stmt = scheduleSDDMMCPU(stmt, A);
+
+  // printToFile("sddmm2_cpu", stmt);
+
+  // A.compile(stmt);
+  // A.assemble();
+  // A.compute();
+
+  // Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  // expected(i,j) = A(i,j) * X(i,k) * X(j,k);
+  // expected.compile();
+  // expected.assemble();
+  // expected.compute();
+  // ASSERT_TENSOR_EQ(expected, A);
+}
+
+
 TEST(scheduling_eval, spmvCPU) {
   if (should_use_CUDA_codegen()) {
     return;
@@ -904,7 +1102,7 @@ TEST(scheduling_eval, ttvCPU) {
   IndexStmt stmt = A.getAssignment().concretize();
   stmt = scheduleTTVCPU(stmt, B);
 
-  //printToFile("ttv_cpu", stmt);
+  printToFile("ttv_cpu", stmt);
 
   A.compile(stmt);
   A.assemble();
@@ -918,6 +1116,7 @@ TEST(scheduling_eval, ttvCPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+
 TEST(scheduling_eval, ttvCPU_CSR) {
   if (should_use_CUDA_codegen()) {
     return;
@@ -928,7 +1127,7 @@ TEST(scheduling_eval, ttvCPU_CSR) {
   int NUM_K = 1057/10;
   float SPARSITY = .3;
   Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Sparse});
-  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse});
   Tensor<double> c("c", {NUM_K}, Format({Dense}));
 
   srand(9536);
@@ -956,11 +1155,13 @@ TEST(scheduling_eval, ttvCPU_CSR) {
   IndexStmt stmt = A.getAssignment().concretize();
   stmt = scheduleTTVCPUCSR(stmt);
 
+  printToFile("ttv_cpu_csr", stmt);
+
   A.compile(stmt);
   A.assemble();
   A.compute();
 
-  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Sparse});
   expected(i,j) = B(i,j,k) * c(k);
   expected.compile();
   expected.assemble();
@@ -968,6 +1169,7 @@ TEST(scheduling_eval, ttvCPU_CSR) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+
 TEST(scheduling_eval, ttmCPU) {
   if (should_use_CUDA_codegen()) {
     return;
@@ -1463,7 +1665,8 @@ TEST(scheduling_eval, mttkrpGPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
-TEST(generate_evaluation_files, DISABLED_cpu) {
+
+TEST(generate_evaluation_files, cpu) {
   if (should_use_CUDA_codegen()) {
     return;
   }
@@ -1779,10 +1982,13 @@ TEST(generate_evaluation_files, DISABLED_cpu) {
   }
 }
 
-TEST(generate_evaluation_files, DISABLED_gpu) {
-  if (!should_use_CUDA_codegen()) {
-    return;
-  }
+TEST(generate_evaluation_files, gpu) {
+  // if (!should_use_CUDA_codegen()) {
+  //   return;
+  // }
+  set_CUDA_codegen_enabled(true);
+
+  std::cout << "executing generate_evaluation_file.gpu\n";
 
   vector<vector<int>> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE}
   for (int i = 3; i <= 20; i++) {
diff --git a/test/tests-scheduling-fuse.cpp b/test/tests-scheduling-fuse.cpp
new file mode 100644
index 000000000..2fbececfe
--- /dev/null
+++ b/test/tests-scheduling-fuse.cpp
@@ -0,0 +1,2780 @@
+#include "taco/cuda.h"
+#include "taco/tensor.h"
+#include "test.h"
+#include "util.h"
+#include <climits>
+#include "gtest/gtest.h"
+#include <cstdint>
+
+#define NUM_THREADS_TO_USE 1
+// #define NUM_THREADS_TO_USE 32
+
+// TEST(scheduling_eval, spmvFusedWithSyntheticData) {
+//   if (should_use_CUDA_codegen()) {
+//     return;
+//   }
+//   taco_set_num_threads(NUM_THREADS_TO_USE);
+
+//   std::default_random_engine gen(0);
+//   std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+//   Format csr({dense, sparse});
+//   Format  rm({dense});
+
+//   // uncomment this for reading the csr matrix saved in mtx file
+//   std::cout << "reading B mat mtx\n";
+
+//   int NUM_I = 5; // 1021/10;
+//   int NUM_J = 5; // 1039/10;
+//   int NUM_K = 8;
+//   float SPARSITY = .3;
+//   Tensor<double> B("B", {NUM_I, NUM_J}, csr);
+//   srand(75883);
+//   for (int i = 0; i < NUM_I; i++) {
+//     for (int j = 0; j < NUM_J; j++) {
+//       float rand_float = (float)rand()/(float)(RAND_MAX);
+//       if (rand_float < SPARSITY) {
+//         B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+//       }
+//     }
+//   }
+//   B.pack();
+
+
+//   std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+//   std::cout << "adding c mat\n";
+//   Tensor<double> C("C", {NUM_J, NUM_K}, csr);
+//   for (int i = 0; i < C.getDimension(0); ++i) {
+//     for (int j = 0; j < C.getDimension(1); ++j) {
+//       C.insert({i,j}, unif(gen));
+//     }
+//   }
+//   std::cout << "packing C mat\n";
+//   C.pack();
+
+//   Tensor<double> v("v", {NUM_K}, rm);
+//   for (int i = 0; i < v.getDimension(0); ++i) {
+//       v.insert({i}, unif(gen));
+//   }
+//   std::cout << "packing D mat\n";
+//   v.pack();
+
+//   Tensor<double> A("A", {NUM_I}, rm);
+//   Tensor<double> ref("ref", {NUM_I}, rm);
+//   IndexVar i, j, k, l, m;
+//   A(i) = B(i,j) * C(j,k) * v(k);
+
+//   // IndexStmt stmt = A.getAssignment().concretize();
+//   IndexStmt stmt = makeReductionNotation(A.getAssignment());
+//   stmt = makeConcreteNotation(stmt);
+//   printToFile("SpMVfused", stmt);
+//   stmt = reorderLoopsTopologically(stmt);
+//   stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1);
+//   stmt = insertTemporaries(stmt);
+//   stmt = parallelizeOuterLoop(stmt);
+
+//   A.compile(stmt);
+//   // We can now call the functions taco generated to assemble the indices of the
+//   // output matrix and then actually compute the MTTKRP.
+//   A.assemble();
+
+
+//   // ref(i) = B(i,j) * C(j,k) * v(k);
+//   // IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+//   // refStmt = makeConcreteNotation(refStmt);
+//   // refStmt = insertTemporaries(refStmt);
+//   // refStmt = parallelizeOuterLoop(refStmt);
+//   // ref.compile(refStmt);
+//   // ref.assemble();
+
+//   // Tensor<double> ref1({NUM_J}, rm);
+//   // Tensor<double> ref2({NUM_I}, rm);
+//   // ref1(j) = C(j,k) * v(k);
+//   // ref2(i) = B(i,j) * ref1(j);
+
+//   // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+//   // ref1Stmt = makeConcreteNotation(ref1Stmt);
+//   // ref1Stmt = insertTemporaries(ref1Stmt);
+//   // ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+//   // ref1.compile(ref1Stmt);
+//   // ref1.assemble();
+
+//   // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+//   // ref2Stmt = makeConcreteNotation(ref2Stmt);
+//   // ref2Stmt = insertTemporaries(ref2Stmt);
+//   // ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+//   // ref2.compile(ref2Stmt);
+//   // ref2.assemble();
+
+//   std::cout << "compute start\n";
+//   taco::util::TimeResults timevalue;
+//   bool time                = true;
+//   // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue);
+//   TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue);
+//   // ASSERT_TENSOR_EQ(ref, A);
+
+//   // // check results
+//   // for (int q = 0; q < A.getDimension(0); ++q) {
+//   //   if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) {
+//   //     std::cout << "error: results don't match A("<< q << "): " 
+//   //       << A(q) << ", ref: " << ref(q) << std::endl;
+//   //     ASSERT_TRUE(false);
+//   //   }
+//   // }
+//   // // ASSERT_TENSOR_EQ(A, ref);
+//   // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue);
+//   // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue);
+//   // ASSERT_TENSOR_EQ(ref, ref2);
+
+//   // for (int q = 0; q < ref2.getDimension(0); ++q) {
+//   //   for (int w = 0; w < ref2.getDimension(1); ++w) {
+//   //     if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+//   //       std::cout << "error: results don't match A("<< q << "," << w << "): " 
+//   //         << ref2(q,w) << ", ref: " << ref(q,w) << std::endl;
+//   //       ASSERT_TRUE(false);
+//   //     }
+//   //   }
+//   // }
+
+// }
+
+// TEST(scheduling_eval, spmvFused) {
+//   if (should_use_CUDA_codegen()) {
+//     return;
+//   }
+
+//   ofstream statfile;
+//   statfile.open(
+//     "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmv-spmv.txt", std::ios::app);
+//   if (statfile.is_open()) {
+//     statfile << "\nspmv-spmv execution\n";
+//     statfile << "\n-----------------------------------------\n";
+//   }
+//   taco_set_num_threads(NUM_THREADS_TO_USE);
+
+//   std::default_random_engine gen(0);
+//   std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+//   Format csr({dense, sparse});
+//   Format  rm({dense});
+
+
+
+//   int filenum = 1;
+
+//   std::vector<std::string> matfiles = {
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15
+//     "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx"
+//   };
+//   std::vector<std::string> matfilesrw = {
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx",
+//     "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx"
+//   };
+
+//   // uncomment this for reading the csr matrix saved in mtx file
+//   std::cout << "reading B mat mtx\n";
+
+
+//   int kDim = 8;
+//   float SPARSITY = .3;
+//   std::string matfile = matfiles[filenum];
+//   std::cout << "reading B mat mtx\n";
+//   Tensor<double> B = read(matfile, csr, true);
+//   B.setName("B");
+//   B.pack();
+
+//   std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+//   std::cout << "adding c mat\n";
+
+//   std::cout << "reading B mat mtx\n";
+//   Tensor<double> C = read(matfile, csr, true);
+//   C.setName("C");
+//   C.pack();
+
+
+//   Tensor<double> v("v", {C.getDimension(1)}, rm);
+//   for (int i = 0; i < v.getDimension(0); ++i) {
+//       v.insert({i}, unif(gen));
+//   }
+//   std::cout << "packing D mat\n";
+//   v.pack();
+
+//   if (statfile.is_open()) {
+//     statfile 
+//       << "A(i) = B(i,j) * C(j,k) * v(k);" << std::endl
+//       << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+//       << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+//       << "D1_dimension: " << v.getDimension(0) << ", vals: " << v.getStorage().getValues().getSize() << std::endl
+//       << std::endl;
+//   }
+
+//   Tensor<double> A("A", {B.getDimension(0)}, rm);
+//   Tensor<double> ref("ref", {B.getDimension(0)}, rm);
+//   IndexVar i, j, k, l, m;
+//   A(i) = B(i,j) * C(j,k) * v(k);
+
+//   ref(i) = B(i,j) * C(j,k) * v(k);
+//   IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+//   refStmt = makeConcreteNotation(refStmt);
+//   refStmt = insertTemporaries(refStmt);
+//   refStmt = parallelizeOuterLoop(refStmt);
+//   ref.compile(refStmt);
+//   ref.assemble();
+
+//   // IndexStmt stmt = A.getAssignment().concretize();
+//   IndexStmt stmt = makeReductionNotation(A.getAssignment());
+//   stmt = makeConcreteNotation(stmt);
+//   printToFile("SpMVfused", stmt);
+//   stmt = reorderLoopsTopologically(stmt);
+//   stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1);
+//   stmt = insertTemporaries(stmt);
+//   stmt = parallelizeOuterLoop(stmt);
+//   A.compile(stmt);
+//   A.assemble();
+
+
+//   // Tensor<double> ref1({NUM_J}, rm);
+//   // Tensor<double> ref2({NUM_I}, rm);
+//   // ref1(j) = C(j,k) * v(k);
+//   // ref2(i) = B(i,j) * ref1(j);
+
+//   // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+//   // ref1Stmt = makeConcreteNotation(ref1Stmt);
+//   // ref1Stmt = insertTemporaries(ref1Stmt);
+//   // ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+//   // ref1.compile(ref1Stmt);
+//   // ref1.assemble();
+
+//   // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+//   // ref2Stmt = makeConcreteNotation(ref2Stmt);
+//   // ref2Stmt = insertTemporaries(ref2Stmt);
+//   // ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+//   // ref2.compile(ref2Stmt);
+//   // ref2.assemble();
+
+//   std::cout << "compute start\n";
+//   taco::util::TimeResults timevalue;
+//   bool time                = true;
+//     std::string sofused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.so";
+
+//   TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nReference Kernel: ", timevalue);
+
+  
+//   std::cout << "b1 dim: " << B.getTacoTensorT()->dimensions[1] << std::endl;
+//   // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nFused Kernel: ", timevalue);
+//   // ASSERT_TENSOR_EQ(ref, A);
+
+//   // // check results
+//   // for (int q = 0; q < A.getDimension(0); ++q) {
+//   //   if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) {
+//   //     std::cout << "error: results don't match A("<< q << "): " 
+//   //       << A(q) << ", ref: " << ref(q) << std::endl;
+//   //     ASSERT_TRUE(false);
+//   //   }
+//   // }
+//   // // ASSERT_TENSOR_EQ(A, ref);
+//   // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue);
+//   // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue);
+//   // ASSERT_TENSOR_EQ(ref, ref2);
+
+//   // for (int q = 0; q < ref2.getDimension(0); ++q) {
+//   //   for (int w = 0; w < ref2.getDimension(1); ++w) {
+//   //     if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+//   //       std::cout << "error: results don't match A("<< q << "," << w << "): " 
+//   //         << ref2(q,w) << ", ref: " << ref(q,w) << std::endl;
+//   //       ASSERT_TRUE(false);
+//   //     }
+//   //   }
+//   // }
+
+//   if (statfile.is_open()) {
+//     statfile.close();
+//   }
+
+// }
+
+TEST(scheduling_eval, sddmmFusedWithSyntheticData) {
+  // Fused SDDMM+SpMM, A(i,l) = B(i,j)*C(i,k)*D(j,k)*F(j,l), on a synthetic CSR B. The fused kernel and the
+  // two-step ref1/ref2 pipeline are each compared element-wise with the unfused `ref` using ERROR_MARGIN.
+  if (should_use_CUDA_codegen()) return;
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format  rm({dense, dense});
+  int ldim = 4;
+  int kdim = 8;
+
+  // B is generated from rand() below rather than read from a file; the log text is left as it was.
+  std::cout << "reading B mat mtx\n";
+
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  float SPARSITY = .3;
+  Tensor<double> B("B", {NUM_I, NUM_J}, csr);
+  srand(75883);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  B.pack();
+  write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B);
+  // NOTE(review): the path above is absolute and machine-specific; presumably write() fails elsewhere -- confirm and relocate.
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> C({B.getDimension(0), kdim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  C.pack();
+
+  Tensor<double> D({B.getDimension(1), kdim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing D mat\n";
+  D.pack();
+
+  Tensor<double> F({B.getDimension(1), ldim}, rm);
+  for (int i = 0; i < F.getDimension(0); ++i) {
+    for (int j = 0; j < F.getDimension(1); ++j) {
+      F.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing F mat\n";
+  F.pack();
+
+  Tensor<double> A({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref({B.getDimension(0), ldim}, rm);
+  IndexVar i, j, k, l;
+  A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("fusedMMConcrete", stmt);
+
+  stmt = reorderLoopsTopologically(stmt);
+  printToFile("fusedMMOrdered", stmt);
+
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  printToFile("fusedMMFused", stmt);
+
+  stmt = insertTemporaries(stmt);
+  printToFile("fusedMMWithTemps", stmt);
+  stmt = parallelizeOuterLoop(stmt);
+  printToFile("fusedMMFusedPar", stmt);
+
+  A.compile(stmt);
+  // Assemble the output of the fused statement now; its compute() call is
+  // issued (and timed) further down, after the reference kernels are set up.
+  A.assemble();
+
+
+  ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+
+  Tensor<double> ref1({B.getDimension(0), B.getDimension(1)}, csr);
+  Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+  ref1(i,j)=B(i,j)*C(i,k)*D(j,k);
+  ref2(i,l)=ref1(i,j)*F(j,l);
+
+  IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  ref1Stmt = makeConcreteNotation(ref1Stmt);
+  ref1Stmt = insertTemporaries(ref1Stmt);
+  ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  ref1.compile(ref1Stmt);
+  ref1.assemble();
+
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;  // presumably read by TOOL_BENCHMARK_TIMER -- confirm against the macro
+  TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue);
+  TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue);
+
+  // Fused result vs. unfused reference, relative error per element.
+  for (int q = 0; q < A.getDimension(0); ++q) {
+    for (int w = 0; w < A.getDimension(1); ++w) {
+      if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+        std::cout << "error: results don't match A("<< q << "," << w << "): " 
+          << A(q,w) << ", ref: " << ref(q,w) << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+  }
+  // ASSERT_TENSOR_EQ(A, ref);
+  TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue);
+
+  for (int q = 0; q < ref2.getDimension(0); ++q) {
+    for (int w = 0; w < ref2.getDimension(1); ++w) {
+      if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+        std::cout << "error: results don't match ref2("<< q << "," << w << "): " 
+          << ref2(q,w) << ", ref: " << ref(q,w) << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+  }
+
+}
+
+
+IndexStmt scheduleSDDMMCPU_forfuse(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { // SDDMM CPU schedule: split i, pos/split k over B(i,k), reorder, parallelize
+  IndexVar i, j, k, l, m; // NOTE(review): default-constructed locals; l and m are unused, and i, j, k are presumably not the variables stmt was built with -- confirm this can match stmt
+  IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); // variables introduced by the split/pos calls below
+  return stmt.split(i, i0, i1, CHUNK_SIZE) // i -> (i0, i1), factor CHUNK_SIZE
+          .pos(k, kpos, B(i,k)) // kpos derived from k with respect to the access B(i,k)
+          .split(kpos, kpos0, kpos1, UNROLL_FACTOR) // kpos -> (kpos0, kpos1), factor UNROLL_FACTOR
+          .reorder({i0, i1, kpos0, j, kpos1})
+          .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+          .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+}
+
+TEST(scheduling_eval, sddmmFused) {
+  // Times A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l) as one fused kernel (A), as separate SDDMM (ref1) and SpMM (ref2)
+  // kernels, and as an unfused reference (ref); then checks A against ref and ref2. Returns early for CUDA codegen.
+  if (should_use_CUDA_codegen()) { return; }
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+  // Timings are appended (std::ios::app) to a stats file at a machine-specific absolute path.
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nsddmm-spmm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+  // Fixed seed (0) for the values of the dense operands C, D and F.
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format rm({dense, dense});
+  int ldim = 128;
+  int kdim = 128;
+
+  // vector<int> filenums = {2,3,4,5,6,7,8,9,10,12,15};
+  // Indices into `matfiles` below; only entry 0 is run.
+  vector<int> filenums = {0};
+
+  for (auto filenum : filenums) {
+
+  // int filenum = 5;
+  // Input .mtx matrices at machine-specific absolute paths.
+  std::vector<std::string> matfiles = {
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15
+    "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx"
+  };
+  std::vector<std::string> matfilesrw = { // only referenced by the commented-out write() below
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx"
+  };
+  // Read the sparse operand B from the selected file using the csr format.
+  std::string matfile = matfiles[filenum];
+  std::cout << "reading B mat mtx\n";
+  Tensor<double> B = read(matfile, csr, true);
+  B.setName("B");
+  B.pack();
+  // write(matfilesrw[filenum], B);
+
+  if (statfile.is_open()) {
+    statfile << matfile << std::endl;
+  }
+  // Dense operands C, D and F are filled entry by entry with draws from unif(gen).
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> C({B.getDimension(0), kdim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  C.pack();
+
+  Tensor<double> D({B.getDimension(1), kdim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing D mat\n";
+  D.pack();
+
+  Tensor<double> F({B.getDimension(1), ldim}, rm);
+  for (int i = 0; i < F.getDimension(0); ++i) {
+    for (int j = 0; j < F.getDimension(1); ++j) {
+      F.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing F mat\n";
+  F.pack();
+  // A is compiled with the fused schedule below; ref gets the same expression without loopFusionOverFission.
+  Tensor<double> A({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref({B.getDimension(0), ldim}, rm);
+  IndexVar i, j, k, l, m;
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1");
+  A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+  if (statfile.is_open()) {
+    statfile 
+      << "ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);" << std::endl
+      << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+      << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+      << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+      << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl
+      << std::endl;
+  }
+  // Schedule for A; loopFusionOverFission is applied only to this statement.
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  stmt = stmt
+    .split(i, i0, i1, 16);
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+
+  A.compile(stmt);
+  A.assemble();
+
+  // Reference statement: split i by 16 and reorder explicitly instead of fusing.
+  ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = refStmt
+    .split(i, i0, i1, 16)
+    .reorder({i0, i1, j, k, l});
+  stmt = insertTemporaries(stmt); // NOTE(review): targets stmt (already compiled into A), not refStmt -- confirm intent
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+  // Separate pipeline: ref1 holds the csr-format SDDMM result, ref2 multiplies ref1 by F.
+  Tensor<double> ref1({B.getDimension(0), B.getDimension(1)}, csr);
+  Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+  ref1(i,j)=B(i,j)*C(i,k)*D(j,k);
+  ref2(i,l)=ref1(i,j)*F(j,l);
+
+  IndexStmt ref1Stmt = ref1.getAssignment().concretize(); // anyway Ryan's kernel is used here
+  
+  ref1Stmt = ref1Stmt.split(i, i0, i1, 16);
+          // .pos(j, jpos, B(i,j));
+          // .split(k, k0, k1, 8);
+          // .reorder({i0, i1, jpos0, k, jpos1});
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+          // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+  // ref1Stmt.split(i, );
+  // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B);
+  // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  // ref1Stmt = makeConcreteNotation(ref1Stmt);
+  ref1Stmt = insertTemporaries(ref1Stmt);
+  ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  ref1.compile(ref1Stmt);
+  ref1.assemble();
+
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); // Ryan's SpMM kernel is used here
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true; // presumably read by TOOL_BENCHMARK_TIMER (macro defined outside this block) -- verify before renaming
+  
+  std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so"; // not passed to A.compute() below
+  TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "fused time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  statfile << "\nseparate execution\n";
+  // The compute() calls below are additionally given the path of a .so kernel file.
+  // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so";
+  TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "sddmm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so";
+  TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "sddmm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+  
+  std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "spmm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  statfile << "\nreference execution \n";
+
+  std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so";
+  TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "taco reference time: ";
+    statfile << timevalue << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  double* A_vals = (double*) (A.getTacoTensorT()->vals);
+  double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+  double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+  // A, ref and ref2 share the rm format and dimensions, so their value arrays are compared index by index.
+  // int* A2_pos = (double*) (ref.getTacoTensorT()->vals);
+
+  // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) {
+  //   if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+  //     std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+  //       << "refvals: " << ref_vals[q] << std::endl;
+  //     ASSERT_TRUE(false);
+  //   }
+  // }
+  const size_t numVals = static_cast<size_t>(A.getDimension(0)) * static_cast<size_t>(A.getDimension(1)); // size_t product: rows*cols can exceed INT_MAX for the largest inputs
+  for (size_t q=0; q < numVals; q++) {
+    if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+  for (size_t q=0; q < numVals; q++) {
+    if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref2_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+  // // for (int q= 0; q< A_vals
+  // for (int q = 0; q < A.getDimension(0); ++q) {
+  //   for (int w = 0; w < A.getDimension(1); ++w) {
+  //     if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+  //       std::cout << "error: results don't match A("<< q << "," << w << "): " 
+  //         << A(q,w) << ", ref: " << ref(q,w) << std::endl;
+  //       ASSERT_TRUE(false);
+  //     }
+  //   }
+  // }
+  // ASSERT_TENSOR_EQ(A, ref);
+
+  } // end of for loop
+
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+}
+
+
+
+
+TEST(scheduling_eval, hadamardFused) {
+  // Times A(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l) as one fused kernel (A), as two separate kernels (ref1, then ref2),
+  // and as an unfused reference (ref); then checks A against ref and ref2. Returns early for CUDA codegen.
+  if (should_use_CUDA_codegen()) { return; }
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+  // Timings are appended (std::ios::app) to a stats file at a machine-specific absolute path.
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/hadamard-gemm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nhadamard-gemm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+  // Fixed seed (0) for the values of the dense operands C, D and F.
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format rm({dense, dense});
+  int kdim = 128;
+  int ldim = 128;
+
+  // vector<int> filenums = {2,3,4,5,6,7,8,9,10,12,15};
+  vector<int> filenums = {0};
+  // filenums holds indices into `matfiles` below; only entry 0 is run.
+  for (auto filenum : filenums) {
+
+  // int filenum = 15;
+  // Input .mtx matrices at machine-specific absolute paths.
+  std::vector<std::string> matfiles = {
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15
+    "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx"
+  };
+  std::vector<std::string> matfilesrw = { // only referenced by the commented-out write() below
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx"
+  };
+  // Read the sparse operand B from the selected file using the csr format.
+  std::string matfile = matfiles[filenum];
+  std::cout << "reading B mat mtx\n";
+  Tensor<double> B = read(matfile, csr, true);
+  B.setName("B");
+  B.pack();
+  // write(matfilesrw[filenum], B);
+
+  if (statfile.is_open()) {
+    statfile << matfile << std::endl;
+  }
+  // Dense operands C, D and F are filled entry by entry with draws from unif(gen).
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> C({B.getDimension(1), kdim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  C.pack();
+
+  Tensor<double> D({B.getDimension(1), kdim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing D mat\n";
+  D.pack();
+
+  Tensor<double> F({kdim, ldim}, rm);
+  for (int i = 0; i < F.getDimension(0); ++i) {
+    for (int j = 0; j < F.getDimension(1); ++j) {
+      F.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing F mat\n";
+  F.pack();
+  // A is compiled with the fused schedule below; ref gets the same expression without loopFusionOverFission.
+  Tensor<double> A({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref({B.getDimension(0), ldim}, rm);
+  IndexVar i, j, k, l, m;
+  IndexVar i0("i0"), i1("i1"), l0("l0"), l1("l1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1");
+  A(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l);
+  if (statfile.is_open()) {
+    statfile 
+      << "ref(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l);" << std::endl
+      << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+      << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+      << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+      << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl
+      << std::endl;
+  }
+  // Schedule for A; loopFusionOverFission is applied only to this statement.
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  stmt = stmt.reorder({i, j, k, l});
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  stmt = stmt
+    .split(i, i0, i1, 16);
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("fusedMMFusedPar", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+
+  // Reference statement: split i by 16 and reorder explicitly instead of fusing.
+  ref(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = refStmt
+    .split(i, i0, i1, 16)
+    .reorder({i0, i1, j, k, l});
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+  // Separate pipeline: ref1(i,k) reduces over j first, then ref2 multiplies ref1 by F.
+  Tensor<double> ref1({B.getDimension(0), kdim}, rm);
+  Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+  ref1(i,k)=B(i,j)*C(j,k)*D(j,k);
+  ref2(i,l)=ref1(i,k)*F(k,l);
+
+  // IndexStmt ref1Stmt = ref1.getAssignment().concretize();
+  
+  // ref1Stmt = ref1Stmt.split(i, i0, i1, 16);
+  //         // .pos(j, jpos, B(i,j));
+  //         // .split(k, k0, k1, 8);
+  //         // .reorder({i0, i1, jpos0, k, jpos1});
+  //         // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+  //         // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+  // // ref1Stmt.split(i, );
+  // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B);
+  IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  ref1Stmt = makeConcreteNotation(ref1Stmt);
+  ref1Stmt = ref1Stmt
+    .split(i, i0, i1, 16)
+    .reorder({i0, i1, j, k});
+    // .pos(j, jpos, B(i,j))
+    // .split(jpos, jpos0, jpos1, 32)
+    // .split(k, k0, k1, 32)
+    // .reorder({i0, i1, jpos0, k0, jpos1, k1});
+  ref1Stmt = insertTemporaries(ref1Stmt);
+  ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  ref1.compile(ref1Stmt);
+  ref1.assemble();
+  // ref2 is split by 32 along i, k and l and reordered so the three outer split variables come first.
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = ref2Stmt
+    .split(i, i0, i1, 32)
+    .split(k, k0, k1, 32)
+    .split(l, l0, l1, 32)
+    .reorder({i0, k0, l0, i1, k1, l1});
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true; // presumably read by TOOL_BENCHMARK_TIMER (macro defined outside this block) -- verify before renaming
+  
+  TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "fused time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+  
+  // // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so";
+  TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nHadamard Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "hadamard time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  // std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so";
+  // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue);
+  // if (statfile.is_open()) {
+  //   statfile << "sddmm time: ";
+  //   statfile << timevalue.mean << std::endl;
+  // } else { std::cout << " stat file is not open\n"; }
+  
+  // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "gemm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so";
+  TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "taco reference time: ";
+    statfile << timevalue << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  double* A_vals = (double*) (A.getTacoTensorT()->vals);
+  double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+  double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+  // A, ref and ref2 share the rm format and dimensions, so their value arrays are compared index by index.
+  // // int* A2_pos = (double*) (ref.getTacoTensorT()->vals);
+  const size_t numVals = static_cast<size_t>(A.getDimension(0)) * static_cast<size_t>(A.getDimension(1)); // size_t product: rows*cols can exceed INT_MAX for the largest inputs
+  for (size_t q=0; q < numVals; q++) {
+    if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+  for (size_t q=0; q < numVals; q++) {
+    if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref2_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+  } // end of for loop
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+}
+
+
+
+
+
+
+TEST(scheduling_eval, mttkrpFusedWithSyntheticData) { // Fused (A) vs. unfused (ref) A(i,m)=B(i,k,l)*D(l,j)*C(k,j)*E(j,m) on synthetic data; asserts both tensors are equal.
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  // Predeclare the storage formats that the inputs and output will be stored as.
+  // To define a format, you must specify whether each dimension is dense or 
+  // sparse and (optionally) the order in which dimensions should be stored. The 
+  // formats declared below correspond to compressed sparse fiber (csf) and 
+  // row-major dense (rm).
+  Format csf({Sparse,Sparse,Sparse});
+  Format rm({Dense,Dense});
+  Format sd({Dense,Dense}); // same level formats as rm; used for the outputs A and ref
+
+  int NUM_I = 1021/20;
+  int NUM_J = 1039/20;
+  int NUM_K = 1057/20;
+  int NUM_L = 1232/20;
+  int NUM_M = 1231/20;
+  float SPARSITY = .1; // an entry of B is inserted only when its rand() draw falls below this value
+  Tensor<double> A("A", {NUM_I, NUM_M}, sd);
+  Tensor<double> B("B", {NUM_I, NUM_K, NUM_L}, csf);
+  Tensor<double> C("C", {NUM_K, NUM_J}, rm);
+  Tensor<double> D("D", {NUM_L, NUM_J}, rm);
+  Tensor<double> E("E", {NUM_J, NUM_M}, rm);
+  Tensor<double> ref({NUM_I, NUM_M}, sd);
+  // Fill B from a fixed rand() seed; B, C and D all draw from this same rand() sequence, in this order.
+  srand(549694);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      for (int l = 0; l < NUM_L; l++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+  B.pack();
+  write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B); // writes B to a machine-specific absolute path
+
+  // Generate a random dense matrix and store it in row-major (dense) format. 
+  // Matrices correspond to order-2 tensors in taco.
+  for (int k = 0; k < NUM_K; k++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({k, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+  C.pack();
+  // D is filled the same way as C.
+  for (int l = 0; l < NUM_L; l++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({l, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+  D.pack();
+  // E is filled from the seeded uniform_real_distribution rather than from rand().
+  for (int i = 0; i < E.getDimension(0); ++i) {
+    for (int j = 0; j < E.getDimension(1); ++j) {
+      E.insert({i,j}, unif(gen));
+    }
+  }
+  E.pack();
+
+  // Define the computation (the MTTKRP terms followed by a product with E) using index notation.
+  IndexVar i, k, l, j, m;
+  A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m);
+
+  // Schedule for A; the statement is passed to printToFile with a stage label after each step.
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("fusedMTTKRPConcrete", stmt);
+  
+  stmt = reorderLoopsTopologically(stmt);
+  printToFile("fusedMTTKRPOrdered", stmt);
+  
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  printToFile("fusedMTTKRPFused", stmt);
+
+  stmt = insertTemporaries(stmt);
+  printToFile("fusedMTTKRPWithTemps", stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("fusedMTTKRPFusedPar", stmt);
+
+  
+  // At this point, we have defined how entries in the output matrix should be
+  // computed from entries in the input tensor and matrices but have not actually
+  // performed the computation yet. To do so, we must first tell taco to generate
+  // code that can be executed to compute the MTTKRP operation.
+  A.compile(stmt);
+  // We can now call the functions taco generated to assemble the indices of the
+  // output matrix and then actually compute the MTTKRP.
+  A.assemble();
+
+  // Reference: the same expression, scheduled without reorderLoopsTopologically and loopFusionOverFission.
+  ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();  
+
+  // Tensor<double> ref2({NUM_I, NUM_J}, sd);
+  // ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j);
+  // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  // ref2Stmt = makeConcreteNotation(ref2Stmt);
+  // ref2Stmt = insertTemporaries(ref2Stmt);
+  // ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  // ref2.compile(ref2Stmt);
+  // ref2.assemble(); 
+
+  // Tensor<double> ref3({NUM_I, NUM_M}, sd);
+  // ref3(i,m) = ref2(i,j) * E(j,m);
+  // IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+  // ref3Stmt = makeConcreteNotation(ref3Stmt);
+  // ref3Stmt = insertTemporaries(ref3Stmt);
+  // ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+  // ref3.compile(ref3Stmt);
+  // ref3.assemble();  
+  
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true; // presumably read by TOOL_BENCHMARK_TIMER (macro defined outside this block) -- verify before renaming
+  // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue);
+  TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused MTTKRP+SPMM: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference MTTKRP+SPMM: ", timevalue);
+  // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nReference MTTKRP: ", timevalue);
+  // TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\nReference SPMM: ", timevalue);
+  ASSERT_TENSOR_EQ(ref, A);
+  // ASSERT_TENSOR_EQ(ref, ref3);
+
+}
+
+
+TEST(scheduling_eval, mttkrpFused) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/mttkrp-spmm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nmttkrp-spmm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  // Predeclare the storage formats that the inputs and output will be stored as.
+  // To define a format, you must specify whether each dimension is dense or 
+  // sparse and (optionally) the order in which dimensions should be stored. The 
+  // formats declared below correspond to compressed sparse fiber (csf) and 
+  // row-major dense (rm).
+  Format csf({Dense,Sparse,Sparse});
+  Format rm({Dense,Dense});
+  Format sd({Dense,Dense});
+  int jDim = 32;
+  int mDim = 64;
+
+  int matfilenum = 3;
+
+  // Load a sparse order-3 tensor from file (stored in the FROSTT format) and 
+  // store it as a compressed sparse fiber tensor. The tensor in this example 
+  // can be download from: http://frostt.io/tensors/nell-2/
+  std::vector<std::string> matfiles = {
+    "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns", 
+    "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2
+    "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3
+    "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4
+    "/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns", // 6
+    "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns" // 8
+  };
+  std::vector<std::string> matfilesrw = {
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns", // 2 
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns", //  3
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns", //   4
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns",  // 6
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns"
+  };
+  std::string matfile = matfiles[matfilenum];
+  Tensor<double> B = read(matfile, csf, true);
+  // write(matfilesrw[matfilenum], B);
+
+  // Generate a random dense matrix and store it in row-major (dense) format. 
+  // Matrices correspond to order-2 tensors in taco.
+  Tensor<double> C({B.getDimension(1), jDim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  C.pack();
+
+  // Generate another random dense matrix and store it in row-major format.
+  Tensor<double> D({B.getDimension(2), jDim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  D.pack();
+
+  Tensor<double> E({jDim, mDim}, rm);
+  for (int i = 0; i < E.getDimension(0); ++i) {
+    for (int j = 0; j < E.getDimension(1); ++j) {
+      E.insert({i,j}, unif(gen));
+    }
+  }
+  E.pack();
+
+  if (statfile.is_open()) {
+    statfile 
+      << matfile << std::endl
+      << "A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)" << std::endl
+      << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(0) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+      << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+      << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+      << "E1_dimension: " << E.getDimension(0) << ", E2_dimension: " << E.getDimension(1) << ", vals: " << E.getStorage().getValues().getSize() << std::endl
+      << std::endl;
+  }
+
+    // Declare the output matrix to be a dense matrix with 25 columns and the same
+  // number of rows as the number of slices along the first dimension of input
+  // tensor B, to be also stored as a row-major dense matrix.
+  Tensor<double> A({B.getDimension(0), mDim}, sd);
+  Tensor<double> ref({B.getDimension(0), mDim}, sd);
+
+  // Define the MTTKRP computation using index notation.
+  IndexVar i, k, l, j, m;
+  IndexVar i1("i1"), i2("i2"), j1("j1"), j2("j2"), m1("m1"), m2("m2");
+
+  A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m);
+
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  // stmt = stmt.reorder({i,j,k,l,m});
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  stmt = stmt.split(i, i1, i2, 16);
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("fusedMTTKRPFusedPar", stmt);
+  A.compile(stmt);
+  A.assemble();
+
+
+  ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = refStmt
+    .split(i, i1, i2, 16);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+
+  Tensor<double> ref2({B.getDimension(0), jDim}, sd);
+  ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j);
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = ref2Stmt
+    .split(i, i1, i2, 16);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble(); 
+
+  Tensor<double> ref2_ryan({B.getDimension(0), jDim}, sd);
+  ref2_ryan(i,j) = B(i,k,l) * D(l,j) * C(k,j);
+
+  IndexStmt ref2RyanStmt = makeReductionNotation(ref2_ryan.getAssignment());
+  ref2RyanStmt = makeConcreteNotation(ref2RyanStmt);
+  
+  IndexExpr precomputeExpr = ref2RyanStmt.as<Forall>().getStmt().as<Forall>().getStmt()
+                                 .as<Forall>().getStmt().as<Forall>().getStmt()
+                                 .as<Assignment>().getRhs().as<Mul>().getA();
+  TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense);
+  ref2RyanStmt = ref2RyanStmt.split(i, i1, i2, 16)
+          .reorder({i1, i2, k, l, j})
+          .precompute(precomputeExpr, j, j, w)
+          .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+  ref2RyanStmt = insertTemporaries(ref2RyanStmt);
+  // ref2RyanStmt = parallelizeOuterLoop(ref2RyanStmt);
+  ref2_ryan.compile(ref2RyanStmt);
+  ref2_ryan.assemble(); 
+
+  Tensor<double> ref3({B.getDimension(0), mDim}, sd);
+  ref3(i,m) = ref2(i,j) * E(j,m);
+  IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+  ref3Stmt = makeConcreteNotation(ref3Stmt);
+  ref3Stmt = ref3Stmt
+    .split(i, i1, i2, 16)
+    .split(j, j1, j2, 16)
+    .split(m, m1, m2, 16)
+    .reorder({i1, j1, m1, i2, j2, m2})
+    .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+  ref3Stmt = insertTemporaries(ref3Stmt);
+  ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+  ref3.compile(ref3Stmt);
+  ref3.assemble(); 
+
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+
+  TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nDefault MTTKRP: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "default mttkrp time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  TOOL_BENCHMARK_TIMER(ref2_ryan.compute(statfile), "\n\nRyan MTTKRP workspace: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "ryan mttkrp workspace time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+  double* ref2_ryan_vals = (double*) (ref2_ryan.getTacoTensorT()->vals);
+  for (int q=0; q < B.getDimension(0)* jDim; q++) {
+    if ( abs(ref2_vals[q] - ref2_ryan_vals[q])/abs(ref2_ryan_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << ref2_vals[q] << " "
+        << "refvals: " << ref2_ryan_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+  TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM time: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "GeMM time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+
+  TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference MTTKRP+GEMM: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "reference asymptotic blowup time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals);
+  double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+  for (int q=0; q < B.getDimension(0)* mDim; q++) {
+    if ( abs(ref3_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+  TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused MTTKRP+GEMM: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "fused mttkrp+gemm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+  double* A_vals = (double*) (A.getTacoTensorT()->vals);
+  for (int q=0; q < B.getDimension(0)* mDim; q++) {
+    if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+
+}
+
+TEST(scheduling_eval, ttmFusedWithSyntheticData) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  // Runs the fused TTM->TTM schedule on a small random B and compares it with reference kernels.
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  Format csf({Sparse,Sparse,Sparse});
+  Format custom({Sparse,Sparse,Dense});
+  Format rm({Dense,Dense});
+  // Sizes: B is NUM_I x NUM_J x NUM_K, C is NUM_K x NUM_L, D is NUM_L x NUM_M.
+  int NUM_I = 5;
+  int NUM_J = 5;
+  int NUM_K = 5;
+  int NUM_L = 64;
+  int NUM_M = 1024;
+  float SPARSITY = .1; // an entry of B is inserted when its random draw falls below this
+
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, csf);
+  srand(549694); // fixed seed for the rand() calls that populate B
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); // value truncated to an integer
+        }
+      }
+    }
+  }
+  B.pack();
+  write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B); // NOTE(review): machine-specific absolute path
+
+  // Generate a random dense matrix and store it in row-major (dense) format.
+  // Matrices correspond to order-2 tensors in taco.
+  Tensor<double> C({B.getDimension(2), NUM_L}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  C.pack();
+
+  // Generate another random dense matrix and store it in row-major format.
+  Tensor<double> D({NUM_L, NUM_M}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  D.pack();
+
+  Tensor<double> A({B.getDimension(0), B.getDimension(1), NUM_M}, custom);
+  Tensor<double> ref({B.getDimension(0), B.getDimension(1), NUM_M}, custom);
+
+  // Define the TTM->TTM computation using index notation.
+  IndexVar i, j, k, l, m;
+  A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m);
+
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("fusedTTMTTKRPConcrete", stmt);
+  
+  stmt = reorderLoopsTopologically(stmt);
+  printToFile("fusedTTMOrdered", stmt);
+  
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  printToFile("fusedTTMFused", stmt);
+
+  stmt = insertTemporaries(stmt);
+  printToFile("fusedTTMWithTemps", stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("fusedTTMFinal", stmt);
+
+  
+  // At this point, we have defined how entries in the output tensor should be
+  // computed from entries in the input tensor and matrices but have not actually
+  // performed the computation yet. To do so, we must first tell taco to generate
+  // code that can be executed to compute the TTM->TTM operation.
+  A.compile(stmt);
+  // We can now call the function taco generated to assemble the indices of the
+  // output tensor; the values are computed later by A.compute().
+  A.assemble();
+
+
+  ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // same expression, without the reorder/fusion passes
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  printToFile("tacoFusedTTM", refStmt);
+  ref.compile(refStmt);
+  ref.assemble(); 
+
+  Tensor<double> ref1({B.getDimension(0), B.getDimension(1), NUM_L}, custom);
+  ref1(i,j,l) = B(i,j,k) * C(k,l); // TTM1
+  IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  ref1Stmt = makeConcreteNotation(ref1Stmt);
+  ref1Stmt = insertTemporaries(ref1Stmt);
+  ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  ref1.compile(ref1Stmt);
+  ref1.assemble();  
+
+  Tensor<double> ref2({B.getDimension(0), B.getDimension(1), NUM_M}, custom);
+  ref2(i,j,m) = ref1(i,j,l) * D(l,m); // TTM2, consumes the TTM1 result
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble(); 
+
+  Tensor<double> ref3({B.getDimension(2), NUM_M}, rm);
+  ref3(k,m) = C(k,l) * D(l,m); // dense matrix product
+  IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+  ref3Stmt = makeConcreteNotation(ref3Stmt);
+  ref3Stmt = insertTemporaries(ref3Stmt);
+  ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+  ref3.compile(ref3Stmt);
+  ref3.assemble();  
+
+  Tensor<double> ref4({B.getDimension(0), B.getDimension(1), NUM_M}, custom);
+  ref4(i,j,m) = B(i,j,k) * ref3(k,m); // TTM applied to the dense product
+  IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment());
+  ref4Stmt = makeConcreteNotation(ref4Stmt);
+  ref4Stmt = insertTemporaries(ref4Stmt);
+  ref4Stmt = parallelizeOuterLoop(ref4Stmt);
+  ref4.compile(ref4Stmt);
+  ref4.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true; // presumably read by TOOL_BENCHMARK_TIMER — TODO confirm
+  // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue);
+  TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused TTM->TTM: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference TTM->TTM: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nTTM1: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nTTM2: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\ndense: ", timevalue);
+  TOOL_BENCHMARK_TIMER(ref4.compute(), "\n\nTTM after dense: ", timevalue);
+  ASSERT_TENSOR_EQ(ref, A);
+  ASSERT_TENSOR_EQ(ref, ref2);
+  ASSERT_TENSOR_EQ(ref, ref4);
+
+  for (int q = 0; q < A.getDimension(0); ++q) {
+    for (int w = 0; w < A.getDimension(1); ++w) {
+      for (int z = 0; z < A.getDimension(2); ++z) {
+        // std::cout << "(" << q << "," << w << "," << z << ")" 
+        //   << "a: " << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl;
+        if ( abs(A(q,w,z) - ref(q,w,z))/abs(ref(q,w,z)) > ERROR_MARGIN) { // NOTE(review): a zero ref entry is not guarded
+          std::cout << "error: results don't match A: " 
+            << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl;
+          ASSERT_TRUE(false);
+        }
+      }
+    }
+  }
+
+}
+
+TEST(scheduling_eval, ttmFused) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  // Times the fused TTM->TTM schedule and several reference schedules on tensor files, logging to a stats file.
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/ttm-ttm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nttm-ttm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  Format csf({Dense,Sparse,Sparse});
+  Format custom({Dense,Sparse,Dense});
+  Format rm({Dense,Dense});
+  int ldim = 32;
+  int mdim = 64;
+  // Scratch buffer rewritten before each timed kernel; going by its name it is meant to evict cached data — TODO confirm.
+  int64_t dummy_array_size = 2e6;
+  std::vector<int64_t> dummy_array_to_flush_cache(dummy_array_size); // owned by the vector, so it is released on every exit path
+
+  vector<int> matfilenums = {5}; // indices into the matfiles list below
+
+  for (auto matfilenum : matfilenums) {
+
+    // int matfilenum = 0;
+
+    
+
+    // Load a sparse order-3 tensor from file (stored in the FROSTT format) and
+    // store it with the csf format declared above. Example FROSTT tensors can
+    // be downloaded from e.g. http://frostt.io/tensors/nell-2/
+    std::vector<std::string> matfiles = {
+      "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2
+      "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3
+      "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4
+      "/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns", // 5 
+      "/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns", // 6
+      "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns"
+    };
+    std::vector<std::string> matfilesrw = {
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns"
+    };
+    statfile << "\nfile: " << matfiles[matfilenum] << std::endl;
+    statfile << "----------------------------------------------------------------\n";
+
+    std::string matfile = matfiles[matfilenum];
+    Tensor<double> B = read(matfile, csf);
+    B.setName("B");
+    B.pack();
+    // write(matfilesrw[matfilenum], B);
+
+    // Generate a random dense matrix and store it in row-major (dense) format. 
+    // Matrices correspond to order-2 tensors in taco.
+    Tensor<double> C("C", {B.getDimension(2), ldim}, rm);
+    for (int i = 0; i < C.getDimension(0); ++i) {
+      for (int j = 0; j < C.getDimension(1); ++j) {
+        C.insert({i,j}, unif(gen));
+      }
+    }
+    C.pack();
+
+    // Generate another random dense matrix and store it in row-major format.
+    Tensor<double> D("D", {ldim, mdim}, rm);
+    for (int i = 0; i < D.getDimension(0); ++i) {
+      for (int j = 0; j < D.getDimension(1); ++j) {
+        D.insert({i,j}, unif(gen));
+      }
+    }
+    D.pack();
+
+    if (statfile.is_open()) {
+      statfile 
+        << matfile << std::endl
+        << "A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)" << std::endl
+        << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(2) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+        << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+        << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+        << std::endl;
+    }
+
+    Tensor<double> A({B.getDimension(0), B.getDimension(1), mdim}, custom);
+    Tensor<double> ref({B.getDimension(0), B.getDimension(1), mdim}, custom);
+    Tensor<double> refn({B.getDimension(0), B.getDimension(1), mdim}, custom);
+
+    // Define the TTM->TTM computation using index notation.
+    IndexVar i, j, k, l, m;
+    IndexVar i0,i1, j0, j1, k0, k1, l0, l1, m0, m1;
+    A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m);
+
+
+    IndexStmt stmt = makeReductionNotation(A.getAssignment());
+    stmt = makeConcreteNotation(stmt);
+    stmt = reorderLoopsTopologically(stmt);
+    stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+    stmt = stmt.split(i, i0, i1, 16);
+    stmt = insertTemporaries(stmt);
+    stmt = parallelizeOuterLoop(stmt); 
+    printToFile("fusedTTMFinal", stmt);
+
+    A.compile(stmt);
+    A.assemble();
+
+
+    ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO
+    IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+    refStmt = makeConcreteNotation(refStmt);
+    refStmt = refStmt
+      .split(i, i0, i1, 16);
+    refStmt = insertTemporaries(refStmt);
+    refStmt = parallelizeOuterLoop(refStmt);
+    printToFile("tacoFusedTTM", refStmt);
+    ref.compile(refStmt);
+    ref.assemble();
+
+    refn(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO with an explicit loop order
+    IndexStmt refnStmt = makeReductionNotation(refn.getAssignment());
+    refnStmt = makeConcreteNotation(refnStmt);
+    refnStmt = refnStmt
+      .split(i, i0, i1, 16)
+      .reorder({i0, i1, j, k, l, m});
+    refnStmt = insertTemporaries(refnStmt);
+    refnStmt = parallelizeOuterLoop(refnStmt);
+    printToFile("tacoFusedTTM", refnStmt); // NOTE(review): same name as the ref dump above — may overwrite it
+    refn.compile(refnStmt);
+    refn.assemble();
+
+    Tensor<double> ref1({B.getDimension(0), B.getDimension(1), ldim}, custom);
+    ref1(i,j,l) = B(i,j,k) * C(k,l); // TTM1
+    IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+    ref1Stmt = makeConcreteNotation(ref1Stmt);
+    // ref1Stmt = ref1Stmt.split(i, i0, i1, 16);
+    ref1Stmt = insertTemporaries(ref1Stmt);
+    ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+    ref1.compile(ref1Stmt);
+    ref1.assemble();  
+
+    Tensor<double> ref2({B.getDimension(0), B.getDimension(1), mdim}, custom);
+    ref2(i,j,m) = ref1(i,j,l) * D(l,m); // TTM2
+    IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+    ref2Stmt = makeConcreteNotation(ref2Stmt);
+    // ref2Stmt = ref2Stmt.split(i, i0, i1, 16);
+    ref2Stmt = insertTemporaries(ref2Stmt);
+    ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+    ref2.compile(ref2Stmt);
+    ref2.assemble();
+
+    Tensor<double> ref3({B.getDimension(2), mdim}, rm);
+    ref3(k,m) = C(k,l) * D(l,m); // GeMM
+    IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+    ref3Stmt = makeConcreteNotation(ref3Stmt);
+    ref3Stmt = ref3Stmt
+      .split(k, k0, k1, 32)
+      .split(l, l0, l1, 32)
+      .split(m, m0, m1, 32)
+      .reorder({k0, l0, m0, k1, l1, m1});
+    ref3Stmt = insertTemporaries(ref3Stmt);
+    ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+    ref3.compile(ref3Stmt);
+    ref3.assemble();  
+
+    Tensor<double> ref4({B.getDimension(0), B.getDimension(1), mdim}, custom);
+    ref4(i,j,m) = B(i,j,k) * ref3(k,m); // TTM applied to the GeMM result
+    IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment());
+    ref4Stmt = makeConcreteNotation(ref4Stmt);
+    // ref4Stmt = ref4Stmt
+    //   .split(i, i0, i1, 16);
+    //   // .split(k, k0, k1, 16)
+    //   .split(m, m0, m1, 16)
+    //   .reorder({i0, i1, j, m0, k, m1});
+    ref4Stmt = insertTemporaries(ref4Stmt);
+    ref4Stmt = parallelizeOuterLoop(ref4Stmt);
+    ref4.compile(ref4Stmt);
+    ref4.assemble();
+
+    std::cout << "compute start\n";
+    taco::util::TimeResults timevalue;
+    bool time                = true; // presumably read by TOOL_BENCHMARK_TIMER — TODO confirm
+
+    int r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue);
+    std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.so";
+    TOOL_BENCHMARK_TIMER(A.compute(statfile, sofile_fused), "\n\nFused TTM->TTM: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "fused time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    statfile << "\nreference impl time \n";
+
+    std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.so";
+    TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference TTM->TTM: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "reference time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    std::string sofile_original2 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original2.so";
+    TOOL_BENCHMARK_TIMER(refn.compute(statfile, sofile_original2), "\n\nReference new TTM->TTM: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "reference new time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    statfile << "\nschedule 1\n";
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    std::string sofile_ttm11 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_1.so";
+    TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_ttm11), "\n\nTTM1: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "TTM1: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    std::string sofile_ttm2 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm2.so";
+    TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_ttm2), "\n\nTTM2: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "TTM2: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    statfile << "\nschedule 2\n";
+
+    TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\ndense: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "dense: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    r = rand();
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      dummy_array_to_flush_cache[i] = r;
+    }
+
+    std::string sofile_ttm12 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_2.so";
+    TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_ttm12), "\n\nTTM after dense: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "TTM after dense: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+    // Read the scratch buffer back and print a flag derived from it (presumably so the writes above are not optimized away — TODO confirm).
+    r = rand();
+    bool istrue = false;
+    for (int64_t i=0; i<dummy_array_size; i++) {
+      if (dummy_array_to_flush_cache[i] != r) {
+        istrue = true;
+      }
+    }
+    std::cout << "istrue: " << istrue << std::endl;
+
+
+    double* A_vals = (double*) (A.getTacoTensorT()->vals);
+    double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+    double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+    double* ref4_vals = (double*) (ref4.getTacoTensorT()->vals);
+
+    // int* A2_pos = (double*) (ref.getTacoTensorT()->vals);
+
+    // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) {
+    //   if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+    //     std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+    //       << "refvals: " << ref_vals[q] << std::endl;
+    //     ASSERT_TRUE(false);
+    //   }
+    // }
+
+    // std::cout << "our fused vs taco original fused check\n";
+    // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) {
+    //   if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+    //     std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+    //       << "refvals: " << ref_vals[q] << std::endl;
+    //     ASSERT_TRUE(false);
+    //   }
+    // }
+    // std::cout << "taco original fused vs TTM1, TTM2 check\n";
+    // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) {
+    //   if ( abs(ref_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) {
+    //     std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " "
+    //       << "refvals: " << ref2_vals[q] << std::endl;
+    //     ASSERT_TRUE(false);
+    //   }
+    // }
+    // std::cout << "taco original fused vs GeMM, TTM1 check\n";
+    // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) {
+    //   if ( abs(ref_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) {
+    //     std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " "
+    //       << "refvals: " << ref4_vals[q] << std::endl;
+    //     ASSERT_TRUE(false);
+    //   }
+    // }
+
+  } // end of forloop
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+}
+
+
+
+
+TEST(scheduling_eval, spmmFusedWithSyntheticData) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  // Checks the fused schedule for A(i,l) = B(i,j) * C(j,k) * D(k,l) against the unfused kernel on random CSR inputs.
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format  rm({dense, dense});
+  int ldim = 32;
+  int kdim = 64; // only referenced by the commented-out code below
+
+  // B and C are generated in place below; no .mtx file is read in this test.
+  std::cout << "reading B mat mtx\n";
+
+  int NUM_I = 128;
+  int NUM_J = 96;
+  int NUM_K = 64;
+  float SPARSITY = .3; // an entry is inserted when its random draw falls below this
+  Tensor<double> B("B", {NUM_I, NUM_J}, csr);
+  srand(75883); // fixed seed for the rand() calls that populate B and C
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  B.pack();
+
+  Tensor<double> C("C", {NUM_J, NUM_K}, csr);
+  for (int j = 0; j < NUM_J; j++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        C.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); // fill C itself; inserting into B here left C empty
+      }
+    }
+  }
+  C.pack();
+  // write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B);
+
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> D({C.getDimension(1), ldim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  D.pack();
+
+  // Tensor<double> E({B.getDimension(1), kdim}, rm);
+  // for (int i = 0; i < D.getDimension(0); ++i) {
+  //   for (int j = 0; j < D.getDimension(1); ++j) {
+  //     D.insert({i,j}, unif(gen));
+  //   }
+  // }
+  // std::cout << "packing D mat\n";
+  // D.pack();
+
+  // Tensor<double> F({B.getDimension(1), ldim}, rm);
+  // for (int i = 0; i < F.getDimension(0); ++i) {
+  //   for (int j = 0; j < F.getDimension(1); ++j) {
+  //     F.insert({i,j}, unif(gen));
+  //   }
+  // }
+  // std::cout << "packing F mat\n";
+  // F.pack();
+
+  Tensor<double> A({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref({B.getDimension(0), ldim}, rm);
+  IndexVar i, j, k, l;
+  A(i,l)=B(i,j)*C(j,k)*D(k,l);
+
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("fusedMMConcrete", stmt);
+  
+  stmt = reorderLoopsTopologically(stmt);
+  printToFile("fusedMMOrdered", stmt);
+  
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  printToFile("fusedMMFused", stmt);
+
+  stmt = insertTemporaries(stmt);
+  printToFile("fusedMMWithTemps", stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("fusedMMFusedPar", stmt);
+
+  A.compile(stmt);
+  // We can now call the function taco generated to assemble the indices of the
+  // output matrix; the values are computed later by A.compute().
+  A.assemble();
+
+
+  ref(i,l)=B(i,j)*C(j,k)*D(k,l); // same expression, without the reorder/fusion passes
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+
+  // Tensor<double> ref1({B.getDimension(0), B.getDimension(1)}, csr);
+  // Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+  // ref1(i,j)=B(i,j)*C(i,k)*D(j,k);
+  // ref2(i,l)=ref1(i,j)*F(j,l);
+
+  // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  // ref1Stmt = makeConcreteNotation(ref1Stmt);
+  // ref1Stmt = insertTemporaries(ref1Stmt);
+  // ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  // ref1.compile(ref1Stmt);
+  // ref1.assemble();
+
+  // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  // ref2Stmt = makeConcreteNotation(ref2Stmt);
+  // ref2Stmt = insertTemporaries(ref2Stmt);
+  // ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  // ref2.compile(ref2Stmt);
+  // ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true; // presumably read by TOOL_BENCHMARK_TIMER — TODO confirm
+  TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue);
+  TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue);
+
+  // check results: element-wise relative error of the fused kernel against the reference
+  for (int q = 0; q < A.getDimension(0); ++q) {
+    for (int w = 0; w < A.getDimension(1); ++w) {
+      if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { // NOTE(review): a zero ref entry is not guarded
+        std::cout << "error: results don't match A("<< q << "," << w << "): " 
+          << A(q,w) << ", ref: " << ref(q,w) << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+  }
+  // // ASSERT_TENSOR_EQ(A, ref);
+  // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue);
+  // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue);
+
+  // for (int q = 0; q < ref2.getDimension(0); ++q) {
+  //   for (int w = 0; w < ref2.getDimension(1); ++w) {
+  //     if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+  //       std::cout << "error: results don't match A("<< q << "," << w << "): " 
+  //         << ref2(q,w) << ", ref: " << ref(q,w) << std::endl;
+  //       ASSERT_TRUE(false);
+  //     }
+  //   }
+  // }
+
+}
+
+
+TEST(scheduling_eval, spmmFused) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  // Benchmarks A(i,l) = B(i,j)*C(j,k)*D(k,l) (B read as CSR, C/D dense): fused schedule vs. unfused references, then cross-checks the results.
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmm-gemm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nspmm-spmm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format rm({dense, dense});
+  int kdim = 128;
+  int ldim = 64;
+
+  // vector<int> filenums = {2,3,4,5,6,7,8,9,10,12,15};
+  vector<int> filenums = {0}; // indices into matfiles below
+
+  for (auto filenum : filenums) {
+
+
+    statfile << "filenum: " << filenum << std::endl;
+    statfile << "---------------------------------\n";
+    // int filenum = 7;
+
+    std::vector<std::string> matfiles = {
+      "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx",
+      "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15
+      "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k.mtx",
+    };
+    std::vector<std::string> matfilesrw = {
+      "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx",
+      "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx",
+      "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx"
+    };
+
+    std::string matfile = matfiles[filenum];
+    std::cout << "reading B mat mtx\n";
+    Tensor<double> B = read(matfile, csr);
+    B.pack();
+    // write(matfilesrw[filenum], B);
+
+    if (statfile.is_open()) {
+      statfile << matfile << std::endl;
+    }
+
+    std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+    std::cout << "adding c mat\n";
+    // Tensor<double> C = read(matfiles2[filenum], csr, true);
+    // std::cout << "packing C mat\n";
+
+    std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+    std::cout << "adding c mat\n";
+    Tensor<double> C("C", {B.getDimension(1), kdim}, rm);
+    for (int i = 0; i < C.getDimension(0); ++i) {
+      for (int j = 0; j < C.getDimension(1); ++j) {
+        C.insert({i,j}, unif(gen));
+      }
+    }
+    std::cout << "packing C mat\n";
+    C.pack();
+
+    Tensor<double> D({C.getDimension(1), ldim}, rm);
+    for (int i = 0; i < D.getDimension(0); ++i) {
+      for (int j = 0; j < D.getDimension(1); ++j) {
+        D.insert({i,j}, unif(gen));
+      }
+    }
+    std::cout << "packing D mat\n";
+    D.pack();
+
+    // Tensor<double> F({B.getDimension(1), ldim}, rm);
+    // for (int i = 0; i < F.getDimension(0); ++i) {
+    //   for (int j = 0; j < F.getDimension(1); ++j) {
+    //     F.insert({i,j}, unif(gen));
+    //   }
+    // }
+    // std::cout << "packing F mat\n";
+    // F.pack();
+
+    Tensor<double> A({B.getDimension(0), ldim}, rm);
+    Tensor<double> ref({B.getDimension(0), ldim}, rm);
+    Tensor<double> refn({B.getDimension(0), ldim}, rm);
+    IndexVar i, j, k, l;
+    IndexVar i0, i1, j0, j1, k0, k1, l0, l1;
+
+    A(i,l)=B(i,j)*C(j,k)*D(k,l);
+    if (statfile.is_open()) {
+      statfile 
+        << "ref(i,l)=B(i,j)*C(j,k)*D(k,l);" << std::endl
+        << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+        << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+        << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+        // << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl
+        << std::endl;
+    }
+
+    // IndexStmt stmt = A.getAssignment().concretize();
+    IndexStmt stmt = makeReductionNotation(A.getAssignment());
+    stmt = makeConcreteNotation(stmt);
+    stmt = reorderLoopsTopologically(stmt);
+    stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+    stmt = stmt.split(i, i0, i1, 16);
+    stmt = insertTemporaries(stmt);
+    stmt = parallelizeOuterLoop(stmt);
+
+    A.compile(stmt);
+    A.assemble();
+
+    // Unfused references for the same expression: ref splits and reorders i/k/l, refn only splits i.
+    ref(i,l)=B(i,j)*C(j,k)*D(k,l);
+    refn(i,l)=B(i,j)*C(j,k)*D(k,l);
+    // IndexStmt refStmt = ref.getAssignment().concretize();
+
+    // ref1Stmt = ref1Stmt.split(i, i0, i1, 16);
+            // .pos(j, jpos, B(i,j));
+            // .split(k, k0, k1, 8);
+            // .reorder({i0, i1, jpos0, k, jpos1});
+            // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+            // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+    IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+    refStmt = makeConcreteNotation(refStmt);
+    refStmt = refStmt
+      .split(i, i0, i1, 16)
+      .split(k, k0, k1, 32)
+      .split(l, l0, l1, 32)
+      .reorder({i0, i1, j, k0, l0, k1, l1});
+    refStmt = insertTemporaries(refStmt);
+    refStmt = parallelizeOuterLoop(refStmt);
+    ref.compile(refStmt);
+    ref.assemble();
+
+    IndexStmt refnStmt = makeReductionNotation(refn.getAssignment());
+    refnStmt = makeConcreteNotation(refnStmt);
+    refnStmt = refnStmt
+      .split(i, i0, i1, 16);
+    refnStmt = insertTemporaries(refnStmt);
+    refnStmt = parallelizeOuterLoop(refnStmt);
+    refn.compile(refnStmt);
+    refn.assemble();
+
+    // SpMM , GEMM
+
+    Tensor<double> ref1({B.getDimension(0), kdim}, rm);
+    Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+    Tensor<double> ref2_2({B.getDimension(0), ldim}, rm);
+    
+    ref1(i,k)=B(i,j)*C(j,k);
+    ref2(i,l)=ref1(i,k)*D(k,l);
+    ref2_2(i,l)=ref1(i,k)*D(k,l);
+
+    IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+    ref1Stmt = makeConcreteNotation(ref1Stmt);
+    ref1Stmt = insertTemporaries(ref1Stmt);
+    ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+    ref1.compile(ref1Stmt);
+    ref1.assemble();
+
+    IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+    ref2Stmt = makeConcreteNotation(ref2Stmt);
+    ref2Stmt = insertTemporaries(ref2Stmt);
+    ref2Stmt = ref2Stmt.split(i, i0, i1, 16);
+    ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+    ref2.compile(ref2Stmt);
+    ref2.assemble();
+
+    IndexStmt ref2Stmt2 = makeReductionNotation(ref2_2.getAssignment());
+    ref2Stmt2 = makeConcreteNotation(ref2Stmt2);
+    ref2Stmt2 = ref2Stmt2
+      .split(i, i0, i1, 32)
+      .split(k,k0,k1, 32)
+      .split(l, l0, l1, 32)
+      .reorder({i0, k0, l0, i1, k1, l1})
+      .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); // NOTE(review): j0 is not produced by any split of this statement (only i/k/l are split); i0 was probably intended - confirm
+    ref2Stmt2 = insertTemporaries(ref2Stmt2);
+    // ref2Stmt2 = parallelizeOuterLoop(ref2Stmt2);
+    ref2_2.compile(ref2Stmt2);
+    ref2_2.assemble();
+
+
+    // -------------- GeMM and SpMM 
+
+    Tensor<double> ref3({C.getDimension(0), ldim}, rm);
+    Tensor<double> ref4({C.getDimension(0), ldim}, rm);
+    ref3(j,l)=C(j,k)*D(k,l); // GEMM
+    ref4(i,l) = B(i,j)*ref3(j,l); // SpMM
+
+    IndexStmt ref3Stmt = ref3.getAssignment().concretize();
+    ref3Stmt = ref3Stmt
+      .split(j, j0, j1, 32) // changed to 32
+      .split(k, k0, k1, 32)
+      .split(l, l0, l1, 32)
+      .reorder({j0, k0, l0, j1, k1, l1})
+      .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+    ref2Stmt2 = insertTemporaries(ref2Stmt2); // NOTE(review): ref2Stmt2 is not used after this point; ref3Stmt was probably intended - confirm
+    ref3.compile(ref3Stmt);
+    ref3.assemble();
+    
+    IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); // SpMM operation
+    ref4Stmt = makeConcreteNotation(ref4Stmt);
+    ref4Stmt = ref4Stmt.split(i, i0, i1, 16);
+    ref4Stmt = insertTemporaries(ref4Stmt);
+    ref4Stmt = parallelizeOuterLoop(ref4Stmt);
+    ref4.compile(ref4Stmt);
+    ref4.assemble();
+
+    // Time each kernel; every measurement is appended to statfile when it is open.
+    std::cout << "compute start\n";
+    taco::util::TimeResults timevalue;
+    bool time                = true; // read by name inside TOOL_BENCHMARK_TIMER
+
+    statfile << "\n--------- 1st pattern computation TTM, GEMM\n";
+    
+    TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nSpMM Kernel: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "SpMM time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    std::string sofile_spmm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; 
+    TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "SpMM template time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+     
+    TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "GeMM time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    TOOL_BENCHMARK_TIMER(ref2_2.compute(statfile), "\n\nref GeMM template Kernel: ", timevalue);   
+    if (statfile.is_open()) {
+      statfile << "ref 2 GeMM template time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    // std::string sofile_gemm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/spmm_template.so";
+    statfile << "\n--------- 2nd pattern computation GEMM, SpMM\n";
+    TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM template ref3 Kernel: ", timevalue); 
+    if (statfile.is_open()) {
+      statfile << "ref3 GeMM template time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel ref4: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "SpMM template time ref4: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+
+    statfile << "\n-------- reference pattern computation\n";
+
+    TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue);     
+    if (statfile.is_open()) {
+      statfile << "taco reference time: ";
+      statfile << timevalue << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    TOOL_BENCHMARK_TIMER(refn.compute(statfile), "\n\nReference new Kernel: ", timevalue);   
+    if (statfile.is_open()) {
+      statfile << "taco reference new time: ";
+      statfile << timevalue << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+
+    TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue);
+    if (statfile.is_open()) {
+      statfile << "fused time: ";
+      statfile << timevalue.mean << std::endl;
+    } else { std::cout << " stat file is not open\n"; }
+
+    // Compare the raw value arrays of the fused result with each reference using a relative-error threshold.
+    double* A_vals = (double*) (A.getTacoTensorT()->vals);
+    double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+    double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+    double* ref4_vals = (double*) (ref4.getTacoTensorT()->vals); // was read from ref2, which left ref4 unchecked
+
+    // int* A2_pos = (double*) (ref.getTacoTensorT()->vals);
+
+    // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) {
+    //   if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+    //     std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+    //       << "refvals: " << ref_vals[q] << std::endl;
+    //     ASSERT_TRUE(false);
+    //   }
+    // }
+
+    for (size_t q=0; q < static_cast<size_t>(A.getDimension(0)) * A.getDimension(1); q++) { // size_t product: the int product can overflow for large inputs
+      if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+        std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+          << "refvals: " << ref_vals[q] << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+    for (size_t q=0; q < static_cast<size_t>(A.getDimension(0)) * A.getDimension(1); q++) {
+      if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) {
+        std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+          << "refvals: " << ref2_vals[q] << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+    for (size_t q=0; q < static_cast<size_t>(A.getDimension(0)) * A.getDimension(1); q++) {
+      if ( abs(A_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) {
+        std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+          << "refvals: " << ref4_vals[q] << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+
+  } // end of file num for loop
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+}
+
+
+
+
+
+
+TEST(scheduling_eval, sddmmspmmFused) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  // Benchmarks A(i,m) = B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m) (B read as CSR, others dense): fused schedule vs. an unfused reference and a three-kernel pipeline.
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm-gemm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nsddmm-spmm-gemm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format rm({dense, dense});
+
+  int kdim = 64;
+  int ldim = 64;
+  int mdim = 64;
+
+  // vector<int> filenums{2, 3,4,5,6,7,8,9,10,12,15};
+  vector<int> filenums{0}; // indices into matfiles below
+
+  for (auto filenum : filenums) {
+
+
+  std::vector<std::string> matfiles = {
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15
+    "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx"
+  };
+  std::vector<std::string> matfilesrw = {
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx",
+    "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx"
+  };
+
+  std::string matfile = matfiles[filenum];
+  std::cout << "reading B mat mtx\n";
+  Tensor<double> B = read(matfile, csr, true);
+  B.setName("B");
+  B.pack();
+  // write(matfilesrw[filenum], B);
+
+  if (statfile.is_open()) {
+    statfile << matfile << std::endl;
+  }
+
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  Tensor<double> C({B.getDimension(0), kdim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  C.pack();
+
+  Tensor<double> D({B.getDimension(1), kdim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing D mat\n";
+  D.pack();
+
+  Tensor<double> F({B.getDimension(1), ldim}, rm);
+  for (int i = 0; i < F.getDimension(0); ++i) {
+    for (int j = 0; j < F.getDimension(1); ++j) {
+      F.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing F mat\n";
+  F.pack();
+
+  Tensor<double> G({ldim, mdim}, rm);
+  for (int i = 0; i < G.getDimension(0); ++i) {
+    for (int j = 0; j < G.getDimension(1); ++j) {
+      G.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing G mat\n";
+  G.pack();
+
+  Tensor<double> A({B.getDimension(0), mdim}, rm);
+  Tensor<double> ref({B.getDimension(0), mdim}, rm);
+  IndexVar i, j, k, l, m;
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1");
+  IndexVar l0("l0"), l1("l1"), m0("m0"), m1("m1");
+  
+  A(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+  
+  if (statfile.is_open()) {
+    statfile 
+      << "ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);" << std::endl
+      << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+      << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+      << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+      << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl
+      << "G1_dimension: " << G.getDimension(0) << ", G2_dimension: " << G.getDimension(1) << ", vals: " << G.getStorage().getValues().getSize() << std::endl
+      << std::endl;
+  }
+
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 2);
+  stmt = stmt.split(i, i0, i1, 16);
+
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt); 
+  printToFile("sddmmSpMMGeMM", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+
+  // Reference: the same expression scheduled without reorderLoopsTopologically/loopFusionOverFission.
+  ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = refStmt.split(i, i0, i1, 16);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+  // Staged pipeline computing the same result in three kernels: ref1, then ref2 = ref1*F, then ref3 = ref2*G.
+  Tensor<double> ref1({B.getDimension(0), B.getDimension(1)}, csr);
+  Tensor<double> ref2({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref3({B.getDimension(0), mdim}, rm);
+  ref1(i,j)=B(i,j)*C(i,k)*D(j,k);
+  ref2(i,l)=ref1(i,j)*F(j,l);
+  ref3(i,m)=ref2(i,l)*G(l,m);
+
+  IndexStmt ref1Stmt = ref1.getAssignment().concretize();
+  
+  ref1Stmt = ref1Stmt.split(i, i0, i1, 16);
+  //         // .pos(j, jpos, B(i,j));
+  //         // .split(k, k0, k1, 8);
+  //         // .reorder({i0, i1, jpos0, k, jpos1});
+  //         // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+  //         // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+  // // ref1Stmt.split(i, );
+  // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B);
+  // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  // ref1Stmt = makeConcreteNotation(ref1Stmt);
+  ref1Stmt = insertTemporaries(ref1Stmt);
+  ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  ref1.compile(ref1Stmt);
+  ref1.assemble();
+
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble();
+
+  // ref3(i,m)=ref2(i,l)*G(l,m);
+  IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+  ref3Stmt = makeConcreteNotation(ref3Stmt);
+  ref3Stmt = ref3Stmt
+    .split(i, i0, i1, 32)
+    .split(l, l0, l1, 32)
+    .split(m, m0, m1, 32)
+    .reorder({i0, l0, m0, i1, l1, m1});
+  ref3Stmt = insertTemporaries(ref3Stmt);
+  ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+  ref3.compile(ref3Stmt);
+  ref3.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  bool time                = true; // read by name inside TOOL_BENCHMARK_TIMER
+  
+  // std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so";
+  TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "fused time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+  
+  // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so";
+  TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "sddmm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so";
+  TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM ryan Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "sddmm ryan time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+  
+  std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM ryan Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "spmm ryan time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so";
+  TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "gemm time: ";
+    statfile << timevalue.mean << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+
+  // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so";
+  TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue);
+  if (statfile.is_open()) {
+    statfile << "taco reference time: ";
+    statfile << timevalue << std::endl;
+  } else { std::cout << " stat file is not open\n"; }
+  // Validate the fused result and the staged pipeline against ref using a relative-error threshold per value.
+  double* A_vals = (double*) (A.getTacoTensorT()->vals);
+  double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+  double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals);
+
+  // int* A2_pos = (double*) (ref.getTacoTensorT()->vals);
+
+  for (size_t q=0; q < static_cast<size_t>(A.getDimension(0)) * A.getDimension(1); q++) { // size_t product: the int product can overflow for large inputs
+    if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+  for (size_t q=0; q < static_cast<size_t>(A.getDimension(0)) * A.getDimension(1); q++) {
+    if ( abs(ref3_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) {
+      std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " "
+        << "refvals: " << ref_vals[q] << std::endl;
+      ASSERT_TRUE(false);
+    }
+  }
+
+
+
+  }
+
+  // int filenum = 3;
+
+  
+  // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) {
+  //   if ( abs(A_vals[q] - ref3_vals[q])/abs(ref3_vals[q]) > ERROR_MARGIN) {
+  //     std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " "
+  //       << "refvals: " << ref3_vals[q] << std::endl;
+  //     ASSERT_TRUE(false);
+  //   }
+  // }
+  // for (int q= 0; q< A_vals
+  // for (int q = 0; q < A.getDimension(0); ++q) {
+  //   for (int w = 0; w < A.getDimension(1); ++w) {
+  //     if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+  //       std::cout << "error: results don't match A("<< q << "," << w << "): " 
+  //         << A(q,w) << ", ref: " << ref(q,w) << std::endl;
+  //       ASSERT_TRUE(false);
+  //     }
+  //   }
+  // }
+  // ASSERT_TENSOR_EQ(A, ref);
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+}
\ No newline at end of file
diff --git a/test/tests-transformation.cpp b/test/tests-transformation.cpp
index abfec3d45..9a472906f 100644
--- a/test/tests-transformation.cpp
+++ b/test/tests-transformation.cpp
@@ -255,6 +255,8 @@ INSTANTIATE_TEST_CASE_P(parallelize, apply,
 
 struct reorderLoopsTopologically : public TestWithParam<NotationTest> {};
 
+
+//
 TEST_P(reorderLoopsTopologically, test) {
   IndexStmt actual = taco::reorderLoopsTopologically(GetParam().actual);
   ASSERT_NOTATION_EQ(GetParam().expected, actual);
diff --git a/test/util.h b/test/util.h
new file mode 100644
index 000000000..0f8b633e6
--- /dev/null
+++ b/test/util.h
@@ -0,0 +1,86 @@
+#ifndef __SCHEDULE_UTIL_HH__
+#define __SCHEDULE_UTIL_HH__
+
+#include <iostream>
+#include <taco/index_notation/transformations.h>
+#include <codegen/codegen_c.h>
+#include <codegen/codegen_cuda.h>
+#include <fstream>
+#include <memory>
+#include <random>
+#include "taco/cuda.h"
+#include "test.h"
+#include "test_tensors.h"
+#include "taco/tensor.h"
+#include "taco/index_notation/index_notation.h"
+#include "taco/index_notation/transformations.h"
+#include "codegen/codegen.h"
+#include "taco/lower/lower.h"
+#include "taco/util/timers.h"
+
+using namespace taco;
+
+#define ERROR_MARGIN (1.0e-2)
+// Runs CODE; when `time` (looked up at the expansion site) is true, times it with taco::util::Timer, prints "NAME <result> ms" to cout and assigns the result to TIMER.
+#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) {                  \
+    if (time) {                                                  \
+      taco::util::Timer timer;                                   \
+      timer.start();                                             \
+      CODE;                                                      \
+      timer.stop();                                              \
+      taco::util::TimeResults result = timer.getResult();        \
+      cout << NAME << " " << result << " ms" << endl;            \
+      TIMER=result;                                              \
+    }                                                            \
+    else {                                                       \
+      CODE;                                                      \
+    }                                                            \
+}
+// Same as TOOL_BENCHMARK_TIMER, but the timing line goes to `statfile` (looked up at the expansion site) when that stream is open, and to cout otherwise.
+#define TOOL_BENCHMARK_TIMER2(CODE,NAME,TIMER) {                  \
+    if (time) {                                                  \
+      taco::util::Timer timer;                                   \
+      timer.start();                                             \
+      CODE;                                                      \
+      timer.stop();                                              \
+      taco::util::TimeResults result = timer.getResult();        \
+      if (statfile.is_open()) {                                  \
+        statfile << NAME << " " << result << " ms" << endl;      \
+      } else {                                                   \
+        cout << NAME << " " << result << " ms" << endl;          \
+      }                                                          \
+      TIMER=result;                                              \
+    }                                                            \
+    else {                                                       \
+      CODE;                                                      \
+    }                                                            \
+}
+
+static void printToCout(IndexStmt stmt);
+static void printToFile(string filename, IndexStmt stmt);
+
+// Lowers stmt into a statement named "compute" and compiles it through a CodeGen (ImplementationGen mode) bound to cout.
+static void printToCout(IndexStmt stmt) {
+  std::shared_ptr<ir::CodeGen> generator = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen);
+  ir::Stmt kernel = lower(stmt, "compute", false, true);
+  generator->compile(kernel, true);
+}
+// Writes the code generated for stmt's "compute" statement to eval_generated/<filename>.c (.cu when CUDA codegen is on); reports to cerr if the file cannot be opened.
+void printToFile(string filename, IndexStmt stmt) {
+  stringstream source;
+  string file_path = "eval_generated/";
+  mkdir(file_path.c_str(), 0777);  // result ignored on purpose: the directory may already exist
+  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen);
+  ir::Stmt compute = lower(stmt, "compute",  false, true);
+  codegen->compile(compute, true);
+
+  string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c";
+  ofstream source_file(file_path + filename + file_ending);
+  if (!source_file.is_open()) {  // a failed open used to drop the generated code silently
+    std::cerr << "error: could not open " << file_path + filename + file_ending << " for writing" << std::endl;
+    return;
+  }
+  source_file << source.str();  // the stream is flushed and closed by its destructor
+}
+
+#endif // __SCHEDULE_UTIL_HH__
\ No newline at end of file
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 922f7e52e..41699d3fd 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -4,6 +4,7 @@ foreach(TOOL_SOURCE ${TOOL_SOURCES})
   get_filename_component(TOOL ${TOOL_SOURCE} NAME_WE)
   add_executable("${TOOL}-tool" ${TOOL_SOURCE})
   target_link_libraries("${TOOL}-tool" taco)
+  target_link_libraries("${TOOL}-tool" papi)
   target_include_directories("${TOOL}-tool" PRIVATE "${CMAKE_BINARY_DIR}/include")
   SET_TARGET_PROPERTIES("${TOOL}-tool" PROPERTIES OUTPUT_NAME ${TOOL})
   install(TARGETS "${TOOL}-tool" DESTINATION bin)
diff --git a/tools/taco.cpp b/tools/taco.cpp
index cd351a203..1c22fc368 100644
--- a/tools/taco.cpp
+++ b/tools/taco.cpp
@@ -9,6 +9,7 @@
 #include "taco.h"
 
 #include "taco/error.h"
+#include "taco/index_notation/index_notation.h"
 #include "taco/parser/lexer.h"
 #include "taco/parser/parser.h"
 #include "taco/parser/schedule_parser.h"
@@ -308,7 +309,9 @@ static void printCommandLine(ostream& os, int argc, char* argv[]) {
   }
 }
 
-static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) {
+static int setSchedulingCommands(vector<vector<string>> scheduleCommands, 
+  parser::Parser& parser, IndexStmt& stmt, Assignment assignment) {
+
   auto findVar = [&stmt](string name) {
     ProvenanceGraph graph(stmt);
     for (auto v : graph.getAllIndexVars()) {
@@ -352,6 +355,16 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
       IndexVar fused(f);
       stmt = stmt.fuse(findVar(i), findVar(j), fused);
 
+    } else if (command == "loopfuse") {
+      taco_uassert(scheduleCommand.size() == 2) 
+        << "'loopfuse' scheduling directive takes 2 parameters: loopfuse(b, 2)";
+      std::string side = scheduleCommand[0];
+      taco_uassert(side == "b" || side == "f") 
+        << "first parameter must be either 'f' or 'b'";
+
+      int iters = std::stoi(scheduleCommand[1]);
+
+      stmt = loopFusionOverFission(stmt, assignment, side, iters);
     } else if (command == "split") {
       taco_uassert(scheduleCommand.size() == 4)
           << "'split' scheduling directive takes 4 parameters: split(i, i1, i2, splitFactor)";
@@ -536,7 +549,8 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
         parallel_unit = ParallelUnit::CPUThread;
       } else if (unit == "CPUVector") {
         parallel_unit = ParallelUnit::CPUVector;
-      } else {
+      }
+      else {
         taco_uerror << "Parallel hardware not defined.";
         goto end;
       }
@@ -1009,9 +1023,11 @@ int main(int argc, char* argv[]) {
   }
 
   // pre-parse expression, to determine existence and order of loaded tensors
+  std::cout << "pre-parse expression, to determine existence and order of loaded tensors\n";
   map<string,TensorBase> loadedTensors;
   TensorBase temp_tensor;
   parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42);
+  std::cout << exprStr << std::endl;
   try {
     temp_parser.parse();
     temp_tensor = temp_parser.getResultTensor();
@@ -1112,17 +1128,29 @@ int main(int argc, char* argv[]) {
   taco_set_parallel_schedule(sched, chunkSize);
   taco_set_num_threads(nthreads);
 
-  IndexStmt stmt =
-      makeConcreteNotation(makeReductionNotation(tensor.getAssignment()));
+  Assignment assignment = tensor.getAssignment();
+  std::cout << "tensor.getAssignment(): " << assignment << std::endl;
+
+  IndexStmt stmt2 = makeReductionNotation(tensor.getAssignment());
+  std::cout << "reducedNotation: " << stmt2 << std::endl;
+  // IndexStmt stmt = 
+  //     makeConcreteNotation(makeReductionNotation(tensor.getAssignment()));
+  IndexStmt stmt = makeConcreteNotation(stmt2);
+  std::cout << "concrete index statement: " << stmt << std::endl;
   stmt = reorderLoopsTopologically(stmt);
 
+  std::cout << "topologically reordered loops statement: " << stmt << std::endl;
+
   if (setSchedule) {
-    cuda |= setSchedulingCommands(scheduleCommands, parser, stmt);
+    cuda |= setSchedulingCommands(scheduleCommands, parser, stmt, tensor.getAssignment());
   }
   else {
+    // stmt = loopFusionOverFission(stmt, tensor.getAssignment());
     stmt = insertTemporaries(stmt);
     stmt = parallelizeOuterLoop(stmt);
   }
+  std::cout << "after setting the scheduling commands\n";
+  std::cout << stmt << std::endl;
 
   if (cuda) {
     if (!CUDA_BUILT && benchmark) {
@@ -1134,7 +1162,10 @@ int main(int argc, char* argv[]) {
     set_CUDA_codegen_enabled(false);
   }
 
+  std::cout << "running scalar promote\n" << std::endl; //
   stmt = scalarPromote(stmt);
+  std::cout << "\nafter scalar promote: \n" << stmt << std::endl << std::endl;
+
   if (printConcrete) {
     cout << stmt << endl;
   }